835 files changed, 54487 insertions, 34229 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 0a93dc1cb4a..55abfd62654 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -11,8 +11,7 @@ config 9P_FS
 
 if 9P_FS
 config 9P_FSCACHE
-	bool "Enable 9P client caching support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	bool "Enable 9P client caching support"
 	depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y
 	help
 	  Choose Y here to enable persistent, read-only local
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 15b67916620..7af425f53be 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -23,6 +23,7 @@
 #include "acl.h"
 #include "v9fs.h"
 #include "v9fs_vfs.h"
+#include "fid.h"
 
 static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
 {
@@ -113,16 +114,12 @@ struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type)
 
 }
 
-static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
+static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
 {
 	int retval;
 	char *name;
 	size_t size;
 	void *buffer;
-	struct inode *inode = dentry->d_inode;
-
-	set_cached_acl(inode, type, acl);
-
 	if (!acl)
 		return 0;
 
@@ -144,17 +141,16 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
 	default:
 		BUG();
 	}
-	retval = v9fs_xattr_set(dentry, name, buffer, size, 0);
+	retval = v9fs_fid_xattr_set(fid, name, buffer, size, 0);
 err_free_out:
 	kfree(buffer);
 	return retval;
 }
 
-int v9fs_acl_chmod(struct dentry *dentry)
+int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid)
 {
 	int retval = 0;
 	struct posix_acl *acl;
-	struct inode *inode = dentry->d_inode;
 
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
@@ -163,25 +159,30 @@ int v9fs_acl_chmod(struct dentry *dentry)
 		retval = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
 		if (retval)
 			return retval;
-		retval = v9fs_set_acl(dentry, ACL_TYPE_ACCESS, acl);
+		set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
+		retval = v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl);
 		posix_acl_release(acl);
 	}
 	return retval;
 }
 
-int v9fs_set_create_acl(struct dentry *dentry,
-			struct posix_acl **dpacl, struct posix_acl **pacl)
+int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid,
+			struct posix_acl *dacl, struct posix_acl *acl)
 {
-	if (dentry) {
-		v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, *dpacl);
-		v9fs_set_acl(dentry, ACL_TYPE_ACCESS, *pacl);
-	}
-	posix_acl_release(*dpacl);
-	posix_acl_release(*pacl);
-	*dpacl = *pacl = NULL;
+	set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
+	set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
+	v9fs_set_acl(fid, ACL_TYPE_DEFAULT, dacl);
+	v9fs_set_acl(fid, ACL_TYPE_ACCESS, acl);
 	return 0;
 }
 
+void v9fs_put_acl(struct posix_acl *dacl,
+		  struct posix_acl *acl)
+{
+	posix_acl_release(dacl);
+	posix_acl_release(acl);
+}
+
 int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 		  struct posix_acl **dpacl, struct posix_acl **pacl)
 {
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index 55955641196..e4f7e882272 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -17,27 +17,33 @@
 #ifdef CONFIG_9P_FS_POSIX_ACL
 extern int v9fs_get_acl(struct inode *, struct p9_fid *);
 extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type);
-extern int v9fs_acl_chmod(struct dentry *);
-extern int v9fs_set_create_acl(struct dentry *,
-			       struct posix_acl **, struct posix_acl **);
+extern int v9fs_acl_chmod(struct inode *, struct p9_fid *);
+extern int v9fs_set_create_acl(struct inode *, struct p9_fid *,
+			       struct posix_acl *, struct posix_acl *);
 extern int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 			 struct posix_acl **dpacl, struct posix_acl **pacl);
+extern void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl);
 #else
 #define v9fs_iop_get_acl NULL
 static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
 {
 	return 0;
 }
-static inline int v9fs_acl_chmod(struct dentry *dentry)
+static inline int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid)
 {
 	return 0;
 }
-static inline int v9fs_set_create_acl(struct dentry *dentry,
-				      struct posix_acl **dpacl,
-				      struct posix_acl **pacl)
+static inline int v9fs_set_create_acl(struct inode *inode,
+				      struct p9_fid *fid,
+				      struct posix_acl *dacl,
+				      struct posix_acl *acl)
 {
 	return 0;
 }
+static inline void v9fs_put_acl(struct posix_acl *dacl,
+				struct posix_acl *acl)
+{
+}
 static inline int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 				struct posix_acl **dpacl,
 				struct posix_acl **pacl)
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index da8eefbe830..d51ec9fafcc 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -41,29 +41,16 @@
  *
  */
 
-int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
+static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid)
 {
-	struct v9fs_dentry *dent;
-
-	p9_debug(P9_DEBUG_VFS, "fid %d dentry %s\n",
-		 fid->fid, dentry->d_name.name);
-
-	dent = dentry->d_fsdata;
-	if (!dent) {
-		dent = kmalloc(sizeof(struct v9fs_dentry), GFP_KERNEL);
-		if (!dent)
-			return -ENOMEM;
-
-		spin_lock_init(&dent->lock);
-		INIT_LIST_HEAD(&dent->fidlist);
-		dentry->d_fsdata = dent;
-	}
-
-	spin_lock(&dent->lock);
-	list_add(&fid->dlist, &dent->fidlist);
-	spin_unlock(&dent->lock);
+	hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata);
+}
 
-	return 0;
+void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
+{
+	spin_lock(&dentry->d_lock);
+	__add_fid(dentry, fid);
+	spin_unlock(&dentry->d_lock);
 }
 
 /**
@@ -74,24 +61,25 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
  *
  */
 
-static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
+static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
 {
-	struct v9fs_dentry *dent;
 	struct p9_fid *fid, *ret;
 
 	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
-		 dentry->d_name.name, dentry, uid, any);
-	dent = (struct v9fs_dentry *) dentry->d_fsdata;
+		 dentry->d_name.name, dentry, from_kuid(&init_user_ns, uid),
+		 any);
 	ret = NULL;
-	if (dent) {
-		spin_lock(&dent->lock);
-		list_for_each_entry(fid, &dent->fidlist, dlist) {
-			if (any || fid->uid == uid) {
+	/* we'll recheck under lock if there's anything to look in */
+	if (dentry->d_fsdata) {
+		struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata;
+		spin_lock(&dentry->d_lock);
+		hlist_for_each_entry(fid, h, dlist) {
+			if (any || uid_eq(fid->uid, uid)) {
 				ret = fid;
 				break;
 			}
 		}
-		spin_unlock(&dent->lock);
+		spin_unlock(&dentry->d_lock);
 	}
 
 	return ret;
@@ -126,7 +114,7 @@ err_out:
 }
 
 static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
-					       uid_t uid, int any)
+					       kuid_t uid, int any)
 {
 	struct dentry *ds;
 	char **wnames, *uname;
@@ -214,8 +202,17 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
 	}
 	kfree(wnames);
 fid_out:
-	if (!IS_ERR(fid))
-		v9fs_fid_add(dentry, fid);
+	if (!IS_ERR(fid)) {
+		spin_lock(&dentry->d_lock);
+		if (d_unhashed(dentry)) {
+			spin_unlock(&dentry->d_lock);
+			p9_client_clunk(fid);
+			fid = ERR_PTR(-ENOENT);
+		} else {
+			__add_fid(dentry, fid);
+			spin_unlock(&dentry->d_lock);
+		}
+	}
 err_out:
 	up_read(&v9ses->rename_sem);
 	return fid;
@@ -233,7 +230,7 @@ err_out:
 
 struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 {
-	uid_t uid;
+	kuid_t uid;
 	int  any, access;
 	struct v9fs_session_info *v9ses;
 
@@ -253,7 +250,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 		break;
 
 	default:
-		uid = ~0;
+		uid = INVALID_UID;
 		any = 0;
 		break;
 	}
@@ -272,7 +269,7 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
 	return ret;
 }
 
-static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid)
+static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, kuid_t uid)
 {
 	struct p9_fid *fid, *ret;
 
@@ -289,7 +286,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
 	int err;
 	struct p9_fid *fid;
 
-	fid = v9fs_fid_clone_with_uid(dentry, 0);
+	fid = v9fs_fid_clone_with_uid(dentry, GLOBAL_ROOT_UID);
 	if (IS_ERR(fid))
 		goto error_out;
 	/*
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index bb0b6e7f58f..2b6787fcb62 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -23,28 +23,8 @@
 #define FS_9P_FID_H
 #include <linux/list.h>
 
-/**
- * struct v9fs_dentry - 9p private data stored in dentry d_fsdata
- * @lock: protects the fidlist
- * @fidlist: list of FIDs currently associated with this dentry
- *
- * This structure defines the 9p private data associated with
- * a particular dentry.  In particular, this private data is used
- * to lookup which 9P FID handle should be used for a particular VFS
- * operation.  FID handles are associated with dentries instead of
- * inodes in order to more closely map functionality to the Plan 9
- * expected behavior for FID reclaimation and tracking.
- *
- * See Also: Mapping FIDs to Linux VFS model in
- * Design and Implementation of the Linux 9P File System documentation
- */
-struct v9fs_dentry {
-	spinlock_t lock; /* protect fidlist */
-	struct list_head fidlist;
-};
-
 struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
 struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
-int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
+void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
 struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
 #endif
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index d934f04e773..58e6cbce415 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -161,7 +161,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 				ret = r;
 				continue;
 			}
-			v9ses->dfltuid = option;
+			v9ses->dfltuid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(v9ses->dfltuid)) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "uid field, but not a uid?\n");
+				ret = -EINVAL;
+				continue;
+			}
 			break;
 		case Opt_dfltgid:
 			r = match_int(&args[0], &option);
@@ -171,7 +177,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 				ret = r;
 				continue;
 			}
-			v9ses->dfltgid = option;
+			v9ses->dfltgid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(v9ses->dfltgid)) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "gid field, but not a gid?\n");
+				ret = -EINVAL;
+				continue;
+			}
 			break;
 		case Opt_afid:
 			r = match_int(&args[0], &option);
@@ -248,8 +260,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			else if (strcmp(s, "client") == 0) {
 				v9ses->flags |= V9FS_ACCESS_CLIENT;
 			} else {
+				uid_t uid;
 				v9ses->flags |= V9FS_ACCESS_SINGLE;
-				v9ses->uid = simple_strtoul(s, &e, 10);
+				uid = simple_strtoul(s, &e, 10);
 				if (*e != '\0') {
 					ret = -EINVAL;
 					pr_info("Unknown access argument %s\n",
@@ -257,6 +270,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 					kfree(s);
 					goto free_and_return;
 				}
+				v9ses->uid = make_kuid(current_user_ns(), uid);
+				if (!uid_valid(v9ses->uid)) {
+					ret = -EINVAL;
+					pr_info("Uknown uid %s\n", s);
+					kfree(s);
+					goto free_and_return;
+				}
 			}
 
 			kfree(s);
@@ -319,7 +339,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	list_add(&v9ses->slist, &v9fs_sessionlist);
 	spin_unlock(&v9fs_sessionlist_lock);
 
-	v9ses->uid = ~0;
+	v9ses->uid = INVALID_UID;
 	v9ses->dfltuid = V9FS_DEFUID;
 	v9ses->dfltgid = V9FS_DEFGID;
 
@@ -364,7 +384,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 
 		v9ses->flags &= ~V9FS_ACCESS_MASK;
 		v9ses->flags |= V9FS_ACCESS_ANY;
-		v9ses->uid = ~0;
+		v9ses->uid = INVALID_UID;
 	}
 	if (!v9fs_proto_dotl(v9ses) ||
 		!((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
@@ -375,7 +395,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		v9ses->flags &= ~V9FS_ACL_MASK;
 	}
 
-	fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0,
+	fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID,
 							v9ses->aname);
 	if (IS_ERR(fid)) {
 		retval = PTR_ERR(fid);
@@ -387,7 +407,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE)
 		fid->uid = v9ses->uid;
 	else
-		fid->uid = ~0;
+		fid->uid = INVALID_UID;
 
 #ifdef CONFIG_9P_FSCACHE
 	/* register the session for caching */
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 34c59f14a1c..a8e127c8962 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -109,9 +109,9 @@ struct v9fs_session_info {
 	char *uname;		/* user name to mount as */
 	char *aname;		/* name of remote hierarchy being mounted */
 	unsigned int maxdata;	/* max data for client interface */
-	unsigned int dfltuid;	/* default uid/muid for legacy support */
-	unsigned int dfltgid;	/* default gid for legacy support */
-	u32 uid;		/* if ACCESS_SINGLE, the uid that has access */
+	kuid_t dfltuid;		/* default uid/muid for legacy support */
+	kgid_t dfltgid;		/* default gid for legacy support */
+	kuid_t uid;		/* if ACCESS_SINGLE, the uid that has access */
 	struct p9_client *clnt;	/* 9p client */
 	struct list_head slist; /* list of sessions registered with v9fs */
 	struct backing_dev_info bdi;
@@ -165,8 +165,8 @@ extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
 #define V9FS_PORT	564
 #define V9FS_DEFUSER	"nobody"
 #define V9FS_DEFANAME	""
-#define V9FS_DEFUID	(-2)
-#define V9FS_DEFGID	(-2)
+#define V9FS_DEFUID	KUIDT_INIT(-2)
+#define V9FS_DEFGID	KGIDT_INIT(-2)
 
 static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
 {
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 0ad61c6a65a..9ff073f4090 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -33,6 +33,7 @@
 #include <linux/pagemap.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/aio.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -147,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
  * @offset: offset in the page
  */
 
-static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+static void v9fs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	/*
 	 * If called with zero offset, we should release
 	 * the private state assocated with the page
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		v9fs_fscache_invalidate_page(page);
 }
 
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 64600b5d052..f039b104a98 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -83,21 +83,12 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
 
 static void v9fs_dentry_release(struct dentry *dentry)
 {
-	struct v9fs_dentry *dent;
-	struct p9_fid *temp, *current_fid;
-
+	struct hlist_node *p, *n;
 	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
 		 dentry->d_name.name, dentry);
-	dent = dentry->d_fsdata;
-	if (dent) {
-		list_for_each_entry_safe(current_fid, temp, &dent->fidlist,
-									dlist) {
-			p9_client_clunk(current_fid);
-		}
-
-		kfree(dent);
-		dentry->d_fsdata = NULL;
-	}
+	hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
+		p9_client_clunk(hlist_entry(p, struct p9_fid, dlist));
+	dentry->d_fsdata = NULL;
 }
 
 static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
@@ -137,6 +128,7 @@ out_valid:
 
 const struct dentry_operations v9fs_cached_dentry_operations = {
 	.d_revalidate = v9fs_lookup_revalidate,
+	.d_weak_revalidate = v9fs_lookup_revalidate,
 	.d_delete = v9fs_cached_dentry_delete,
 	.d_release = v9fs_dentry_release,
 };
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index ff911e77965..4d0c2e0be7e 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -52,10 +52,9 @@
  */
 
 struct p9_rdir {
-	struct mutex mutex;
 	int head;
 	int tail;
-	uint8_t *buf;
+	uint8_t buf[];
 };
 
 /**
@@ -93,46 +92,24 @@ static void p9stat_init(struct p9_wstat *stbuf)
  *
  */
 
-static int v9fs_alloc_rdir_buf(struct file *filp, int buflen)
+static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)
 {
-	struct p9_rdir *rdir;
-	struct p9_fid *fid;
-	int err = 0;
-
-	fid = filp->private_data;
-	if (!fid->rdir) {
-		rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
-
-		if (rdir == NULL) {
-			err = -ENOMEM;
-			goto exit;
-		}
-		spin_lock(&filp->f_dentry->d_lock);
-		if (!fid->rdir) {
-			rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir);
-			mutex_init(&rdir->mutex);
-			rdir->head = rdir->tail = 0;
-			fid->rdir = (void *) rdir;
-			rdir = NULL;
-		}
-		spin_unlock(&filp->f_dentry->d_lock);
-		kfree(rdir);
-	}
-exit:
-	return err;
+	struct p9_fid *fid = filp->private_data;
+	if (!fid->rdir)
+		fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
+	return fid->rdir;
 }
 
 /**
- * v9fs_dir_readdir - read a directory
- * @filp: opened file structure
- * @dirent: directory structure ???
- * @filldir: function to populate directory structure ???
+ * v9fs_dir_readdir - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
  *
  */
 
-static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
 {
-	int over;
+	bool over;
 	struct p9_wstat st;
 	int err = 0;
 	struct p9_fid *fid;
@@ -140,25 +117,21 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int reclen = 0;
 	struct p9_rdir *rdir;
 
-	p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
-	fid = filp->private_data;
+	p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+	fid = file->private_data;
 
 	buflen = fid->clnt->msize - P9_IOHDRSZ;
 
-	err = v9fs_alloc_rdir_buf(filp, buflen);
-	if (err)
-		goto exit;
-	rdir = (struct p9_rdir *) fid->rdir;
+	rdir = v9fs_alloc_rdir_buf(file, buflen);
+	if (!rdir)
+		return -ENOMEM;
 
-	err = mutex_lock_interruptible(&rdir->mutex);
-	if (err)
-		return err;
-	while (err == 0) {
+	while (1) {
 		if (rdir->tail == rdir->head) {
-			err = v9fs_file_readn(filp, rdir->buf, NULL,
-							buflen, filp->f_pos);
+			err = v9fs_file_readn(file, rdir->buf, NULL,
+							buflen, ctx->pos);
 			if (err <= 0)
-				goto unlock_and_exit;
+				return err;
 
 			rdir->head = 0;
 			rdir->tail = err;
@@ -169,70 +142,52 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 					  rdir->tail - rdir->head, &st);
 			if (err) {
 				p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
-				err = -EIO;
 				p9stat_free(&st);
-				goto unlock_and_exit;
+				return -EIO;
 			}
 			reclen = st.size+2;
 
-			over = filldir(dirent, st.name, strlen(st.name),
-			    filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
-
+			over = !dir_emit(ctx, st.name, strlen(st.name),
+					 v9fs_qid2ino(&st.qid), dt_type(&st));
 			p9stat_free(&st);
+			if (over)
+				return 0;
 
-			if (over) {
-				err = 0;
-				goto unlock_and_exit;
-			}
 			rdir->head += reclen;
-			filp->f_pos += reclen;
+			ctx->pos += reclen;
 		}
 	}
-
-unlock_and_exit:
-	mutex_unlock(&rdir->mutex);
-exit:
-	return err;
 }
 
 /**
- * v9fs_dir_readdir_dotl - read a directory
- * @filp: opened file structure
- * @dirent: buffer to fill dirent structures
- * @filldir: function to populate dirent structures
+ * v9fs_dir_readdir_dotl - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
  *
  */
-static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
-						filldir_t filldir)
+static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
 {
-	int over;
 	int err = 0;
 	struct p9_fid *fid;
 	int buflen;
 	struct p9_rdir *rdir;
 	struct p9_dirent curdirent;
-	u64 oldoffset = 0;
 
-	p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
-	fid = filp->private_data;
+	p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+	fid = file->private_data;
 
 	buflen = fid->clnt->msize - P9_READDIRHDRSZ;
 
-	err = v9fs_alloc_rdir_buf(filp, buflen);
-	if (err)
-		goto exit;
-	rdir = (struct p9_rdir *) fid->rdir;
-
-	err = mutex_lock_interruptible(&rdir->mutex);
-	if (err)
-		return err;
+	rdir = v9fs_alloc_rdir_buf(file, buflen);
+	if (!rdir)
+		return -ENOMEM;
 
-	while (err == 0) {
+	while (1) {
 		if (rdir->tail == rdir->head) {
 			err = p9_client_readdir(fid, rdir->buf, buflen,
-						filp->f_pos);
+						ctx->pos);
 			if (err <= 0)
-				goto unlock_and_exit;
+				return err;
 
 			rdir->head = 0;
 			rdir->tail = err;
@@ -245,36 +200,19 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 					    &curdirent);
 			if (err < 0) {
 				p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
-				err = -EIO;
-				goto unlock_and_exit;
+				return -EIO;
 			}
 
-			/* d_off in dirent structure tracks the offset into
-			 * the next dirent in the dir. However, filldir()
-			 * expects offset into the current dirent. Hence
-			 * while calling filldir send the offset from the
-			 * previous dirent structure.
-			 */
-			over = filldir(dirent, curdirent.d_name,
-					strlen(curdirent.d_name),
-					oldoffset, v9fs_qid2ino(&curdirent.qid),
-					curdirent.d_type);
-			oldoffset = curdirent.d_off;
-
-			if (over) {
-				err = 0;
-				goto unlock_and_exit;
-			}
+			if (!dir_emit(ctx, curdirent.d_name,
+				      strlen(curdirent.d_name),
+				      v9fs_qid2ino(&curdirent.qid),
+				      curdirent.d_type))
+				return 0;
 
-			filp->f_pos = curdirent.d_off;
+			ctx->pos = curdirent.d_off;
 			rdir->head += err;
 		}
 	}
-
-unlock_and_exit:
-	mutex_unlock(&rdir->mutex);
-exit:
-	return err;
 }
 
 
@@ -300,7 +238,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 const struct file_operations v9fs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = v9fs_dir_readdir,
+	.iterate = v9fs_dir_readdir,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 };
@@ -308,7 +246,7 @@ const struct file_operations v9fs_dir_operations = {
 const struct file_operations v9fs_dir_operations_dotl = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = v9fs_dir_readdir_dotl,
+	.iterate = v9fs_dir_readdir_dotl,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
         .fsync = v9fs_file_fsync_dotl,
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index c2483e97bee..d384a8b77ee 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -80,10 +80,6 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 			p9_client_clunk(fid);
 			return err;
 		}
-		if (file->f_flags & O_TRUNC) {
-			i_size_write(inode, 0);
-			inode->i_blocks = 0;
-		}
 		if ((file->f_flags & O_APPEND) &&
 			(!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
 			generic_file_llseek(file, 0, SEEK_END);
@@ -133,7 +129,7 @@ out_error:
 static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
 	int res = 0;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 
 	p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
 
@@ -302,7 +298,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 
 static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	int ret = -ENOLCK;
 
 	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
@@ -338,7 +334,7 @@ out_err:
 static int v9fs_file_flock_dotl(struct file *filp, int cmd,
 	struct file_lock *fl)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	int ret = -ENOLCK;
 
 	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
@@ -529,7 +525,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	if (!count)
 		goto out;
 
-	retval = v9fs_file_write_internal(filp->f_path.dentry->d_inode,
+	retval = v9fs_file_write_internal(file_inode(filp),
 					filp->private_data,
 					data, count, &origin, 1);
 	/* update offset on successful write */
@@ -604,7 +600,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct v9fs_inode *v9inode;
 	struct page *page = vmf->page;
 	struct file *filp = vma->vm_file;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 
 
 	p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
@@ -620,6 +616,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	lock_page(page);
 	if (page->mapping != inode->i_mapping)
 		goto out_unlock;
+	wait_for_stable_page(page);
 
 	return VM_FAULT_LOCKED;
 out_unlock:
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 890bed538f9..d86edc8d3fd 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -192,9 +192,6 @@ int v9fs_uflags2omode(int uflags, int extended)
 		break;
 	}
 
-	if (uflags & O_TRUNC)
-		ret |= P9_OTRUNC;
-
 	if (extended) {
 		if (uflags & O_EXCL)
 			ret |= P9_OEXCL;
@@ -228,9 +225,9 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
 	wstat->uid = NULL;
 	wstat->gid = NULL;
 	wstat->muid = NULL;
-	wstat->n_uid = ~0;
-	wstat->n_gid = ~0;
-	wstat->n_muid = ~0;
+	wstat->n_uid = INVALID_UID;
+	wstat->n_gid = INVALID_GID;
+	wstat->n_muid = INVALID_UID;
 	wstat->extension = NULL;
 }
 
@@ -695,9 +692,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 				   "inode creation failed %d\n", err);
 			goto error;
 		}
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
+		v9fs_fid_add(dentry, fid);
 		d_instantiate(dentry, inode);
 	}
 	return ofid;
@@ -793,7 +788,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	struct p9_fid *dfid, *fid;
 	struct inode *inode;
 	char *name;
-	int result = 0;
 
 	p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p flags: %x\n",
 		 dir, dentry->d_name.name, dentry, flags);
@@ -811,13 +805,11 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	name = (char *) dentry->d_name.name;
 	fid = p9_client_walk(dfid, 1, &name, 1);
 	if (IS_ERR(fid)) {
-		result = PTR_ERR(fid);
-		if (result == -ENOENT) {
-			inode = NULL;
-			goto inst_out;
+		if (fid == ERR_PTR(-ENOENT)) {
+			d_add(dentry, NULL);
+			return NULL;
 		}
-
-		return ERR_PTR(result);
+		return ERR_CAST(fid);
 	}
 	/*
 	 * Make sure we don't use a wrong inode due to parallel
@@ -829,14 +821,9 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	else
 		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
 	if (IS_ERR(inode)) {
-		result = PTR_ERR(inode);
-		inode = NULL;
-		goto error;
+		p9_client_clunk(fid);
+		return ERR_CAST(inode);
 	}
-	result = v9fs_fid_add(dentry, fid);
-	if (result < 0)
-		goto error_iput;
-inst_out:
 	/*
 	 * If we had a rename on the server and a parallel lookup
 	 * for the new name, then make sure we instantiate with
@@ -845,15 +832,13 @@ inst_out:
 	 * k/b.
 	 */
 	res = d_materialise_unique(dentry, inode);
-	if (!IS_ERR(res))
-		return res;
-	result = PTR_ERR(res);
-error_iput:
-	iput(inode);
-error:
-	p9_client_clunk(fid);
-
-	return ERR_PTR(result);
+	if (!res)
+		v9fs_fid_add(dentry, fid);
+	else if (!IS_ERR(res))
+		v9fs_fid_add(res, fid);
+	else
+		p9_client_clunk(fid);
+	return res;
 }
 
 static int
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 40895546e10..53687bbf229 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -57,7 +57,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
  * group of the new file system object.
  */
 
-static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
+static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
 {
 	BUG_ON(dir_inode == NULL);
 
@@ -186,7 +186,6 @@ static int v9fs_mapped_dotl_flags(int flags)
 		{ O_CREAT,	P9_DOTL_CREATE },
 		{ O_EXCL,	P9_DOTL_EXCL },
 		{ O_NOCTTY,	P9_DOTL_NOCTTY },
-		{ O_TRUNC,	P9_DOTL_TRUNC },
 		{ O_APPEND,	P9_DOTL_APPEND },
 		{ O_NONBLOCK,	P9_DOTL_NONBLOCK },
 		{ O_DSYNC,	P9_DOTL_DSYNC },
@@ -246,7 +245,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 			  int *opened)
 {
 	int err = 0;
-	gid_t gid;
+	kgid_t gid;
 	umode_t mode;
 	char *name = NULL;
 	struct p9_qid qid;
@@ -268,8 +267,14 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	}
 
 	/* Only creates */
-	if (!(flags & O_CREAT) || dentry->d_inode)
-		return finish_no_open(file, res);
+	if (!(flags & O_CREAT))
+		return	finish_no_open(file, res);
+	else if (dentry->d_inode) {
+		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+			return -EEXIST;
+		else
+			return finish_no_open(file, res);
+	}
 
 	v9ses = v9fs_inode2v9ses(dir);
 
@@ -325,13 +330,11 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 		p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
 		goto error;
 	}
-	err = v9fs_fid_add(dentry, fid);
-	if (err < 0)
-		goto error;
-	d_instantiate(dentry, inode);
-
 	/* Now set the ACL based on the default value */
-	v9fs_set_create_acl(dentry, &dacl, &pacl);
+	v9fs_set_create_acl(inode, fid, dacl, pacl);
+
+	v9fs_fid_add(dentry, fid);
+	d_instantiate(dentry, inode);
 
 	v9inode = V9FS_I(inode);
 	mutex_lock(&v9inode->v_mutex);
@@ -364,6 +367,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 #endif
 	*opened |= FILE_CREATED;
 out:
+	v9fs_put_acl(dacl, pacl);
 	dput(res);
 	return err;
 
@@ -373,7 +377,6 @@ error:
 err_clunk_old_fid:
 	if (ofid)
 		p9_client_clunk(ofid);
-	v9fs_set_create_acl(NULL, &dacl, &pacl);
 	goto out;
 }
 
@@ -391,7 +394,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 	int err;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL, *dfid = NULL;
-	gid_t gid;
+	kgid_t gid;
 	char *name;
 	umode_t mode;
 	struct inode *inode;
@@ -430,17 +433,17 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 	if (err < 0)
 		goto error;
 
+	fid = p9_client_walk(dfid, 1, &name, 1);
+	if (IS_ERR(fid)) {
+		err = PTR_ERR(fid);
+		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+			 err);
+		fid = NULL;
+		goto error;
+	}
+
 	/* instantiate inode and assign the unopened fid to the dentry */
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				 err);
-			fid = NULL;
-			goto error;
-		}
-
 		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
@@ -448,11 +451,11 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 				 err);
 			goto error;
 		}
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
+		v9fs_fid_add(dentry, fid);
+		v9fs_set_create_acl(inode, fid, dacl, pacl);
 		d_instantiate(dentry, inode);
 		fid = NULL;
+		err = 0;
 	} else {
 		/*
 		 * Not in cached mode. No need to populate
@@ -464,16 +467,15 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 			err = PTR_ERR(inode);
 			goto error;
 		}
+		v9fs_set_create_acl(inode, fid, dacl, pacl);
 		d_instantiate(dentry, inode);
 	}
-	/* Now set the ACL based on the default value */
-	v9fs_set_create_acl(dentry, &dacl, &pacl);
 	inc_nlink(dir);
 	v9fs_invalidate_inode_attr(dir);
 error:
 	if (fid)
 		p9_client_clunk(fid);
-	v9fs_set_create_acl(NULL, &dacl, &pacl);
+	v9fs_put_acl(dacl, pacl);
 	return err;
 }
 
@@ -567,10 +569,11 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
 	struct p9_iattr_dotl p9attr;
+	struct inode *inode = dentry->d_inode;
 
 	p9_debug(P9_DEBUG_VFS, "\n");
 
-	retval = inode_change_ok(dentry->d_inode, iattr);
+	retval = inode_change_ok(inode, iattr);
 	if (retval)
 		return retval;
 
@@ -591,23 +594,23 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 		return PTR_ERR(fid);
 
 	/* Write all dirty data */
-	if (S_ISREG(dentry->d_inode->i_mode))
-		filemap_write_and_wait(dentry->d_inode->i_mapping);
+	if (S_ISREG(inode->i_mode))
+		filemap_write_and_wait(inode->i_mapping);
 
 	retval = p9_client_setattr(fid, &p9attr);
 	if (retval < 0)
 		return retval;
 
 	if ((iattr->ia_valid & ATTR_SIZE) &&
-	    iattr->ia_size != i_size_read(dentry->d_inode))
-		truncate_setsize(dentry->d_inode, iattr->ia_size);
+	    iattr->ia_size != i_size_read(inode))
+		truncate_setsize(inode, iattr->ia_size);
 
-	v9fs_invalidate_inode_attr(dentry->d_inode);
-	setattr_copy(dentry->d_inode, iattr);
-	mark_inode_dirty(dentry->d_inode);
+	v9fs_invalidate_inode_attr(inode);
+	setattr_copy(inode, iattr);
+	mark_inode_dirty(inode);
 	if (iattr->ia_valid & ATTR_MODE) {
 		/* We also want to update ACL when we update mode bits */
-		retval = v9fs_acl_chmod(dentry);
+		retval = v9fs_acl_chmod(inode, fid);
 		if (retval < 0)
 			return retval;
 	}
@@ -692,7 +695,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 		const char *symname)
 {
 	int err;
-	gid_t gid;
+	kgid_t gid;
 	char *name;
 	struct p9_qid qid;
 	struct inode *inode;
@@ -741,11 +744,10 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 				 err);
 			goto error;
 		}
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
+		v9fs_fid_add(dentry, fid);
 		d_instantiate(dentry, inode);
 		fid = NULL;
+		err = 0;
 	} else {
 		/* Not in cached mode. No need to populate inode with stat */
 		inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0);
@@ -832,7 +834,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
 		dev_t rdev)
 {
 	int err;
-	gid_t gid;
+	kgid_t gid;
 	char *name;
 	umode_t mode;
 	struct v9fs_session_info *v9ses;
@@ -875,17 +877,17 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
 		goto error;
 
 	v9fs_invalidate_inode_attr(dir);
+	fid = p9_client_walk(dfid, 1, &name, 1);
+	if (IS_ERR(fid)) {
+		err = PTR_ERR(fid);
+		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
+			 err);
+		fid = NULL;
+		goto error;
+	}
+
 	/* instantiate inode and assign the unopened fid to the dentry */
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				 err);
-			fid = NULL;
-			goto error;
-		}
-
 		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
@@ -893,11 +895,11 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
 				 err);
 			goto error;
 		}
-		err = v9fs_fid_add(dentry, fid);
-		if (err < 0)
-			goto error;
+		v9fs_set_create_acl(inode, fid, dacl, pacl);
+		v9fs_fid_add(dentry, fid);
 		d_instantiate(dentry, inode);
 		fid = NULL;
+		err = 0;
 	} else {
 		/*
 		 * Not in cached mode. No need to populate inode with stat.
@@ -908,14 +910,13 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
 			err = PTR_ERR(inode);
 			goto error;
 		}
+		v9fs_set_create_acl(inode, fid, dacl, pacl);
 		d_instantiate(dentry, inode);
 	}
-	/* Now set the ACL based on the default value */
-	v9fs_set_create_acl(dentry, &dacl, &pacl);
 error:
 	if (fid)
 		p9_client_clunk(fid);
-	v9fs_set_create_acl(NULL, &dacl, &pacl);
+	v9fs_put_acl(dacl, pacl);
 	return err;
 }
 
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 137d5039689..2756dcd5de6 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -363,5 +363,6 @@ struct file_system_type v9fs_fs_type = {
 	.mount = v9fs_mount,
 	.kill_sb = v9fs_kill_super,
 	.owner = THIS_MODULE,
-	.fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT,
+	.fs_flags = FS_RENAME_DOES_D_MOVE,
 };
+MODULE_ALIAS_FS("9p");
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 29653b70a9c..c45e016b190 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -111,19 +111,26 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
 int v9fs_xattr_set(struct dentry *dentry, const char *name,
 		   const void *value, size_t value_len, int flags)
 {
+	struct p9_fid *fid = v9fs_fid_lookup(dentry);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+	return v9fs_fid_xattr_set(fid, name, value, value_len, flags);
+}
+
+int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
+		   const void *value, size_t value_len, int flags)
+{
 	u64 offset = 0;
 	int retval, msize, write_count;
-	struct p9_fid *fid = NULL;
 
 	p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
 		 name, value_len, flags);
 
-	fid = v9fs_fid_clone(dentry);
-	if (IS_ERR(fid)) {
-		retval = PTR_ERR(fid);
-		fid = NULL;
-		goto error;
-	}
+	/* Clone it */
+	fid = p9_client_walk(fid, 0, NULL, 1);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
 	/*
 	 * On success fid points to xattr
 	 */
@@ -131,7 +138,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
 	if (retval < 0) {
 		p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n",
 			 retval);
-		goto error;
+		p9_client_clunk(fid);
+		return retval;
 	}
 	msize = fid->clnt->msize;
 	while (value_len) {
@@ -144,17 +152,12 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
 		if (write_count < 0) {
 			/* error in xattr write */
 			retval = write_count;
-			goto error;
+			break;
 		}
 		offset += write_count;
 		value_len -= write_count;
 	}
-	/* Total read xattr bytes */
-	retval = offset;
-error:
-	if (fid)
-		retval = p9_client_clunk(fid);
-	return retval;
+	return p9_client_clunk(fid);
 }
 
 ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index eaa837c53bd..eec348a3df7 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -27,6 +27,8 @@ extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *,
 				  void *, size_t);
 extern ssize_t v9fs_xattr_get(struct dentry *, const char *,
 			      void *, size_t);
+extern int v9fs_fid_xattr_set(struct p9_fid *, const char *,
+			  const void *, size_t, int);
 extern int v9fs_xattr_set(struct dentry *, const char *,
 			  const void *, size_t, int);
 extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/Kconfig b/fs/Kconfig
index 780725a463b..c229f828eb0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -211,6 +211,7 @@ source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
 source "fs/f2fs/Kconfig"
+source "fs/efivarfs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 0efd1524b97..370b24cee4d 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -65,6 +65,20 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS
 	  This config option changes the default setting of coredump_filter
 	  seen at boot time.  If unsure, say Y.
 
+config BINFMT_SCRIPT
+	tristate "Kernel support for scripts starting with #!"
+	default y
+	help
+	  Say Y here if you want to execute interpreted scripts starting with
+	  #! followed by the path to an interpreter.
+
+	  You can build this support as a module; however, until that module
+	  gets loaded, you cannot run scripts.  Thus, if you want to load this
+	  module from an initramfs, the portion of the initramfs before loading
+	  this module must consist of compiled binaries only.
+
+	  Most systems will not boot if you say M or N here.  If unsure, say Y.
+
 config BINFMT_FLAT
 	bool "Kernel support for flat binaries"
 	depends on !MMU && (!FRV || BROKEN)
diff --git a/fs/Makefile b/fs/Makefile
index 9d53192236f..4fe6df3ec28 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -7,10 +7,10 @@
 
 obj-y :=	open.o read_write.o file_table.o super.o \
 		char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
-		ioctl.o readdir.o select.o fifo.o dcache.o inode.o \
+		ioctl.o readdir.o select.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
-		pnode.o drop_caches.o splice.o sync.o utimes.o \
+		pnode.o splice.o sync.o utimes.o \
 		stack.o fs_struct.o statfs.o
 
 ifeq ($(CONFIG_BLOCK),y)
@@ -34,10 +34,7 @@ obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
 obj-$(CONFIG_BINFMT_EM86)	+= binfmt_em86.o
 obj-$(CONFIG_BINFMT_MISC)	+= binfmt_misc.o
-
-# binfmt_script is always there
-obj-y				+= binfmt_script.o
-
+obj-$(CONFIG_BINFMT_SCRIPT)	+= binfmt_script.o
 obj-$(CONFIG_BINFMT_ELF)	+= binfmt_elf.o
 obj-$(CONFIG_COMPAT_BINFMT_ELF)	+= compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC)	+= binfmt_elf_fdpic.o
@@ -49,6 +46,7 @@ obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o xattr_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 obj-$(CONFIG_COREDUMP)		+= coredump.o
+obj-$(CONFIG_SYSCTL)		+= drop_caches.o
 
 obj-$(CONFIG_FHANDLE)		+= fhandle.o
 
@@ -127,3 +125,4 @@ obj-$(CONFIG_F2FS_FS)		+= f2fs/
 obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
+obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a7460..c5a7787dd5e 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,6 @@
 config ADFS_FS
-	tristate "ADFS file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
+	tristate "ADFS file system support"
+	depends on BLOCK
 	help
 	  The Acorn Disc Filing System is the standard file system of the
 	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index b3be2e7c564..0d138c0de29 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -17,47 +17,43 @@
 static DEFINE_RWLOCK(adfs_dir_lock);
 
 static int
-adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+adfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
 	struct object_info obj;
 	struct adfs_dir dir;
 	int ret = 0;
 
-	if (filp->f_pos >> 32)
-		goto out;
+	if (ctx->pos >> 32)
+		return 0;
 
 	ret = ops->read(sb, inode->i_ino, inode->i_size, &dir);
 	if (ret)
-		goto out;
+		return ret;
 
-	switch ((unsigned long)filp->f_pos) {
-	case 0:
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(file, ctx))
 			goto free_out;
-		filp->f_pos += 1;
-
-	case 1:
-		if (filldir(dirent, "..", 2, 1, dir.parent_id, DT_DIR) < 0)
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR))
 			goto free_out;
-		filp->f_pos += 1;
-
-	default:
-		break;
+		ctx->pos = 2;
 	}
 
 	read_lock(&adfs_dir_lock);
 
-	ret = ops->setpos(&dir, filp->f_pos - 2);
+	ret = ops->setpos(&dir, ctx->pos - 2);
 	if (ret)
 		goto unlock_out;
 	while (ops->getnext(&dir, &obj) == 0) {
-		if (filldir(dirent, obj.name, obj.name_len,
-			    filp->f_pos, obj.file_id, DT_UNKNOWN) < 0)
-			goto unlock_out;
-		filp->f_pos += 1;
+		if (!dir_emit(ctx, obj.name, obj.name_len,
+			    obj.file_id, DT_UNKNOWN))
+			break;
+		ctx->pos++;
 	}
 
 unlock_out:
@@ -65,8 +61,6 @@ unlock_out:
 
 free_out:
 	ops->free(&dir);
-
-out:
 	return ret;
 }
 
@@ -192,13 +186,12 @@ out:
 const struct file_operations adfs_dir_operations = {
 	.read		= generic_read_dir,
 	.llseek		= generic_file_llseek,
-	.readdir	= adfs_readdir,
+	.iterate	= adfs_readdir,
 	.fsync		= generic_file_fsync,
 };
 
 static int
-adfs_hash(const struct dentry *parent, const struct inode *inode,
-		struct qstr *qstr)
+adfs_hash(const struct dentry *parent, struct qstr *qstr)
 {
 	const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
 	const unsigned char *name;
@@ -234,8 +227,7 @@ adfs_hash(const struct dentry *parent, const struct inode *inode,
  * requirements of the underlying filesystem.
  */
 static int
-adfs_compare(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+adfs_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	int i;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index d5712293579..0ff4bae2c2a 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -524,6 +524,7 @@ static struct file_system_type adfs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("adfs");
 
 static int __init init_adfs_fs(void)
 {
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
index cfad9afb476..a04d9e848d0 100644
--- a/fs/affs/Kconfig
+++ b/fs/affs/Kconfig
@@ -1,6 +1,6 @@
 config AFFS_FS
-	tristate "Amiga FFS file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
+	tristate "Amiga FFS file system support"
+	depends on BLOCK
 	help
 	  The Fast File System (FFS) is the common file system used on hard
 	  disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20).  Say Y
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index eb82ee53ee0..d9a43674cb9 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -125,9 +125,8 @@ static void
 affs_fix_dcache(struct inode *inode, u32 entry_ino)
 {
 	struct dentry *dentry;
-	struct hlist_node *p;
 	spin_lock(&inode->i_lock);
-	hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
+	hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		if (entry_ino == (u32)(long)dentry->d_fsdata) {
 			dentry->d_fsdata = (void *)inode->i_ino;
 			break;
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 8ca8f3a5559..f1eba8c3644 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -15,12 +15,12 @@
 
 #include "affs.h"
 
-static int affs_readdir(struct file *, void *, filldir_t);
+static int affs_readdir(struct file *, struct dir_context *);
 
 const struct file_operations affs_dir_operations = {
 	.read		= generic_read_dir,
 	.llseek		= generic_file_llseek,
-	.readdir	= affs_readdir,
+	.iterate	= affs_readdir,
 	.fsync		= affs_file_fsync,
 };
 
@@ -40,52 +40,35 @@ const struct inode_operations affs_dir_inode_operations = {
 };
 
 static int
-affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+affs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode		*inode = filp->f_path.dentry->d_inode;
+	struct inode		*inode = file_inode(file);
 	struct super_block	*sb = inode->i_sb;
-	struct buffer_head	*dir_bh;
-	struct buffer_head	*fh_bh;
+	struct buffer_head	*dir_bh = NULL;
+	struct buffer_head	*fh_bh = NULL;
 	unsigned char		*name;
 	int			 namelen;
 	u32			 i;
 	int			 hash_pos;
 	int			 chain_pos;
-	u32			 f_pos;
 	u32			 ino;
-	int			 stored;
-	int			 res;
 
-	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)filp->f_pos);
+	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos);
 
-	stored = 0;
-	res    = -EIO;
-	dir_bh = NULL;
-	fh_bh  = NULL;
-	f_pos  = filp->f_pos;
-
-	if (f_pos == 0) {
-		filp->private_data = (void *)0;
-		if (filldir(dirent, ".", 1, f_pos, inode->i_ino, DT_DIR) < 0)
+	if (ctx->pos < 2) {
+		file->private_data = (void *)0;
+		if (!dir_emit_dots(file, ctx))
 			return 0;
-		filp->f_pos = f_pos = 1;
-		stored++;
-	}
-	if (f_pos == 1) {
-		if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_path.dentry), DT_DIR) < 0)
-			return stored;
-		filp->f_pos = f_pos = 2;
-		stored++;
 	}
 
 	affs_lock_dir(inode);
-	chain_pos = (f_pos - 2) & 0xffff;
-	hash_pos  = (f_pos - 2) >> 16;
+	chain_pos = (ctx->pos - 2) & 0xffff;
+	hash_pos  = (ctx->pos - 2) >> 16;
 	if (chain_pos == 0xffff) {
 		affs_warning(sb, "readdir", "More than 65535 entries in chain");
 		chain_pos = 0;
 		hash_pos++;
-		filp->f_pos = ((hash_pos << 16) | chain_pos) + 2;
+		ctx->pos = ((hash_pos << 16) | chain_pos) + 2;
 	}
 	dir_bh = affs_bread(sb, inode->i_ino);
 	if (!dir_bh)
@@ -94,8 +77,8 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	/* If the directory hasn't changed since the last call to readdir(),
 	 * we can jump directly to where we left off.
 	 */
-	ino = (u32)(long)filp->private_data;
-	if (ino && filp->f_version == inode->i_version) {
+	ino = (u32)(long)file->private_data;
+	if (ino && file->f_version == inode->i_version) {
 		pr_debug("AFFS: readdir() left off=%d\n", ino);
 		goto inside;
 	}
@@ -105,7 +88,7 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		fh_bh = affs_bread(sb, ino);
 		if (!fh_bh) {
 			affs_error(sb, "readdir","Cannot read block %d", i);
-			goto readdir_out;
+			return -EIO;
 		}
 		ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 		affs_brelse(fh_bh);
@@ -119,38 +102,34 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]);
 		if (!ino)
 			continue;
-		f_pos = (hash_pos << 16) + 2;
+		ctx->pos = (hash_pos << 16) + 2;
 inside:
 		do {
 			fh_bh = affs_bread(sb, ino);
 			if (!fh_bh) {
 				affs_error(sb, "readdir","Cannot read block %d", ino);
-				goto readdir_done;
+				break;
 			}
 
 			namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
 			name = AFFS_TAIL(sb, fh_bh)->name + 1;
 			pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n",
-				 namelen, name, ino, hash_pos, f_pos);
-			if (filldir(dirent, name, namelen, f_pos, ino, DT_UNKNOWN) < 0)
+				 namelen, name, ino, hash_pos, (u32)ctx->pos);
+			if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
 				goto readdir_done;
-			stored++;
-			f_pos++;
+			ctx->pos++;
 			ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 			affs_brelse(fh_bh);
 			fh_bh = NULL;
 		} while (ino);
 	}
 readdir_done:
-	filp->f_pos = f_pos;
-	filp->f_version = inode->i_version;
-	filp->private_data = (void *)(long)ino;
-	res = stored;
+	file->f_version = inode->i_version;
+	file->private_data = (void *)(long)ino;
 
 readdir_out:
 	affs_brelse(dir_bh);
 	affs_brelse(fh_bh);
 	affs_unlock_dir(inode);
-	pr_debug("AFFS: readdir()=%d\n", stored);
-	return res;
+	return 0;
 }
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index ff65884a783..c36cbb4537a 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,18 +13,12 @@
 typedef int (*toupper_t)(int);
 
 static int	 affs_toupper(int ch);
-static int	 affs_hash_dentry(const struct dentry *,
-		const struct inode *, struct qstr *);
-static int       affs_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int	 affs_hash_dentry(const struct dentry *, struct qstr *);
+static int       affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 static int	 affs_intl_toupper(int ch);
-static int	 affs_intl_hash_dentry(const struct dentry *,
-		const struct inode *, struct qstr *);
-static int       affs_intl_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int	 affs_intl_hash_dentry(const struct dentry *, struct qstr *);
+static int       affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 
 const struct dentry_operations affs_dentry_operations = {
@@ -86,14 +80,12 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
 }
 
 static int
-affs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
 	return __affs_hash_dentry(qstr, affs_toupper);
 }
 static int
-affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
 	return __affs_hash_dentry(qstr, affs_intl_toupper);
 }
@@ -131,15 +123,13 @@ static inline int __affs_compare_dentry(unsigned int len,
 }
 
 static int
-affs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return __affs_compare_dentry(len, str, name, affs_toupper);
 }
 static int
-affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return __affs_compare_dentry(len, str, name, affs_intl_toupper);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index b84dc735250..45161a832bb 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -622,6 +622,7 @@ static struct file_system_type affs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("affs");
 
 static int __init init_affs_fs(void)
 {
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index 8f975f25b48..ebba3b18e5d 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -1,6 +1,6 @@
 config AFS_FS
-	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
-	depends on INET && EXPERIMENTAL
+	tristate "Andrew File System support (AFS)"
+	depends on INET
 	select AF_RXRPC
 	select DNS_RESOLVER
 	help
@@ -22,8 +22,7 @@ config AFS_DEBUG
 	  If unsure, say N.
 
 config AFS_FSCACHE
-	bool "Provide AFS client caching support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	bool "Provide AFS client caching support"
 	depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
 	help
 	  Say Y here if you want AFS data to be cached locally on disk through
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index c548aa346f0..3c462ff6db6 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -119,8 +119,8 @@ struct afs_file_status {
 	u64			size;		/* file size */
 	afs_dataversion_t	data_version;	/* current data version */
 	u32			author;		/* author ID */
-	u32			owner;		/* owner ID */
-	u32			group;		/* group ID */
+	kuid_t			owner;		/* owner ID */
+	kgid_t			group;		/* group ID */
 	afs_access_t		caller_access;	/* access rights for authenticated caller */
 	afs_access_t		anon_access;	/* access rights for unauthenticated caller */
 	umode_t			mode;		/* UNIX mode */
@@ -133,13 +133,6 @@ struct afs_file_status {
 /*
  * AFS file status change request
  */
-struct afs_store_status {
-	u32			mask;		/* which bits of the struct are set */
-	u32			mtime_client;	/* last time client changed data */
-	u32			owner;		/* owner ID */
-	u32			group;		/* group ID */
-	umode_t			mode;		/* UNIX mode */
-};
 
 #define AFS_SET_MTIME		0x01		/* set the mtime */
 #define AFS_SET_OWNER		0x02		/* set the owner ID */
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index db477906ba4..34494fbead0 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -22,7 +22,7 @@
 static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 				 unsigned int flags);
 static int afs_dir_open(struct inode *inode, struct file *file);
-static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
+static int afs_readdir(struct file *file, struct dir_context *ctx);
 static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
 static int afs_d_delete(const struct dentry *dentry);
 static void afs_d_release(struct dentry *dentry);
@@ -43,7 +43,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 const struct file_operations afs_dir_file_operations = {
 	.open		= afs_dir_open,
 	.release	= afs_release,
-	.readdir	= afs_readdir,
+	.iterate	= afs_readdir,
 	.lock		= afs_lock,
 	.llseek		= generic_file_llseek,
 };
@@ -119,9 +119,9 @@ struct afs_dir_page {
 };
 
 struct afs_lookup_cookie {
+	struct dir_context ctx;
 	struct afs_fid	fid;
-	const char	*name;
-	size_t		nlen;
+	struct qstr name;
 	int		found;
 };
 
@@ -228,20 +228,18 @@ static int afs_dir_open(struct inode *inode, struct file *file)
 /*
  * deal with one block in an AFS directory
  */
-static int afs_dir_iterate_block(unsigned *fpos,
+static int afs_dir_iterate_block(struct dir_context *ctx,
 				 union afs_dir_block *block,
-				 unsigned blkoff,
-				 void *cookie,
-				 filldir_t filldir)
+				 unsigned blkoff)
 {
 	union afs_dirent *dire;
 	unsigned offset, next, curr;
 	size_t nlen;
-	int tmp, ret;
+	int tmp;
 
-	_enter("%u,%x,%p,,",*fpos,blkoff,block);
+	_enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block);
 
-	curr = (*fpos - blkoff) / sizeof(union afs_dirent);
+	curr = (ctx->pos - blkoff) / sizeof(union afs_dirent);
 
 	/* walk through the block, an entry at a time */
 	for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries;
@@ -256,7 +254,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
 			_debug("ENT[%Zu.%u]: unused",
 			       blkoff / sizeof(union afs_dir_block), offset);
 			if (offset >= curr)
-				*fpos = blkoff +
+				ctx->pos = blkoff +
 					next * sizeof(union afs_dirent);
 			continue;
 		}
@@ -302,19 +300,15 @@ static int afs_dir_iterate_block(unsigned *fpos,
 			continue;
 
 		/* found the next entry */
-		ret = filldir(cookie,
-			      dire->u.name,
-			      nlen,
-			      blkoff + offset * sizeof(union afs_dirent),
+		if (!dir_emit(ctx, dire->u.name, nlen,
 			      ntohl(dire->u.vnode),
-			      filldir == afs_lookup_filldir ?
-			      ntohl(dire->u.unique) : DT_UNKNOWN);
-		if (ret < 0) {
+			      ctx->actor == afs_lookup_filldir ?
+			      ntohl(dire->u.unique) : DT_UNKNOWN)) {
 			_leave(" = 0 [full]");
 			return 0;
 		}
 
-		*fpos = blkoff + next * sizeof(union afs_dirent);
+		ctx->pos = blkoff + next * sizeof(union afs_dirent);
 	}
 
 	_leave(" = 1 [more]");
@@ -324,8 +318,8 @@ static int afs_dir_iterate_block(unsigned *fpos,
 /*
  * iterate through the data blob that lists the contents of an AFS directory
  */
-static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
-			   filldir_t filldir, struct key *key)
+static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
+			   struct key *key)
 {
 	union afs_dir_block *dblock;
 	struct afs_dir_page *dbuf;
@@ -333,7 +327,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 	unsigned blkoff, limit;
 	int ret;
 
-	_enter("{%lu},%u,,", dir->i_ino, *fpos);
+	_enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos);
 
 	if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
 		_leave(" = -ESTALE");
@@ -341,13 +335,13 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 	}
 
 	/* round the file position up to the next entry boundary */
-	*fpos += sizeof(union afs_dirent) - 1;
-	*fpos &= ~(sizeof(union afs_dirent) - 1);
+	ctx->pos += sizeof(union afs_dirent) - 1;
+	ctx->pos &= ~(sizeof(union afs_dirent) - 1);
 
 	/* walk through the blocks in sequence */
 	ret = 0;
-	while (*fpos < dir->i_size) {
-		blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1);
+	while (ctx->pos < dir->i_size) {
+		blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1);
 
 		/* fetch the appropriate page from the directory */
 		page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
@@ -364,8 +358,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 		do {
 			dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
 					       sizeof(union afs_dir_block)];
-			ret = afs_dir_iterate_block(fpos, dblock, blkoff,
-						    cookie, filldir);
+			ret = afs_dir_iterate_block(ctx, dblock, blkoff);
 			if (ret != 1) {
 				afs_dir_put_page(page);
 				goto out;
@@ -373,7 +366,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 
 			blkoff += sizeof(union afs_dir_block);
 
-		} while (*fpos < dir->i_size && blkoff < limit);
+		} while (ctx->pos < dir->i_size && blkoff < limit);
 
 		afs_dir_put_page(page);
 		ret = 0;
@@ -387,23 +380,10 @@ out:
 /*
  * read an AFS directory
  */
-static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)
+static int afs_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned fpos;
-	int ret;
-
-	_enter("{%Ld,{%lu}}",
-	       file->f_pos, file->f_path.dentry->d_inode->i_ino);
-
-	ASSERT(file->private_data != NULL);
-
-	fpos = file->f_pos;
-	ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos,
-			      cookie, filldir, file->private_data);
-	file->f_pos = fpos;
-
-	_leave(" = %d", ret);
-	return ret;
+	return afs_dir_iterate(file_inode(file), 
+			      ctx, file->private_data);
 }
 
 /*
@@ -416,15 +396,16 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
 {
 	struct afs_lookup_cookie *cookie = _cookie;
 
-	_enter("{%s,%Zu},%s,%u,,%llu,%u",
-	       cookie->name, cookie->nlen, name, nlen,
+	_enter("{%s,%u},%s,%u,,%llu,%u",
+	       cookie->name.name, cookie->name.len, name, nlen,
 	       (unsigned long long) ino, dtype);
 
 	/* insanity checks first */
 	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
 	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
 
-	if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) {
+	if (cookie->name.len != nlen ||
+	    memcmp(cookie->name.name, name, nlen) != 0) {
 		_leave(" = 0 [no]");
 		return 0;
 	}
@@ -444,24 +425,18 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
 static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
 			 struct afs_fid *fid, struct key *key)
 {
-	struct afs_lookup_cookie cookie;
-	struct afs_super_info *as;
-	unsigned fpos;
+	struct afs_super_info *as = dir->i_sb->s_fs_info;
+	struct afs_lookup_cookie cookie = {
+		.ctx.actor = afs_lookup_filldir,
+		.name = dentry->d_name,
+		.fid.vid = as->volume->vid
+	};
 	int ret;
 
 	_enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name);
 
-	as = dir->i_sb->s_fs_info;
-
 	/* search the directory */
-	cookie.name	= dentry->d_name.name;
-	cookie.nlen	= dentry->d_name.len;
-	cookie.fid.vid	= as->volume->vid;
-	cookie.found	= 0;
-
-	fpos = 0;
-	ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir,
-			      key);
+	ret = afs_dir_iterate(dir, &cookie.ctx, key);
 	if (ret < 0) {
 		_leave(" = %d [iter]", ret);
 		return ret;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 8f6e9234d56..66d50fe2ee4 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,7 +19,8 @@
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned long offset);
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);
 
@@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page)
  * - release a page and clean up its private data if offset is 0 (indicating
  *   the entire page)
  */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length)
 {
 	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
 
-	_enter("{%lu},%lu", page->index, offset);
+	_enter("{%lu},%u,%u", page->index, offset, length);
 
 	BUG_ON(!PageLocked(page));
 
 	/* we clean up only if the entire page is being invalidated */
-	if (offset == 0) {
+	if (offset == 0 && length == PAGE_CACHE_SIZE) {
 #ifdef CONFIG_AFS_FSCACHE
 		if (PageFsCache(page)) {
 			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 757d664575d..a8cf2cff836 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -252,7 +252,8 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
  */
 static int afs_do_setlk(struct file *file, struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	struct inode *inode = file_inode(file);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_lock_type_t type;
 	struct key *key = file->private_data;
 	int ret;
@@ -273,7 +274,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 
 	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 
 	/* make sure we've got a callback on this file and that our view of the
 	 * data version is up to date */
@@ -420,7 +421,7 @@ given_lock:
 	afs_vnode_fetch_status(vnode, NULL, key);
 
 error:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	_leave(" = %d", ret);
 	return ret;
 
@@ -514,7 +515,7 @@ error:
  */
 int afs_lock(struct file *file, int cmd, struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
 
 	_enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
 	       vnode->fid.vid, vnode->fid.vnode, cmd,
@@ -537,7 +538,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
  */
 int afs_flock(struct file *file, int cmd, struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
 
 	_enter("{%x:%u},%d,{t=%x,fl=%x}",
 	       vnode->fid.vid, vnode->fid.vnode, cmd,
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index b960ff05ea0..c2e930ec288 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -42,6 +42,8 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	umode_t mode;
 	u64 data_version, size;
 	u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
+	kuid_t owner;
+	kgid_t group;
 
 #define EXTRACT(DST)				\
 	do {					\
@@ -56,7 +58,9 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	size = ntohl(*bp++);
 	data_version = ntohl(*bp++);
 	EXTRACT(status->author);
-	EXTRACT(status->owner);
+	owner = make_kuid(&init_user_ns, ntohl(*bp++));
+	changed |= !uid_eq(owner, status->owner);
+	status->owner = owner;
 	EXTRACT(status->caller_access); /* call ticket dependent */
 	EXTRACT(status->anon_access);
 	EXTRACT(status->mode);
@@ -65,7 +69,9 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	bp++; /* seg size */
 	status->mtime_client = ntohl(*bp++);
 	status->mtime_server = ntohl(*bp++);
-	EXTRACT(status->group);
+	group = make_kgid(&init_user_ns, ntohl(*bp++));
+	changed |= !gid_eq(group, status->group);
+	status->group = group;
 	bp++; /* sync counter */
 	data_version |= (u64) ntohl(*bp++) << 32;
 	EXTRACT(status->lock_count);
@@ -181,12 +187,12 @@ static void xdr_encode_AFS_StoreStatus(__be32 **_bp, struct iattr *attr)
 
 	if (attr->ia_valid & ATTR_UID) {
 		mask |= AFS_SET_OWNER;
-		owner = attr->ia_uid;
+		owner = from_kuid(&init_user_ns, attr->ia_uid);
 	}
 
 	if (attr->ia_valid & ATTR_GID) {
 		mask |= AFS_SET_GROUP;
-		group = attr->ia_gid;
+		group = from_kgid(&init_user_ns, attr->ia_gid);
 	}
 
 	if (attr->ia_valid & ATTR_MODE) {
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 95cffd38239..789bc253b5f 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -69,7 +69,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 
 	set_nlink(inode, vnode->status.nlink);
 	inode->i_uid		= vnode->status.owner;
-	inode->i_gid		= 0;
+	inode->i_gid		= GLOBAL_ROOT_GID;
 	inode->i_size		= vnode->status.size;
 	inode->i_ctime.tv_sec	= vnode->status.mtime_server;
 	inode->i_ctime.tv_nsec	= 0;
@@ -175,8 +175,8 @@ struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
 	inode->i_mode		= S_IFDIR | S_IRUGO | S_IXUGO;
 	inode->i_op		= &afs_autocell_inode_operations;
 	set_nlink(inode, 2);
-	inode->i_uid		= 0;
-	inode->i_gid		= 0;
+	inode->i_uid		= GLOBAL_ROOT_UID;
+	inode->i_gid		= GLOBAL_ROOT_GID;
 	inode->i_ctime.tv_sec	= get_seconds();
 	inode->i_ctime.tv_nsec	= 0;
 	inode->i_atime		= inode->i_mtime = inode->i_ctime;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 096b23f821a..526e4bbbde5 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -190,7 +190,7 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
 		return ret;
 
 	m = file->private_data;
-	m->private = PDE(inode)->data;
+	m->private = PDE_DATA(inode);
 
 	return 0;
 }
@@ -448,7 +448,7 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
 	struct seq_file *m;
 	int ret;
 
-	cell = PDE(inode)->data;
+	cell = PDE_DATA(inode);
 	if (!cell)
 		return -ENOENT;
 
@@ -554,7 +554,7 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
 	struct seq_file *m;
 	int ret;
 
-	cell = PDE(inode)->data;
+	cell = PDE_DATA(inode);
 	if (!cell)
 		return -ENOENT;
 
@@ -659,7 +659,7 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
 	struct seq_file *m;
 	int ret;
 
-	cell = PDE(inode)->data;
+	cell = PDE_DATA(inode);
 	if (!cell)
 		return -ENOENT;
 
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 43165009428..c4861557e38 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -24,6 +24,8 @@
 #include <linux/parser.h>
 #include <linux/statfs.h>
 #include <linux/sched.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
 #include "internal.h"
 
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
@@ -43,6 +45,7 @@ struct file_system_type afs_fs_type = {
 	.kill_sb	= afs_kill_super,
 	.fs_flags	= 0,
 };
+MODULE_ALIAS_FS("afs");
 
 static const struct super_operations afs_super_ops = {
 	.statfs		= afs_statfs,
@@ -363,6 +366,10 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,
 
 	memset(&params, 0, sizeof(params));
 
+	ret = -EINVAL;
+	if (current->nsproxy->net_ns != &init_net)
+		goto error;
+
 	/* parse the options and device name */
 	if (options) {
 		ret = afs_parse_options(&params, options, &dev_name);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 9aa52d93c73..a890db4b989 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -14,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
+#include <linux/aio.h>
 #include "internal.h"
 
 static int afs_write_back_from_locked_page(struct afs_writeback *wb,
@@ -120,7 +121,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
 		    struct page **pagep, void **fsdata)
 {
 	struct afs_writeback *candidate, *wb;
-	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
 	struct page *page;
 	struct key *key = file->private_data;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
@@ -245,7 +246,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
 		  loff_t pos, unsigned len, unsigned copied,
 		  struct page *page, void *fsdata)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
 	loff_t i_size, maybe_i_size;
 
 	_enter("{%x:%u},{%lx}",
@@ -627,8 +628,7 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
 ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
 		       unsigned long nr_segs, loff_t pos)
 {
-	struct dentry *dentry = iocb->ki_filp->f_path.dentry;
-	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
 	ssize_t result;
 	size_t count = iov_length(iov, nr_segs);
 
diff --git a/fs/aio.c b/fs/aio.c
index 71f613cf4a8..9b5ca113741 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -8,6 +8,8 @@
  *
  *	See ../COPYING for licensing terms.
  */
+#define pr_fmt(fmt) "%s: " fmt, __func__
+
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/errno.h>
@@ -18,8 +20,6 @@
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
 
-#define DEBUG 0
-
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/file.h>
@@ -39,11 +39,78 @@
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
 
-#if DEBUG > 1
-#define dprintk		printk
-#else
-#define dprintk(x...)	do { ; } while (0)
-#endif
+#include "internal.h"
+
+#define AIO_RING_MAGIC			0xa10a10a1
+#define AIO_RING_COMPAT_FEATURES	1
+#define AIO_RING_INCOMPAT_FEATURES	0
+struct aio_ring {
+	unsigned	id;	/* kernel internal index number */
+	unsigned	nr;	/* number of io_events */
+	unsigned	head;
+	unsigned	tail;
+
+	unsigned	magic;
+	unsigned	compat_features;
+	unsigned	incompat_features;
+	unsigned	header_length;	/* size of aio_ring */
+
+
+	struct io_event		io_events[0];
+}; /* 128 bytes + ring size */
+
+#define AIO_RING_PAGES	8
+
+struct kioctx {
+	atomic_t		users;
+	atomic_t		dead;
+
+	/* This needs improving */
+	unsigned long		user_id;
+	struct hlist_node	list;
+
+	/*
+	 * This is what userspace passed to io_setup(), it's not used for
+	 * anything but counting against the global max_reqs quota.
+	 *
+	 * The real limit is nr_events - 1, which will be larger (see
+	 * aio_setup_ring())
+	 */
+	unsigned		max_reqs;
+
+	/* Size of ringbuffer, in units of struct io_event */
+	unsigned		nr_events;
+
+	unsigned long		mmap_base;
+	unsigned long		mmap_size;
+
+	struct page		**ring_pages;
+	long			nr_pages;
+
+	struct rcu_head		rcu_head;
+	struct work_struct	rcu_work;
+
+	struct {
+		atomic_t	reqs_active;
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		spinlock_t	ctx_lock;
+		struct list_head active_reqs;	/* used for cancellation */
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		struct mutex	ring_lock;
+		wait_queue_head_t wait;
+	} ____cacheline_aligned_in_smp;
+
+	struct {
+		unsigned	tail;
+		spinlock_t	completion_lock;
+	} ____cacheline_aligned_in_smp;
+
+	struct page		*internal_pages[AIO_RING_PAGES];
+};
 
 /*------ sysctl variables----*/
 static DEFINE_SPINLOCK(aio_nr_lock);
@@ -54,11 +121,6 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request
 static struct kmem_cache	*kiocb_cachep;
 static struct kmem_cache	*kioctx_cachep;
 
-static struct workqueue_struct *aio_wq;
-
-static void aio_kick_handler(struct work_struct *);
-static void aio_queue_work(struct kioctx *);
-
 /* aio_setup
  *	Creates the slab caches used by the aio routines, panic on
  *	failure as this is done early during the boot sequence.
@@ -68,10 +130,7 @@ static int __init aio_setup(void)
 	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
-	aio_wq = alloc_workqueue("aio", 0, 1);	/* used to limit concurrency */
-	BUG_ON(!aio_wq);
-
-	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
+	pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
 
 	return 0;
 }
@@ -79,29 +138,21 @@ __initcall(aio_setup);
 
 static void aio_free_ring(struct kioctx *ctx)
 {
-	struct aio_ring_info *info = &ctx->ring_info;
 	long i;
 
-	for (i=0; i<info->nr_pages; i++)
-		put_page(info->ring_pages[i]);
+	for (i = 0; i < ctx->nr_pages; i++)
+		put_page(ctx->ring_pages[i]);
 
-	if (info->mmap_size) {
-		BUG_ON(ctx->mm != current->mm);
-		vm_munmap(info->mmap_base, info->mmap_size);
-	}
-
-	if (info->ring_pages && info->ring_pages != info->internal_pages)
-		kfree(info->ring_pages);
-	info->ring_pages = NULL;
-	info->nr = 0;
+	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
+		kfree(ctx->ring_pages);
 }
 
 static int aio_setup_ring(struct kioctx *ctx)
 {
 	struct aio_ring *ring;
-	struct aio_ring_info *info = &ctx->ring_info;
 	unsigned nr_events = ctx->max_reqs;
-	unsigned long size;
+	struct mm_struct *mm = current->mm;
+	unsigned long size, populate;
 	int nr_pages;
 
 	/* Compensate for the ring buffer's head/tail overlap entry */
@@ -116,43 +167,44 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
 
-	info->nr = 0;
-	info->ring_pages = info->internal_pages;
+	ctx->nr_events = 0;
+	ctx->ring_pages = ctx->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
-		info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
-		if (!info->ring_pages)
+		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
+					  GFP_KERNEL);
+		if (!ctx->ring_pages)
 			return -ENOMEM;
 	}
 
-	info->mmap_size = nr_pages * PAGE_SIZE;
-	dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
-	down_write(&ctx->mm->mmap_sem);
-	info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, 
-					PROT_READ|PROT_WRITE,
-					MAP_ANONYMOUS|MAP_PRIVATE, 0);
-	if (IS_ERR((void *)info->mmap_base)) {
-		up_write(&ctx->mm->mmap_sem);
-		info->mmap_size = 0;
+	ctx->mmap_size = nr_pages * PAGE_SIZE;
+	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+	down_write(&mm->mmap_sem);
+	ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
+				       PROT_READ|PROT_WRITE,
+				       MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+	if (IS_ERR((void *)ctx->mmap_base)) {
+		up_write(&mm->mmap_sem);
+		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
 
-	dprintk("mmap address: 0x%08lx\n", info->mmap_base);
-	info->nr_pages = get_user_pages(current, ctx->mm,
-					info->mmap_base, nr_pages, 
-					1, 0, info->ring_pages, NULL);
-	up_write(&ctx->mm->mmap_sem);
+	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
+	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
+				       1, 0, ctx->ring_pages, NULL);
+	up_write(&mm->mmap_sem);
 
-	if (unlikely(info->nr_pages != nr_pages)) {
+	if (unlikely(ctx->nr_pages != nr_pages)) {
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
+	if (populate)
+		mm_populate(ctx->mmap_base, populate);
 
-	ctx->user_id = info->mmap_base;
-
-	info->nr = nr_events;		/* trusted copy */
+	ctx->user_id = ctx->mmap_base;
+	ctx->nr_events = nr_events; /* trusted copy */
 
-	ring = kmap_atomic(info->ring_pages[0]);
+	ring = kmap_atomic(ctx->ring_pages[0]);
 	ring->nr = nr_events;	/* user copy */
 	ring->id = ctx->user_id;
 	ring->head = ring->tail = 0;
@@ -161,72 +213,130 @@ static int aio_setup_ring(struct kioctx *ctx)
 	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
 	ring->header_length = sizeof(struct aio_ring);
 	kunmap_atomic(ring);
+	flush_dcache_page(ctx->ring_pages[0]);
 
 	return 0;
 }
 
-
-/* aio_ring_event: returns a pointer to the event at the given index from
- * kmap_atomic().  Release the pointer with put_aio_ring_event();
- */
 #define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
 #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
 #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
 
-#define aio_ring_event(info, nr) ({					\
-	unsigned pos = (nr) + AIO_EVENTS_OFFSET;			\
-	struct io_event *__event;					\
-	__event = kmap_atomic(						\
-			(info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \
-	__event += pos % AIO_EVENTS_PER_PAGE;				\
-	__event;							\
-})
-
-#define put_aio_ring_event(event) do {		\
-	struct io_event *__event = (event);	\
-	(void)__event;				\
-	kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \
-} while(0)
-
-static void ctx_rcu_free(struct rcu_head *head)
+void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
+{
+	struct kioctx *ctx = req->ki_ctx;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->ctx_lock, flags);
+
+	if (!req->ki_list.next)
+		list_add(&req->ki_list, &ctx->active_reqs);
+
+	req->ki_cancel = cancel;
+
+	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+}
+EXPORT_SYMBOL(kiocb_set_cancel_fn);
+
+static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
+			struct io_event *res)
+{
+	kiocb_cancel_fn *old, *cancel;
+	int ret = -EINVAL;
+
+	/*
+	 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
+	 * actually has a cancel function, hence the cmpxchg()
+	 */
+
+	cancel = ACCESS_ONCE(kiocb->ki_cancel);
+	do {
+		if (!cancel || cancel == KIOCB_CANCELLED)
+			return ret;
+
+		old = cancel;
+		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
+	} while (cancel != old);
+
+	atomic_inc(&kiocb->ki_users);
+	spin_unlock_irq(&ctx->ctx_lock);
+
+	memset(res, 0, sizeof(*res));
+	res->obj = (u64)(unsigned long)kiocb->ki_obj.user;
+	res->data = kiocb->ki_user_data;
+	ret = cancel(kiocb, res);
+
+	spin_lock_irq(&ctx->ctx_lock);
+
+	return ret;
+}
+
+static void free_ioctx_rcu(struct rcu_head *head)
 {
 	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
 
-/* __put_ioctx
- *	Called when the last user of an aio context has gone away,
- *	and the struct needs to be freed.
+/*
+ * When this function runs, the kioctx has been removed from the "hash table"
+ * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
+ * now it's safe to cancel any that need to be.
  */
-static void __put_ioctx(struct kioctx *ctx)
+static void free_ioctx(struct kioctx *ctx)
 {
-	unsigned nr_events = ctx->max_reqs;
-	BUG_ON(ctx->reqs_active);
+	struct aio_ring *ring;
+	struct io_event res;
+	struct kiocb *req;
+	unsigned head, avail;
 
-	cancel_delayed_work_sync(&ctx->wq);
-	aio_free_ring(ctx);
-	mmdrop(ctx->mm);
-	ctx->mm = NULL;
-	if (nr_events) {
-		spin_lock(&aio_nr_lock);
-		BUG_ON(aio_nr - nr_events > aio_nr);
-		aio_nr -= nr_events;
-		spin_unlock(&aio_nr_lock);
+	spin_lock_irq(&ctx->ctx_lock);
+
+	while (!list_empty(&ctx->active_reqs)) {
+		req = list_first_entry(&ctx->active_reqs,
+				       struct kiocb, ki_list);
+
+		list_del_init(&req->ki_list);
+		kiocb_cancel(ctx, req, &res);
 	}
-	pr_debug("__put_ioctx: freeing %p\n", ctx);
-	call_rcu(&ctx->rcu_head, ctx_rcu_free);
-}
 
-static inline int try_get_ioctx(struct kioctx *kioctx)
-{
-	return atomic_inc_not_zero(&kioctx->users);
+	spin_unlock_irq(&ctx->ctx_lock);
+
+	ring = kmap_atomic(ctx->ring_pages[0]);
+	head = ring->head;
+	kunmap_atomic(ring);
+
+	while (atomic_read(&ctx->reqs_active) > 0) {
+		wait_event(ctx->wait,
+				head != ctx->tail ||
+				atomic_read(&ctx->reqs_active) <= 0);
+
+		avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
+
+		atomic_sub(avail, &ctx->reqs_active);
+		head += avail;
+		head %= ctx->nr_events;
+	}
+
+	WARN_ON(atomic_read(&ctx->reqs_active) < 0);
+
+	aio_free_ring(ctx);
+
+	pr_debug("freeing %p\n", ctx);
+
+	/*
+	 * Here the call_rcu() is between the wait_event() for reqs_active to
+	 * hit 0, and freeing the ioctx.
+	 *
+	 * aio_complete() decrements reqs_active, but it has to touch the ioctx
+	 * after to issue a wakeup so we use rcu.
+	 */
+	call_rcu(&ctx->rcu_head, free_ioctx_rcu);
 }
 
-static inline void put_ioctx(struct kioctx *kioctx)
+static void put_ioctx(struct kioctx *ctx)
 {
-	BUG_ON(atomic_read(&kioctx->users) <= 0);
-	if (unlikely(atomic_dec_and_test(&kioctx->users)))
-		__put_ioctx(kioctx);
+	if (unlikely(atomic_dec_and_test(&ctx->users)))
+		free_ioctx(ctx);
 }
 
 /* ioctx_alloc
@@ -234,7 +344,7 @@ static inline void put_ioctx(struct kioctx *kioctx)
  */
 static struct kioctx *ioctx_alloc(unsigned nr_events)
 {
-	struct mm_struct *mm;
+	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx;
 	int err = -ENOMEM;
 
@@ -253,17 +363,15 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-ENOMEM);
 
 	ctx->max_reqs = nr_events;
-	mm = ctx->mm = current->mm;
-	atomic_inc(&mm->mm_count);
 
 	atomic_set(&ctx->users, 2);
+	atomic_set(&ctx->dead, 0);
 	spin_lock_init(&ctx->ctx_lock);
-	spin_lock_init(&ctx->ring_info.ring_lock);
+	spin_lock_init(&ctx->completion_lock);
+	mutex_init(&ctx->ring_lock);
 	init_waitqueue_head(&ctx->wait);
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
-	INIT_LIST_HEAD(&ctx->run_list);
-	INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler);
 
 	if (aio_setup_ring(ctx) < 0)
 		goto out_freectx;
@@ -283,64 +391,63 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
 	spin_unlock(&mm->ioctx_lock);
 
-	dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
-		ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
+	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
+		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;
 
 out_cleanup:
 	err = -EAGAIN;
 	aio_free_ring(ctx);
 out_freectx:
-	mmdrop(mm);
 	kmem_cache_free(kioctx_cachep, ctx);
-	dprintk("aio: error allocating ioctx %d\n", err);
+	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
 }
 
-/* kill_ctx
- *	Cancels all outstanding aio requests on an aio context.  Used 
- *	when the processes owning a context have all exited to encourage 
+static void kill_ioctx_work(struct work_struct *work)
+{
+	struct kioctx *ctx = container_of(work, struct kioctx, rcu_work);
+
+	wake_up_all(&ctx->wait);
+	put_ioctx(ctx);
+}
+
+static void kill_ioctx_rcu(struct rcu_head *head)
+{
+	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+
+	INIT_WORK(&ctx->rcu_work, kill_ioctx_work);
+	schedule_work(&ctx->rcu_work);
+}
+
+/* kill_ioctx
+ *	Cancels all outstanding aio requests on an aio context.  Used
+ *	when the processes owning a context have all exited to encourage
  *	the rapid destruction of the kioctx.
  */
-static void kill_ctx(struct kioctx *ctx)
+static void kill_ioctx(struct kioctx *ctx)
 {
-	int (*cancel)(struct kiocb *, struct io_event *);
-	struct task_struct *tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
-	struct io_event res;
+	if (!atomic_xchg(&ctx->dead, 1)) {
+		hlist_del_rcu(&ctx->list);
 
-	spin_lock_irq(&ctx->ctx_lock);
-	ctx->dead = 1;
-	while (!list_empty(&ctx->active_reqs)) {
-		struct list_head *pos = ctx->active_reqs.next;
-		struct kiocb *iocb = list_kiocb(pos);
-		list_del_init(&iocb->ki_list);
-		cancel = iocb->ki_cancel;
-		kiocbSetCancelled(iocb);
-		if (cancel) {
-			iocb->ki_users++;
-			spin_unlock_irq(&ctx->ctx_lock);
-			cancel(iocb, &res);
-			spin_lock_irq(&ctx->ctx_lock);
-		}
-	}
+		/*
+		 * It'd be more correct to do this in free_ioctx(), after all
+		 * the outstanding kiocbs have finished - but by then io_destroy
+		 * has already returned, so io_setup() could potentially return
+		 * -EAGAIN with no ioctxs actually in use (as far as userspace
+		 *  could tell).
+		 */
+		spin_lock(&aio_nr_lock);
+		BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
+		aio_nr -= ctx->max_reqs;
+		spin_unlock(&aio_nr_lock);
 
-	if (!ctx->reqs_active)
-		goto out;
+		if (ctx->mmap_size)
+			vm_munmap(ctx->mmap_base, ctx->mmap_size);
 
-	add_wait_queue(&ctx->wait, &wait);
-	set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-	while (ctx->reqs_active) {
-		spin_unlock_irq(&ctx->ctx_lock);
-		io_schedule();
-		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
-		spin_lock_irq(&ctx->ctx_lock);
+		/* Between hlist_del_rcu() and dropping the initial ref */
+		call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
 	}
-	__set_task_state(tsk, TASK_RUNNING);
-	remove_wait_queue(&ctx->wait, &wait);
-
-out:
-	spin_unlock_irq(&ctx->ctx_lock);
 }
 
 /* wait_on_sync_kiocb:
@@ -348,9 +455,9 @@ out:
  */
 ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
 {
-	while (iocb->ki_users) {
+	while (atomic_read(&iocb->ki_users)) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!iocb->ki_users)
+		if (!atomic_read(&iocb->ki_users))
 			break;
 		io_schedule();
 	}
@@ -359,28 +466,26 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
 }
 EXPORT_SYMBOL(wait_on_sync_kiocb);
 
-/* exit_aio: called when the last user of mm goes away.  At this point, 
- * there is no way for any new requests to be submited or any of the 
- * io_* syscalls to be called on the context.  However, there may be 
- * outstanding requests which hold references to the context; as they 
- * go away, they will call put_ioctx and release any pinned memory
- * associated with the request (held via struct page * references).
+/*
+ * exit_aio: called when the last user of mm goes away.  At this point, there is
+ * no way for any new requests to be submited or any of the io_* syscalls to be
+ * called on the context.
+ *
+ * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
+ * them.
  */
 void exit_aio(struct mm_struct *mm)
 {
 	struct kioctx *ctx;
+	struct hlist_node *n;
 
-	while (!hlist_empty(&mm->ioctx_list)) {
-		ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
-		hlist_del_rcu(&ctx->list);
-
-		kill_ctx(ctx);
-
+	hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
 		if (1 != atomic_read(&ctx->users))
 			printk(KERN_DEBUG
 				"exit_aio:ioctx still alive: %d %d %d\n",
-				atomic_read(&ctx->users), ctx->dead,
-				ctx->reqs_active);
+				atomic_read(&ctx->users),
+				atomic_read(&ctx->dead),
+				atomic_read(&ctx->reqs_active));
 		/*
 		 * We don't need to bother with munmap() here -
 		 * exit_mmap(mm) is coming and it'll unmap everything.
@@ -388,150 +493,50 @@ void exit_aio(struct mm_struct *mm)
 		 * as indicator that it needs to unmap the area,
 		 * just set it to 0; aio_free_ring() is the only
 		 * place that uses ->mmap_size, so it's safe.
-		 * That way we get all munmap done to current->mm -
-		 * all other callers have ctx->mm == current->mm.
 		 */
-		ctx->ring_info.mmap_size = 0;
-		put_ioctx(ctx);
+		ctx->mmap_size = 0;
+
+		kill_ioctx(ctx);
 	}
 }
 
 /* aio_get_req
- *	Allocate a slot for an aio request.  Increments the users count
+ *	Allocate a slot for an aio request.  Increments the ki_users count
  * of the kioctx so that the kioctx stays around until all requests are
  * complete.  Returns NULL if no requests are free.
  *
- * Returns with kiocb->users set to 2.  The io submit code path holds
+ * Returns with kiocb->ki_users set to 2.  The io submit code path holds
  * an extra reference while submitting the i/o.
  * This prevents races between the aio code path referencing the
  * req (after submitting it) and aio_complete() freeing the req.
  */
-static struct kiocb *__aio_get_req(struct kioctx *ctx)
+static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
-	struct kiocb *req = NULL;
+	struct kiocb *req;
 
-	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
-	if (unlikely(!req))
+	if (atomic_read(&ctx->reqs_active) >= ctx->nr_events)
 		return NULL;
 
-	req->ki_flags = 0;
-	req->ki_users = 2;
-	req->ki_key = 0;
-	req->ki_ctx = ctx;
-	req->ki_cancel = NULL;
-	req->ki_retry = NULL;
-	req->ki_dtor = NULL;
-	req->private = NULL;
-	req->ki_iovec = NULL;
-	INIT_LIST_HEAD(&req->ki_run_list);
-	req->ki_eventfd = NULL;
-
-	return req;
-}
-
-/*
- * struct kiocb's are allocated in batches to reduce the number of
- * times the ctx lock is acquired and released.
- */
-#define KIOCB_BATCH_SIZE	32L
-struct kiocb_batch {
-	struct list_head head;
-	long count; /* number of requests left to allocate */
-};
-
-static void kiocb_batch_init(struct kiocb_batch *batch, long total)
-{
-	INIT_LIST_HEAD(&batch->head);
-	batch->count = total;
-}
-
-static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch)
-{
-	struct kiocb *req, *n;
-
-	if (list_empty(&batch->head))
-		return;
-
-	spin_lock_irq(&ctx->ctx_lock);
-	list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
-		list_del(&req->ki_batch);
-		list_del(&req->ki_list);
-		kmem_cache_free(kiocb_cachep, req);
-		ctx->reqs_active--;
-	}
-	if (unlikely(!ctx->reqs_active && ctx->dead))
-		wake_up_all(&ctx->wait);
-	spin_unlock_irq(&ctx->ctx_lock);
-}
-
-/*
- * Allocate a batch of kiocbs.  This avoids taking and dropping the
- * context lock a lot during setup.
- */
-static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch)
-{
-	unsigned short allocated, to_alloc;
-	long avail;
-	struct kiocb *req, *n;
-	struct aio_ring *ring;
-
-	to_alloc = min(batch->count, KIOCB_BATCH_SIZE);
-	for (allocated = 0; allocated < to_alloc; allocated++) {
-		req = __aio_get_req(ctx);
-		if (!req)
-			/* allocation failed, go with what we've got */
-			break;
-		list_add(&req->ki_batch, &batch->head);
-	}
-
-	if (allocated == 0)
-		goto out;
-
-	spin_lock_irq(&ctx->ctx_lock);
-	ring = kmap_atomic(ctx->ring_info.ring_pages[0]);
-
-	avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active;
-	BUG_ON(avail < 0);
-	if (avail < allocated) {
-		/* Trim back the number of requests. */
-		list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
-			list_del(&req->ki_batch);
-			kmem_cache_free(kiocb_cachep, req);
-			if (--allocated <= avail)
-				break;
-		}
-	}
-
-	batch->count -= allocated;
-	list_for_each_entry(req, &batch->head, ki_batch) {
-		list_add(&req->ki_list, &ctx->active_reqs);
-		ctx->reqs_active++;
-	}
-
-	kunmap_atomic(ring);
-	spin_unlock_irq(&ctx->ctx_lock);
+	if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1)
+		goto out_put;
 
-out:
-	return allocated;
-}
+	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
+	if (unlikely(!req))
+		goto out_put;
 
-static inline struct kiocb *aio_get_req(struct kioctx *ctx,
-					struct kiocb_batch *batch)
-{
-	struct kiocb *req;
+	atomic_set(&req->ki_users, 2);
+	req->ki_ctx = ctx;
 
-	if (list_empty(&batch->head))
-		if (kiocb_batch_refill(ctx, batch) == 0)
-			return NULL;
-	req = list_first_entry(&batch->head, struct kiocb, ki_batch);
-	list_del(&req->ki_batch);
 	return req;
+out_put:
+	atomic_dec(&ctx->reqs_active);
+	return NULL;
 }
 
-static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
+static void kiocb_free(struct kiocb *req)
 {
-	assert_spin_locked(&ctx->ctx_lock);
-
+	if (req->ki_filp)
+		fput(req->ki_filp);
 	if (req->ki_eventfd != NULL)
 		eventfd_ctx_put(req->ki_eventfd);
 	if (req->ki_dtor)
@@ -539,48 +544,12 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
 	if (req->ki_iovec != &req->ki_inline_vec)
 		kfree(req->ki_iovec);
 	kmem_cache_free(kiocb_cachep, req);
-	ctx->reqs_active--;
-
-	if (unlikely(!ctx->reqs_active && ctx->dead))
-		wake_up_all(&ctx->wait);
 }
 
-/* __aio_put_req
- *	Returns true if this put was the last user of the request.
- */
-static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
+void aio_put_req(struct kiocb *req)
 {
-	dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
-		req, atomic_long_read(&req->ki_filp->f_count));
-
-	assert_spin_locked(&ctx->ctx_lock);
-
-	req->ki_users--;
-	BUG_ON(req->ki_users < 0);
-	if (likely(req->ki_users))
-		return 0;
-	list_del(&req->ki_list);		/* remove from active_reqs */
-	req->ki_cancel = NULL;
-	req->ki_retry = NULL;
-
-	fput(req->ki_filp);
-	req->ki_filp = NULL;
-	really_put_req(ctx, req);
-	return 1;
-}
-
-/* aio_put_req
- *	Returns true if this put was the last user of the kiocb,
- *	false if the request is still in use.
- */
-int aio_put_req(struct kiocb *req)
-{
-	struct kioctx *ctx = req->ki_ctx;
-	int ret;
-	spin_lock_irq(&ctx->ctx_lock);
-	ret = __aio_put_req(ctx, req);
-	spin_unlock_irq(&ctx->ctx_lock);
-	return ret;
+	if (atomic_dec_and_test(&req->ki_users))
+		kiocb_free(req);
 }
 EXPORT_SYMBOL(aio_put_req);
 
@@ -588,18 +557,12 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx, *ret = NULL;
-	struct hlist_node *n;
 
 	rcu_read_lock();
 
-	hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
-		/*
-		 * RCU protects us against accessing freed memory but
-		 * we have to be careful not to get a reference when the
-		 * reference count already dropped to 0 (ctx->dead test
-		 * is unreliable because of races).
-		 */
-		if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
+	hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
+		if (ctx->user_id == ctx_id) {
+			atomic_inc(&ctx->users);
 			ret = ctx;
 			break;
 		}
@@ -609,295 +572,16 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	return ret;
 }
 
-/*
- * Queue up a kiocb to be retried. Assumes that the kiocb
- * has already been marked as kicked, and places it on
- * the retry run list for the corresponding ioctx, if it
- * isn't already queued. Returns 1 if it actually queued
- * the kiocb (to tell the caller to activate the work
- * queue to process it), or 0, if it found that it was
- * already queued.
- */
-static inline int __queue_kicked_iocb(struct kiocb *iocb)
-{
-	struct kioctx *ctx = iocb->ki_ctx;
-
-	assert_spin_locked(&ctx->ctx_lock);
-
-	if (list_empty(&iocb->ki_run_list)) {
-		list_add_tail(&iocb->ki_run_list,
-			&ctx->run_list);
-		return 1;
-	}
-	return 0;
-}
-
-/* aio_run_iocb
- *	This is the core aio execution routine. It is
- *	invoked both for initial i/o submission and
- *	subsequent retries via the aio_kick_handler.
- *	Expects to be invoked with iocb->ki_ctx->lock
- *	already held. The lock is released and reacquired
- *	as needed during processing.
- *
- * Calls the iocb retry method (already setup for the
- * iocb on initial submission) for operation specific
- * handling, but takes care of most of common retry
- * execution details for a given iocb. The retry method
- * needs to be non-blocking as far as possible, to avoid
- * holding up other iocbs waiting to be serviced by the
- * retry kernel thread.
- *
- * The trickier parts in this code have to do with
- * ensuring that only one retry instance is in progress
- * for a given iocb at any time. Providing that guarantee
- * simplifies the coding of individual aio operations as
- * it avoids various potential races.
- */
-static ssize_t aio_run_iocb(struct kiocb *iocb)
-{
-	struct kioctx	*ctx = iocb->ki_ctx;
-	ssize_t (*retry)(struct kiocb *);
-	ssize_t ret;
-
-	if (!(retry = iocb->ki_retry)) {
-		printk("aio_run_iocb: iocb->ki_retry = NULL\n");
-		return 0;
-	}
-
-	/*
-	 * We don't want the next retry iteration for this
-	 * operation to start until this one has returned and
-	 * updated the iocb state. However, wait_queue functions
-	 * can trigger a kick_iocb from interrupt context in the
-	 * meantime, indicating that data is available for the next
-	 * iteration. We want to remember that and enable the
-	 * next retry iteration _after_ we are through with
-	 * this one.
-	 *
-	 * So, in order to be able to register a "kick", but
-	 * prevent it from being queued now, we clear the kick
-	 * flag, but make the kick code *think* that the iocb is
-	 * still on the run list until we are actually done.
-	 * When we are done with this iteration, we check if
-	 * the iocb was kicked in the meantime and if so, queue
-	 * it up afresh.
-	 */
-
-	kiocbClearKicked(iocb);
-
-	/*
-	 * This is so that aio_complete knows it doesn't need to
-	 * pull the iocb off the run list (We can't just call
-	 * INIT_LIST_HEAD because we don't want a kick_iocb to
-	 * queue this on the run list yet)
-	 */
-	iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL;
-	spin_unlock_irq(&ctx->ctx_lock);
-
-	/* Quit retrying if the i/o has been cancelled */
-	if (kiocbIsCancelled(iocb)) {
-		ret = -EINTR;
-		aio_complete(iocb, ret, 0);
-		/* must not access the iocb after this */
-		goto out;
-	}
-
-	/*
-	 * Now we are all set to call the retry method in async
-	 * context.
-	 */
-	ret = retry(iocb);
-
-	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
-		/*
-		 * There's no easy way to restart the syscall since other AIO's
-		 * may be already running. Just fail this IO with EINTR.
-		 */
-		if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
-			     ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
-			ret = -EINTR;
-		aio_complete(iocb, ret, 0);
-	}
-out:
-	spin_lock_irq(&ctx->ctx_lock);
-
-	if (-EIOCBRETRY == ret) {
-		/*
-		 * OK, now that we are done with this iteration
-		 * and know that there is more left to go,
-		 * this is where we let go so that a subsequent
-		 * "kick" can start the next iteration
-		 */
-
-		/* will make __queue_kicked_iocb succeed from here on */
-		INIT_LIST_HEAD(&iocb->ki_run_list);
-		/* we must queue the next iteration ourselves, if it
-		 * has already been kicked */
-		if (kiocbIsKicked(iocb)) {
-			__queue_kicked_iocb(iocb);
-
-			/*
-			 * __queue_kicked_iocb will always return 1 here, because
-			 * iocb->ki_run_list is empty at this point so it should
-			 * be safe to unconditionally queue the context into the
-			 * work queue.
-			 */
-			aio_queue_work(ctx);
-		}
-	}
-	return ret;
-}
-
-/*
- * __aio_run_iocbs:
- * 	Process all pending retries queued on the ioctx
- * 	run list.
- * Assumes it is operating within the aio issuer's mm
- * context.
- */
-static int __aio_run_iocbs(struct kioctx *ctx)
-{
-	struct kiocb *iocb;
-	struct list_head run_list;
-
-	assert_spin_locked(&ctx->ctx_lock);
-
-	list_replace_init(&ctx->run_list, &run_list);
-	while (!list_empty(&run_list)) {
-		iocb = list_entry(run_list.next, struct kiocb,
-			ki_run_list);
-		list_del(&iocb->ki_run_list);
-		/*
-		 * Hold an extra reference while retrying i/o.
-		 */
-		iocb->ki_users++;       /* grab extra reference */
-		aio_run_iocb(iocb);
-		__aio_put_req(ctx, iocb);
- 	}
-	if (!list_empty(&ctx->run_list))
-		return 1;
-	return 0;
-}
-
-static void aio_queue_work(struct kioctx * ctx)
-{
-	unsigned long timeout;
-	/*
-	 * if someone is waiting, get the work started right
-	 * away, otherwise, use a longer delay
-	 */
-	smp_mb();
-	if (waitqueue_active(&ctx->wait))
-		timeout = 1;
-	else
-		timeout = HZ/10;
-	queue_delayed_work(aio_wq, &ctx->wq, timeout);
-}
-
-/*
- * aio_run_all_iocbs:
- *	Process all pending retries queued on the ioctx
- *	run list, and keep running them until the list
- *	stays empty.
- * Assumes it is operating within the aio issuer's mm context.
- */
-static inline void aio_run_all_iocbs(struct kioctx *ctx)
-{
-	spin_lock_irq(&ctx->ctx_lock);
-	while (__aio_run_iocbs(ctx))
-		;
-	spin_unlock_irq(&ctx->ctx_lock);
-}
-
-/*
- * aio_kick_handler:
- * 	Work queue handler triggered to process pending
- * 	retries on an ioctx. Takes on the aio issuer's
- *	mm context before running the iocbs, so that
- *	copy_xxx_user operates on the issuer's address
- *      space.
- * Run on aiod's context.
- */
-static void aio_kick_handler(struct work_struct *work)
-{
-	struct kioctx *ctx = container_of(work, struct kioctx, wq.work);
-	mm_segment_t oldfs = get_fs();
-	struct mm_struct *mm;
-	int requeue;
-
-	set_fs(USER_DS);
-	use_mm(ctx->mm);
-	spin_lock_irq(&ctx->ctx_lock);
-	requeue =__aio_run_iocbs(ctx);
-	mm = ctx->mm;
-	spin_unlock_irq(&ctx->ctx_lock);
- 	unuse_mm(mm);
-	set_fs(oldfs);
-	/*
-	 * we're in a worker thread already; no point using non-zero delay
-	 */
-	if (requeue)
-		queue_delayed_work(aio_wq, &ctx->wq, 0);
-}
-
-
-/*
- * Called by kick_iocb to queue the kiocb for retry
- * and if required activate the aio work queue to process
- * it
- */
-static void try_queue_kicked_iocb(struct kiocb *iocb)
-{
- 	struct kioctx	*ctx = iocb->ki_ctx;
-	unsigned long flags;
-	int run = 0;
-
-	spin_lock_irqsave(&ctx->ctx_lock, flags);
-	/* set this inside the lock so that we can't race with aio_run_iocb()
-	 * testing it and putting the iocb on the run list under the lock */
-	if (!kiocbTryKick(iocb))
-		run = __queue_kicked_iocb(iocb);
-	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
-	if (run)
-		aio_queue_work(ctx);
-}
-
-/*
- * kick_iocb:
- *      Called typically from a wait queue callback context
- *      to trigger a retry of the iocb.
- *      The retry is usually executed by aio workqueue
- *      threads (See aio_kick_handler).
- */
-void kick_iocb(struct kiocb *iocb)
-{
-	/* sync iocbs are easy: they can only ever be executing from a 
-	 * single context. */
-	if (is_sync_kiocb(iocb)) {
-		kiocbSetKicked(iocb);
-	        wake_up_process(iocb->ki_obj.tsk);
-		return;
-	}
-
-	try_queue_kicked_iocb(iocb);
-}
-EXPORT_SYMBOL(kick_iocb);
-
 /* aio_complete
  *	Called when the io request on the given iocb is complete.
- *	Returns true if this is the last user of the request.  The 
- *	only other user of the request can be the cancellation code.
  */
-int aio_complete(struct kiocb *iocb, long res, long res2)
+void aio_complete(struct kiocb *iocb, long res, long res2)
 {
 	struct kioctx	*ctx = iocb->ki_ctx;
-	struct aio_ring_info	*info;
 	struct aio_ring	*ring;
-	struct io_event	*event;
+	struct io_event	*ev_page, *event;
 	unsigned long	flags;
-	unsigned long	tail;
-	int		ret;
+	unsigned tail, pos;
 
 	/*
 	 * Special case handling for sync iocbs:
@@ -907,61 +591,81 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
 	 *  - the sync task helpfully left a reference to itself in the iocb
 	 */
 	if (is_sync_kiocb(iocb)) {
-		BUG_ON(iocb->ki_users != 1);
+		BUG_ON(atomic_read(&iocb->ki_users) != 1);
 		iocb->ki_user_data = res;
-		iocb->ki_users = 0;
+		atomic_set(&iocb->ki_users, 0);
 		wake_up_process(iocb->ki_obj.tsk);
-		return 1;
+		return;
 	}
 
-	info = &ctx->ring_info;
-
-	/* add a completion event to the ring buffer.
-	 * must be done holding ctx->ctx_lock to prevent
-	 * other code from messing with the tail
-	 * pointer since we might be called from irq
-	 * context.
+	/*
+	 * Take rcu_read_lock() in case the kioctx is being destroyed, as we
+	 * need to issue a wakeup after decrementing reqs_active.
 	 */
-	spin_lock_irqsave(&ctx->ctx_lock, flags);
+	rcu_read_lock();
 
-	if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
-		list_del_init(&iocb->ki_run_list);
+	if (iocb->ki_list.next) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&ctx->ctx_lock, flags);
+		list_del(&iocb->ki_list);
+		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+	}
 
 	/*
 	 * cancelled requests don't get events, userland was given one
 	 * when the event got cancelled.
 	 */
-	if (kiocbIsCancelled(iocb))
+	if (unlikely(xchg(&iocb->ki_cancel,
+			  KIOCB_CANCELLED) == KIOCB_CANCELLED)) {
+		atomic_dec(&ctx->reqs_active);
+		/* Still need the wake_up in case free_ioctx is waiting */
 		goto put_rq;
+	}
 
-	ring = kmap_atomic(info->ring_pages[0]);
+	/*
+	 * Add a completion event to the ring buffer. Must be done holding
+	 * ctx->completion_lock to prevent other code from messing with the tail
+	 * pointer since we might be called from irq context.
+	 */
+	spin_lock_irqsave(&ctx->completion_lock, flags);
 
-	tail = info->tail;
-	event = aio_ring_event(info, tail);
-	if (++tail >= info->nr)
+	tail = ctx->tail;
+	pos = tail + AIO_EVENTS_OFFSET;
+
+	if (++tail >= ctx->nr_events)
 		tail = 0;
 
+	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+	event = ev_page + pos % AIO_EVENTS_PER_PAGE;
+
 	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
 	event->data = iocb->ki_user_data;
 	event->res = res;
 	event->res2 = res2;
 
-	dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
-		ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
-		res, res2);
+	kunmap_atomic(ev_page);
+	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
+
+	pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
+		 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+		 res, res2);
 
 	/* after flagging the request as done, we
 	 * must never even look at it again
 	 */
 	smp_wmb();	/* make event visible before updating tail */
 
-	info->tail = tail;
-	ring->tail = tail;
+	ctx->tail = tail;
 
-	put_aio_ring_event(event);
+	ring = kmap_atomic(ctx->ring_pages[0]);
+	ring->tail = tail;
 	kunmap_atomic(ring);
+	flush_dcache_page(ctx->ring_pages[0]);
+
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+	pr_debug("added to ring %p at [%u]\n", iocb, tail);
 
 	/*
 	 * Check if the user asked us to deliver the result through an
@@ -973,7 +677,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
 
 put_rq:
 	/* everything turned out well, dispose of the aiocb. */
-	ret = __aio_put_req(ctx, iocb);
+	aio_put_req(iocb);
 
 	/*
 	 * We have to order our ring_info tail store above and test
@@ -986,233 +690,133 @@ put_rq:
 	if (waitqueue_active(&ctx->wait))
 		wake_up(&ctx->wait);
 
-	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
-	return ret;
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(aio_complete);
 
-/* aio_read_evt
- *	Pull an event off of the ioctx's event ring.  Returns the number of 
- *	events fetched (0 or 1 ;-)
- *	FIXME: make this use cmpxchg.
- *	TODO: make the ringbuffer user mmap()able (requires FIXME).
+/* aio_read_events
+ *	Pull an event off of the ioctx's event ring.  Returns the number of
+ *	events fetched
  */
-static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+static long aio_read_events_ring(struct kioctx *ctx,
+				 struct io_event __user *event, long nr)
 {
-	struct aio_ring_info *info = &ioctx->ring_info;
 	struct aio_ring *ring;
-	unsigned long head;
-	int ret = 0;
-
-	ring = kmap_atomic(info->ring_pages[0]);
-	dprintk("in aio_read_evt h%lu t%lu m%lu\n",
-		 (unsigned long)ring->head, (unsigned long)ring->tail,
-		 (unsigned long)ring->nr);
-
-	if (ring->head == ring->tail)
-		goto out;
+	unsigned head, pos;
+	long ret = 0;
+	int copy_ret;
 
-	spin_lock(&info->ring_lock);
-
-	head = ring->head % info->nr;
-	if (head != ring->tail) {
-		struct io_event *evp = aio_ring_event(info, head);
-		*ent = *evp;
-		head = (head + 1) % info->nr;
-		smp_mb(); /* finish reading the event before updatng the head */
-		ring->head = head;
-		ret = 1;
-		put_aio_ring_event(evp);
-	}
-	spin_unlock(&info->ring_lock);
+	mutex_lock(&ctx->ring_lock);
 
-out:
+	ring = kmap_atomic(ctx->ring_pages[0]);
+	head = ring->head;
 	kunmap_atomic(ring);
-	dprintk("leaving aio_read_evt: %d  h%lu t%lu\n", ret,
-		 (unsigned long)ring->head, (unsigned long)ring->tail);
-	return ret;
-}
 
-struct aio_timeout {
-	struct timer_list	timer;
-	int			timed_out;
-	struct task_struct	*p;
-};
+	pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events);
 
-static void timeout_func(unsigned long data)
-{
-	struct aio_timeout *to = (struct aio_timeout *)data;
+	if (head == ctx->tail)
+		goto out;
 
-	to->timed_out = 1;
-	wake_up_process(to->p);
-}
+	while (ret < nr) {
+		long avail;
+		struct io_event *ev;
+		struct page *page;
 
-static inline void init_timeout(struct aio_timeout *to)
-{
-	setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to);
-	to->timed_out = 0;
-	to->p = current;
-}
-
-static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
-			       const struct timespec *ts)
-{
-	to->timer.expires = start_jiffies + timespec_to_jiffies(ts);
-	if (time_after(to->timer.expires, jiffies))
-		add_timer(&to->timer);
-	else
-		to->timed_out = 1;
-}
+		avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
+		if (head == ctx->tail)
+			break;
 
-static inline void clear_timeout(struct aio_timeout *to)
-{
-	del_singleshot_timer_sync(&to->timer);
-}
+		avail = min(avail, nr - ret);
+		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
+			    ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
 
-static int read_events(struct kioctx *ctx,
-			long min_nr, long nr,
-			struct io_event __user *event,
-			struct timespec __user *timeout)
-{
-	long			start_jiffies = jiffies;
-	struct task_struct	*tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
-	int			ret;
-	int			i = 0;
-	struct io_event		ent;
-	struct aio_timeout	to;
-	int			retry = 0;
-
-	/* needed to zero any padding within an entry (there shouldn't be 
-	 * any, but C is fun!
-	 */
-	memset(&ent, 0, sizeof(ent));
-retry:
-	ret = 0;
-	while (likely(i < nr)) {
-		ret = aio_read_evt(ctx, &ent);
-		if (unlikely(ret <= 0))
-			break;
+		pos = head + AIO_EVENTS_OFFSET;
+		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
+		pos %= AIO_EVENTS_PER_PAGE;
 
-		dprintk("read event: %Lx %Lx %Lx %Lx\n",
-			ent.data, ent.obj, ent.res, ent.res2);
+		ev = kmap(page);
+		copy_ret = copy_to_user(event + ret, ev + pos,
+					sizeof(*ev) * avail);
+		kunmap(page);
 
-		/* Could we split the check in two? */
-		ret = -EFAULT;
-		if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
-			dprintk("aio: lost an event due to EFAULT.\n");
-			break;
+		if (unlikely(copy_ret)) {
+			ret = -EFAULT;
+			goto out;
 		}
-		ret = 0;
 
-		/* Good, event copied to userland, update counts. */
-		event ++;
-		i ++;
+		ret += avail;
+		head += avail;
+		head %= ctx->nr_events;
 	}
 
-	if (min_nr <= i)
-		return i;
-	if (ret)
-		return ret;
+	ring = kmap_atomic(ctx->ring_pages[0]);
+	ring->head = head;
+	kunmap_atomic(ring);
+	flush_dcache_page(ctx->ring_pages[0]);
 
-	/* End fast path */
+	pr_debug("%li  h%u t%u\n", ret, head, ctx->tail);
 
-	/* racey check, but it gets redone */
-	if (!retry && unlikely(!list_empty(&ctx->run_list))) {
-		retry = 1;
-		aio_run_all_iocbs(ctx);
-		goto retry;
-	}
+	atomic_sub(ret, &ctx->reqs_active);
+out:
+	mutex_unlock(&ctx->ring_lock);
 
-	init_timeout(&to);
-	if (timeout) {
-		struct timespec	ts;
-		ret = -EFAULT;
-		if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
-			goto out;
+	return ret;
+}
 
-		set_timeout(start_jiffies, &to, &ts);
-	}
+static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
+			    struct io_event __user *event, long *i)
+{
+	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
 
-	while (likely(i < nr)) {
-		add_wait_queue_exclusive(&ctx->wait, &wait);
-		do {
-			set_task_state(tsk, TASK_INTERRUPTIBLE);
-			ret = aio_read_evt(ctx, &ent);
-			if (ret)
-				break;
-			if (min_nr <= i)
-				break;
-			if (unlikely(ctx->dead)) {
-				ret = -EINVAL;
-				break;
-			}
-			if (to.timed_out)	/* Only check after read evt */
-				break;
-			/* Try to only show up in io wait if there are ops
-			 *  in flight */
-			if (ctx->reqs_active)
-				io_schedule();
-			else
-				schedule();
-			if (signal_pending(tsk)) {
-				ret = -EINTR;
-				break;
-			}
-			/*ret = aio_read_evt(ctx, &ent);*/
-		} while (1) ;
-
-		set_task_state(tsk, TASK_RUNNING);
-		remove_wait_queue(&ctx->wait, &wait);
-
-		if (unlikely(ret <= 0))
-			break;
+	if (ret > 0)
+		*i += ret;
 
-		ret = -EFAULT;
-		if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
-			dprintk("aio: lost an event due to EFAULT.\n");
-			break;
-		}
+	if (unlikely(atomic_read(&ctx->dead)))
+		ret = -EINVAL;
 
-		/* Good, event copied to userland, update counts. */
-		event ++;
-		i ++;
-	}
+	if (!*i)
+		*i = ret;
 
-	if (timeout)
-		clear_timeout(&to);
-out:
-	destroy_timer_on_stack(&to.timer);
-	return i ? i : ret;
+	return ret < 0 || *i >= min_nr;
 }
 
-/* Take an ioctx and remove it from the list of ioctx's.  Protects 
- * against races with itself via ->dead.
- */
-static void io_destroy(struct kioctx *ioctx)
+static long read_events(struct kioctx *ctx, long min_nr, long nr,
+			struct io_event __user *event,
+			struct timespec __user *timeout)
 {
-	struct mm_struct *mm = current->mm;
-	int was_dead;
+	ktime_t until = { .tv64 = KTIME_MAX };
+	long ret = 0;
 
-	/* delete the entry from the list is someone else hasn't already */
-	spin_lock(&mm->ioctx_lock);
-	was_dead = ioctx->dead;
-	ioctx->dead = 1;
-	hlist_del_rcu(&ioctx->list);
-	spin_unlock(&mm->ioctx_lock);
+	if (timeout) {
+		struct timespec	ts;
 
-	dprintk("aio_release(%p)\n", ioctx);
-	if (likely(!was_dead))
-		put_ioctx(ioctx);	/* twice for the list */
+		if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
+			return -EFAULT;
 
-	kill_ctx(ioctx);
+		until = timespec_to_ktime(ts);
+	}
 
 	/*
-	 * Wake up any waiters.  The setting of ctx->dead must be seen
-	 * by other CPUs at this point.  Right now, we rely on the
-	 * locking done by the above calls to ensure this consistency.
+	 * Note that aio_read_events() is being called as the conditional - i.e.
+	 * we're calling it after prepare_to_wait() has set task state to
+	 * TASK_INTERRUPTIBLE.
+	 *
+	 * But aio_read_events() can block, and if it blocks it's going to flip
+	 * the task state back to TASK_RUNNING.
+	 *
+	 * This should be ok, provided it doesn't flip the state back to
+	 * TASK_RUNNING and return 0 too much - that causes us to spin. That
+	 * will only happen if the mutex_lock() call blocks, and we then find
+	 * the ringbuffer empty. So in practice we should be ok, but it's
+	 * something to be aware of when touching this code.
 	 */
-	wake_up_all(&ioctx->wait);
+	wait_event_interruptible_hrtimeout(ctx->wait,
+			aio_read_events(ctx, min_nr, nr, event, &ret), until);
+
+	if (!ret && signal_pending(current))
+		ret = -EINTR;
+
+	return ret;
 }
 
 /* sys_io_setup:
@@ -1250,7 +854,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 	if (!IS_ERR(ioctx)) {
 		ret = put_user(ioctx->user_id, ctxp);
 		if (ret)
-			io_destroy(ioctx);
+			kill_ioctx(ioctx);
 		put_ioctx(ioctx);
 	}
 
@@ -1268,7 +872,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx);
 	if (likely(NULL != ioctx)) {
-		io_destroy(ioctx);
+		kill_ioctx(ioctx);
 		put_ioctx(ioctx);
 		return 0;
 	}
@@ -1299,29 +903,22 @@ static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
 	BUG_ON(ret > 0 && iocb->ki_left == 0);
 }
 
-static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
+typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
+			    unsigned long, loff_t);
+
+static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
-	ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
-			 unsigned long, loff_t);
 	ssize_t ret = 0;
-	unsigned short opcode;
-
-	if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
-		(iocb->ki_opcode == IOCB_CMD_PREAD)) {
-		rw_op = file->f_op->aio_read;
-		opcode = IOCB_CMD_PREADV;
-	} else {
-		rw_op = file->f_op->aio_write;
-		opcode = IOCB_CMD_PWRITEV;
-	}
 
 	/* This matches the pread()/pwrite() logic */
 	if (iocb->ki_pos < 0)
 		return -EINVAL;
 
+	if (rw == WRITE)
+		file_start_write(file);
 	do {
 		ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
 			    iocb->ki_nr_segs - iocb->ki_cur_seg,
@@ -1332,8 +929,10 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
 	/* retry all partial writes.  retry partial reads as long as its a
 	 * regular file. */
 	} while (ret > 0 && iocb->ki_left > 0 &&
-		 (opcode == IOCB_CMD_PWRITEV ||
+		 (rw == WRITE ||
 		  (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
+	if (rw == WRITE)
+		file_end_write(file);
 
 	/* This means we must have transferred all that we could */
 	/* No need to retry anymore */
@@ -1342,81 +941,49 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
 
 	/* If we managed to write some out we return that, rather than
 	 * the eventual error. */
-	if (opcode == IOCB_CMD_PWRITEV
-	    && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY
+	if (rw == WRITE
+	    && ret < 0 && ret != -EIOCBQUEUED
 	    && iocb->ki_nbytes - iocb->ki_left)
 		ret = iocb->ki_nbytes - iocb->ki_left;
 
 	return ret;
 }
 
-static ssize_t aio_fdsync(struct kiocb *iocb)
-{
-	struct file *file = iocb->ki_filp;
-	ssize_t ret = -EINVAL;
-
-	if (file->f_op->aio_fsync)
-		ret = file->f_op->aio_fsync(iocb, 1);
-	return ret;
-}
-
-static ssize_t aio_fsync(struct kiocb *iocb)
-{
-	struct file *file = iocb->ki_filp;
-	ssize_t ret = -EINVAL;
-
-	if (file->f_op->aio_fsync)
-		ret = file->f_op->aio_fsync(iocb, 0);
-	return ret;
-}
-
-static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
+static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat)
 {
 	ssize_t ret;
 
+	kiocb->ki_nr_segs = kiocb->ki_nbytes;
+
 #ifdef CONFIG_COMPAT
 	if (compat)
-		ret = compat_rw_copy_check_uvector(type,
+		ret = compat_rw_copy_check_uvector(rw,
 				(struct compat_iovec __user *)kiocb->ki_buf,
-				kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+				kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
 				&kiocb->ki_iovec);
 	else
 #endif
-		ret = rw_copy_check_uvector(type,
+		ret = rw_copy_check_uvector(rw,
 				(struct iovec __user *)kiocb->ki_buf,
-				kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+				kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
 				&kiocb->ki_iovec);
 	if (ret < 0)
-		goto out;
-
-	ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret);
-	if (ret < 0)
-		goto out;
+		return ret;
 
-	kiocb->ki_nr_segs = kiocb->ki_nbytes;
-	kiocb->ki_cur_seg = 0;
-	/* ki_nbytes/left now reflect bytes instead of segs */
+	/* ki_nbytes now reflect bytes instead of segs */
 	kiocb->ki_nbytes = ret;
-	kiocb->ki_left = ret;
-
-	ret = 0;
-out:
-	return ret;
+	return 0;
 }
 
-static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb)
+static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
 {
-	int bytes;
-
-	bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left);
-	if (bytes < 0)
-		return bytes;
+	if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes)))
+		return -EFAULT;
 
 	kiocb->ki_iovec = &kiocb->ki_inline_vec;
 	kiocb->ki_iovec->iov_base = kiocb->ki_buf;
-	kiocb->ki_iovec->iov_len = bytes;
+	kiocb->ki_iovec->iov_len = kiocb->ki_nbytes;
 	kiocb->ki_nr_segs = 1;
-	kiocb->ki_cur_seg = 0;
 	return 0;
 }
 
@@ -1425,96 +992,95 @@ static ssize_t aio_setup_single_vector(int type, struct file * file, struct kioc
  *	Performs the initial checks and aio retry method
  *	setup for the kiocb at the time of io submission.
  */
-static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
+static ssize_t aio_run_iocb(struct kiocb *req, bool compat)
 {
-	struct file *file = kiocb->ki_filp;
-	ssize_t ret = 0;
+	struct file *file = req->ki_filp;
+	ssize_t ret;
+	int rw;
+	fmode_t mode;
+	aio_rw_op *rw_op;
 
-	switch (kiocb->ki_opcode) {
+	switch (req->ki_opcode) {
 	case IOCB_CMD_PREAD:
-		ret = -EBADF;
-		if (unlikely(!(file->f_mode & FMODE_READ)))
-			break;
-		ret = -EFAULT;
-		if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf,
-			kiocb->ki_left)))
-			break;
-		ret = aio_setup_single_vector(READ, file, kiocb);
-		if (ret)
-			break;
-		ret = -EINVAL;
-		if (file->f_op->aio_read)
-			kiocb->ki_retry = aio_rw_vect_retry;
-		break;
-	case IOCB_CMD_PWRITE:
-		ret = -EBADF;
-		if (unlikely(!(file->f_mode & FMODE_WRITE)))
-			break;
-		ret = -EFAULT;
-		if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf,
-			kiocb->ki_left)))
-			break;
-		ret = aio_setup_single_vector(WRITE, file, kiocb);
-		if (ret)
-			break;
-		ret = -EINVAL;
-		if (file->f_op->aio_write)
-			kiocb->ki_retry = aio_rw_vect_retry;
-		break;
 	case IOCB_CMD_PREADV:
-		ret = -EBADF;
-		if (unlikely(!(file->f_mode & FMODE_READ)))
-			break;
-		ret = aio_setup_vectored_rw(READ, kiocb, compat);
-		if (ret)
-			break;
-		ret = -EINVAL;
-		if (file->f_op->aio_read)
-			kiocb->ki_retry = aio_rw_vect_retry;
-		break;
+		mode	= FMODE_READ;
+		rw	= READ;
+		rw_op	= file->f_op->aio_read;
+		goto rw_common;
+
+	case IOCB_CMD_PWRITE:
 	case IOCB_CMD_PWRITEV:
-		ret = -EBADF;
-		if (unlikely(!(file->f_mode & FMODE_WRITE)))
-			break;
-		ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
+		mode	= FMODE_WRITE;
+		rw	= WRITE;
+		rw_op	= file->f_op->aio_write;
+		goto rw_common;
+rw_common:
+		if (unlikely(!(file->f_mode & mode)))
+			return -EBADF;
+
+		if (!rw_op)
+			return -EINVAL;
+
+		ret = (req->ki_opcode == IOCB_CMD_PREADV ||
+		       req->ki_opcode == IOCB_CMD_PWRITEV)
+			? aio_setup_vectored_rw(rw, req, compat)
+			: aio_setup_single_vector(rw, req);
 		if (ret)
-			break;
-		ret = -EINVAL;
-		if (file->f_op->aio_write)
-			kiocb->ki_retry = aio_rw_vect_retry;
+			return ret;
+
+		ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+		if (ret < 0)
+			return ret;
+
+		req->ki_nbytes = ret;
+		req->ki_left = ret;
+
+		ret = aio_rw_vect_retry(req, rw, rw_op);
 		break;
+
 	case IOCB_CMD_FDSYNC:
-		ret = -EINVAL;
-		if (file->f_op->aio_fsync)
-			kiocb->ki_retry = aio_fdsync;
+		if (!file->f_op->aio_fsync)
+			return -EINVAL;
+
+		ret = file->f_op->aio_fsync(req, 1);
 		break;
+
 	case IOCB_CMD_FSYNC:
-		ret = -EINVAL;
-		if (file->f_op->aio_fsync)
-			kiocb->ki_retry = aio_fsync;
+		if (!file->f_op->aio_fsync)
+			return -EINVAL;
+
+		ret = file->f_op->aio_fsync(req, 0);
 		break;
+
 	default:
-		dprintk("EINVAL: io_submit: no operation provided\n");
-		ret = -EINVAL;
+		pr_debug("EINVAL: no operation provided\n");
+		return -EINVAL;
 	}
 
-	if (!kiocb->ki_retry)
-		return ret;
+	if (ret != -EIOCBQUEUED) {
+		/*
+		 * There's no easy way to restart the syscall since other AIO's
+		 * may be already running. Just fail this IO with EINTR.
+		 */
+		if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+			     ret == -ERESTARTNOHAND ||
+			     ret == -ERESTART_RESTARTBLOCK))
+			ret = -EINTR;
+		aio_complete(req, ret, 0);
+	}
 
 	return 0;
 }
 
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb, struct kiocb_batch *batch,
-			 bool compat)
+			 struct iocb *iocb, bool compat)
 {
 	struct kiocb *req;
-	struct file *file;
 	ssize_t ret;
 
 	/* enforce forwards compatibility on users */
 	if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
-		pr_debug("EINVAL: io_submit: reserve field set\n");
+		pr_debug("EINVAL: reserve field set\n");
 		return -EINVAL;
 	}
 
@@ -1528,16 +1094,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		return -EINVAL;
 	}
 
-	file = fget(iocb->aio_fildes);
-	if (unlikely(!file))
-		return -EBADF;
-
-	req = aio_get_req(ctx, batch);  /* returns with 2 references to req */
-	if (unlikely(!req)) {
-		fput(file);
+	req = aio_get_req(ctx);
+	if (unlikely(!req))
 		return -EAGAIN;
+
+	req->ki_filp = fget(iocb->aio_fildes);
+	if (unlikely(!req->ki_filp)) {
+		ret = -EBADF;
+		goto out_put_req;
 	}
-	req->ki_filp = file;
+
 	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
 		/*
 		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
@@ -1553,9 +1119,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		}
 	}
 
-	ret = put_user(req->ki_key, &user_iocb->aio_key);
+	ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
 	if (unlikely(ret)) {
-		dprintk("EFAULT: aio_key\n");
+		pr_debug("EFAULT: aio_key\n");
 		goto out_put_req;
 	}
 
@@ -1567,41 +1133,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
 	req->ki_opcode = iocb->aio_lio_opcode;
 
-	ret = aio_setup_iocb(req, compat);
-
+	ret = aio_run_iocb(req, compat);
 	if (ret)
 		goto out_put_req;
 
-	spin_lock_irq(&ctx->ctx_lock);
-	/*
-	 * We could have raced with io_destroy() and are currently holding a
-	 * reference to ctx which should be destroyed. We cannot submit IO
-	 * since ctx gets freed as soon as io_submit() puts its reference.  The
-	 * check here is reliable: io_destroy() sets ctx->dead before waiting
-	 * for outstanding IO and the barrier between these two is realized by
-	 * unlock of mm->ioctx_lock and lock of ctx->ctx_lock.  Analogously we
-	 * increment ctx->reqs_active before checking for ctx->dead and the
-	 * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
-	 * don't see ctx->dead set here, io_destroy() waits for our IO to
-	 * finish.
-	 */
-	if (ctx->dead) {
-		spin_unlock_irq(&ctx->ctx_lock);
-		ret = -EINVAL;
-		goto out_put_req;
-	}
-	aio_run_iocb(req);
-	if (!list_empty(&ctx->run_list)) {
-		/* drain the run list */
-		while (__aio_run_iocbs(ctx))
-			;
-	}
-	spin_unlock_irq(&ctx->ctx_lock);
-
 	aio_put_req(req);	/* drop extra ref to req */
 	return 0;
-
 out_put_req:
+	atomic_dec(&ctx->reqs_active);
 	aio_put_req(req);	/* drop extra ref to req */
 	aio_put_req(req);	/* drop i/o ref to req */
 	return ret;
@@ -1614,7 +1153,6 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 	long ret = 0;
 	int i = 0;
 	struct blk_plug plug;
-	struct kiocb_batch batch;
 
 	if (unlikely(nr < 0))
 		return -EINVAL;
@@ -1627,12 +1165,10 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 
 	ctx = lookup_ioctx(ctx_id);
 	if (unlikely(!ctx)) {
-		pr_debug("EINVAL: io_submit: invalid context id\n");
+		pr_debug("EINVAL: invalid context id\n");
 		return -EINVAL;
 	}
 
-	kiocb_batch_init(&batch, nr);
-
 	blk_start_plug(&plug);
 
 	/*
@@ -1653,13 +1189,12 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 			break;
 		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat);
+		ret = io_submit_one(ctx, user_iocb, &tmp, compat);
 		if (ret)
 			break;
 	}
 	blk_finish_plug(&plug);
 
-	kiocb_batch_free(ctx, &batch);
 	put_ioctx(ctx);
 	return i ? i : ret;
 }
@@ -1692,10 +1227,13 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
 
 	assert_spin_locked(&ctx->ctx_lock);
 
+	if (key != KIOCB_KEY)
+		return NULL;
+
 	/* TODO: use a hash or array, this sucks. */
 	list_for_each(pos, &ctx->active_reqs) {
 		struct kiocb *kiocb = list_kiocb(pos);
-		if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key)
+		if (kiocb->ki_obj.user == iocb)
 			return kiocb;
 	}
 	return NULL;
@@ -1714,7 +1252,7 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
 SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 		struct io_event __user *, result)
 {
-	int (*cancel)(struct kiocb *iocb, struct io_event *res);
+	struct io_event res;
 	struct kioctx *ctx;
 	struct kiocb *kiocb;
 	u32 key;
@@ -1729,32 +1267,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 		return -EINVAL;
 
 	spin_lock_irq(&ctx->ctx_lock);
-	ret = -EAGAIN;
+
 	kiocb = lookup_kiocb(ctx, iocb, key);
-	if (kiocb && kiocb->ki_cancel) {
-		cancel = kiocb->ki_cancel;
-		kiocb->ki_users ++;
-		kiocbSetCancelled(kiocb);
-	} else
-		cancel = NULL;
+	if (kiocb)
+		ret = kiocb_cancel(ctx, kiocb, &res);
+	else
+		ret = -EINVAL;
+
 	spin_unlock_irq(&ctx->ctx_lock);
 
-	if (NULL != cancel) {
-		struct io_event tmp;
-		pr_debug("calling cancel\n");
-		memset(&tmp, 0, sizeof(tmp));
-		tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user;
-		tmp.data = kiocb->ki_user_data;
-		ret = cancel(kiocb, &tmp);
-		if (!ret) {
-			/* Cancellation succeeded -- copy the result
-			 * into the user's buffer.
-			 */
-			if (copy_to_user(result, &tmp, sizeof(tmp)))
-				ret = -EFAULT;
-		}
-	} else
-		ret = -EINVAL;
+	if (!ret) {
+		/* Cancellation succeeded -- copy the result
+		 * into the user's buffer.
+		 */
+		if (copy_to_user(result, &res, sizeof(res)))
+			ret = -EFAULT;
+	}
 
 	put_ioctx(ctx);
 
@@ -1771,8 +1299,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
  *	< min_nr if the timeout specified by timeout has elapsed
  *	before sufficient events are available, where timeout == NULL
  *	specifies an infinite timeout. Note that the timeout pointed to by
- *	timeout is relative and will be updated if not NULL and the
- *	operation blocks. Will fail with -ENOSYS if not implemented.
+ *	timeout is relative.  Will fail with -ENOSYS if not implemented.
  */
 SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 		long, min_nr,
@@ -1788,7 +1315,5 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 			ret = read_events(ioctx, min_nr, nr, events, timeout);
 		put_ioctx(ioctx);
 	}
-
-	asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout);
 	return ret;
 }
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 28d39fb84ae..47a65df8c87 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -131,7 +131,6 @@ struct file *anon_inode_getfile(const char *name,
 	struct qstr this;
 	struct path path;
 	struct file *file;
-	int error;
 
 	if (IS_ERR(anon_inode_inode))
 		return ERR_PTR(-ENODEV);
@@ -143,7 +142,7 @@ struct file *anon_inode_getfile(const char *name,
 	 * Link the inode to a directory entry by creating a unique name
 	 * using the inode sequence number.
 	 */
-	error = -ENOMEM;
+	file = ERR_PTR(-ENOMEM);
 	this.name = name;
 	this.len = strlen(name);
 	this.hash = 0;
@@ -160,15 +159,12 @@ struct file *anon_inode_getfile(const char *name,
 
 	d_instantiate(path.dentry, anon_inode_inode);
 
-	error = -ENFILE;
 	file = alloc_file(&path, OPEN_FMODE(flags), fops);
-	if (!file)
+	if (IS_ERR(file))
 		goto err_dput;
 	file->f_mapping = anon_inode_inode->i_mapping;
 
-	file->f_pos = 0;
 	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
-	file->f_version = 0;
 	file->private_data = priv;
 
 	return file;
@@ -177,7 +173,7 @@ err_dput:
 	path_put(&path);
 err_module:
 	module_put(fops->owner);
-	return ERR_PTR(error);
+	return file;
 }
 EXPORT_SYMBOL_GPL(anon_inode_getfile);
 
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index b785e770795..3f1128b37e4 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -273,7 +273,7 @@ static inline int autofs_prepare_pipe(struct file *pipe)
 {
 	if (!pipe->f_op || !pipe->f_op->write)
 		return -EINVAL;
-	if (!S_ISFIFO(pipe->f_dentry->d_inode->i_mode))
+	if (!S_ISFIFO(file_inode(pipe)->i_mode))
 		return -EINVAL;
 	/* We want a packet pipe */
 	pipe->f_flags |= O_DIRECT;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 9f68a37bb2b..743c7c2c949 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -159,7 +159,7 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f)
 	struct inode *inode;
 
 	if (f) {
-		inode = f->f_path.dentry->d_inode;
+		inode = file_inode(f);
 		sbi = autofs4_sbi(inode->i_sb);
 	}
 	return sbi;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 01443ce43ee..3d9d3f5d5dd 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -61,15 +61,6 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 		/* This is an autofs submount, we can't expire it */
 		if (autofs_type_indirect(sbi->type))
 			goto done;
-
-		/*
-		 * Otherwise it's an offset mount and we need to check
-		 * if we can umount its mount, if there is one.
-		 */
-		if (!d_mountpoint(path.dentry)) {
-			status = 0;
-			goto done;
-		}
 	}
 
 	/* Update the expiry counter if fs is busy */
@@ -118,7 +109,7 @@ cont:
 
 	spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
 	/* Already gone or negative dentry (under construction) - try next */
-	if (q->d_count == 0 || !simple_positive(q)) {
+	if (!d_count(q) || !simple_positive(q)) {
 		spin_unlock(&q->d_lock);
 		next = q->d_u.d_child.next;
 		goto cont;
@@ -276,7 +267,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 			else
 				ino_count++;
 
-			if (p->d_count > ino_count) {
+			if (d_count(p) > ino_count) {
 				top_ino->last_used = jiffies;
 				dput(p);
 				return 1;
@@ -418,7 +409,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		if (!exp_leaves) {
 			/* Path walk currently on this dentry? */
 			ino_count = atomic_read(&ino->count) + 1;
-			if (dentry->d_count > ino_count)
+			if (d_count(dentry) > ino_count)
 				goto next;
 
 			if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -432,7 +423,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		} else {
 			/* Path walk currently on this dentry? */
 			ino_count = atomic_read(&ino->count) + 1;
-			if (dentry->d_count > ino_count)
+			if (d_count(dentry) > ino_count)
 				goto next;
 
 			expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index cddc74b9cdb..b3db517e89e 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -26,6 +26,7 @@ static struct file_system_type autofs_fs_type = {
 	.mount		= autofs_mount,
 	.kill_sb	= autofs4_kill_sb,
 };
+MODULE_ALIAS_FS("autofs");
 
 static int __init init_autofs4_fs(void)
 {
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index c93447604da..92ef341ba0c 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -41,7 +41,7 @@ const struct file_operations autofs4_root_operations = {
 	.open		= dcache_dir_open,
 	.release	= dcache_dir_close,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
 	.unlocked_ioctl	= autofs4_root_ioctl,
 #ifdef CONFIG_COMPAT
@@ -53,7 +53,7 @@ const struct file_operations autofs4_dir_operations = {
 	.open		= autofs4_dir_open,
 	.release	= dcache_dir_close,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
 };
 
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 		spin_lock(&active->d_lock);
 
 		/* Already gone? */
-		if (active->d_count == 0)
+		if (!d_count(active))
 			goto next;
 
 		qstr = &active->d_name;
@@ -383,8 +383,10 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 				goto done;
 			}
 		} else {
-			if (!simple_empty(dentry))
+			if (!simple_empty(dentry)) {
+				spin_unlock(&sbi->fs_lock);
 				goto done;
+			}
 		}
 		ino->flags |= AUTOFS_INF_PENDING;
 		spin_unlock(&sbi->fs_lock);
@@ -406,7 +408,7 @@ done:
 	return NULL;
 }
 
-int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
+static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -587,7 +589,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	
 	/* This allows root to remove symlinks */
 	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
-		return -EACCES;
+		return -EPERM;
 
 	if (atomic_dec_and_test(&ino->count)) {
 		p_ino = autofs4_dentry_ino(dentry->d_parent);
@@ -874,7 +876,7 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 static long autofs4_root_ioctl(struct file *filp,
 			       unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
 }
 
@@ -882,7 +884,7 @@ static long autofs4_root_ioctl(struct file *filp,
 static long autofs4_root_compat_ioctl(struct file *filp,
 			     unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	int ret;
 
 	if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 03bc1d347d8..3db70dae40d 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -42,10 +42,8 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
 	while (wq) {
 		nwq = wq->next;
 		wq->status = -ENOENT; /* Magic is gone - report failure */
-		if (wq->name.name) {
-			kfree(wq->name.name);
-			wq->name.name = NULL;
-		}
+		kfree(wq->name.name);
+		wq->name.name = NULL;
 		wq->wait_ctr--;
 		wake_up_interruptible(&wq->queue);
 		wq = nwq;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 922ad460bff..7c93953030f 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -45,7 +45,7 @@ static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	return -EIO;
 }
 
-static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int bad_file_readdir(struct file *file, struct dir_context *ctx)
 {
 	return -EIO;
 }
@@ -152,7 +152,7 @@ static const struct file_operations bad_file_ops =
 	.write		= bad_file_write,
 	.aio_read	= bad_file_aio_read,
 	.aio_write	= bad_file_aio_write,
-	.readdir	= bad_file_readdir,
+	.iterate	= bad_file_readdir,
 	.poll		= bad_file_poll,
 	.unlocked_ioctl	= bad_file_unlocked_ioctl,
 	.compat_ioctl	= bad_file_compat_ioctl,
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
index 7835d30f211..edc5cc2aefa 100644
--- a/fs/befs/Kconfig
+++ b/fs/befs/Kconfig
@@ -1,6 +1,6 @@
 config BEFS_FS
-	tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
+	tristate "BeOS file system (BeFS) support (read only)"
+	depends on BLOCK
 	select NLS
 	help
 	  The BeOS File System (BeFS) is the native file system of Be, Inc's
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index a66c9b1136e..74e397db0b8 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -436,8 +436,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 		goto error;
 	}
 
-	if ((this_node = (befs_btree_node *)
-	     kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) {
+	if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) {
 		befs_error(sb, "befs_btree_read() failed to allocate %u "
 			   "bytes of memory", sizeof (befs_btree_node));
 		goto error;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 2b3bda8d5e6..e9c75e20db3 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -31,7 +31,7 @@ MODULE_LICENSE("GPL");
 /* The units the vfs expects inode->i_blocks to be in */
 #define VFS_BLOCK_SIZE 512
 
-static int befs_readdir(struct file *, void *, filldir_t);
+static int befs_readdir(struct file *, struct dir_context *);
 static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 static int befs_readpage(struct file *file, struct page *page);
 static sector_t befs_bmap(struct address_space *mapping, sector_t block);
@@ -66,7 +66,7 @@ static struct kmem_cache *befs_inode_cachep;
 
 static const struct file_operations befs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= befs_readdir,
+	.iterate	= befs_readdir,
 	.llseek		= generic_file_llseek,
 };
 
@@ -211,9 +211,9 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 }
 
 static int
-befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+befs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;
 	befs_off_t value;
@@ -221,15 +221,14 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	size_t keysize;
 	unsigned char d_type;
 	char keybuf[BEFS_NAME_LEN + 1];
-	char *nlsname;
-	int nlsnamelen;
-	const char *dirname = filp->f_path.dentry->d_name.name;
+	const char *dirname = file->f_path.dentry->d_name.name;
 
 	befs_debug(sb, "---> befs_readdir() "
-		   "name %s, inode %ld, filp->f_pos %Ld",
-		   dirname, inode->i_ino, filp->f_pos);
+		   "name %s, inode %ld, ctx->pos %Ld",
+		   dirname, inode->i_ino, ctx->pos);
 
-	result = befs_btree_read(sb, ds, filp->f_pos, BEFS_NAME_LEN + 1,
+more:
+	result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
 				 keybuf, &keysize, &value);
 
 	if (result == BEFS_ERR) {
@@ -251,24 +250,29 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	/* Convert to NLS */
 	if (BEFS_SB(sb)->nls) {
+		char *nlsname;
+		int nlsnamelen;
 		result =
 		    befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
 		if (result < 0) {
 			befs_debug(sb, "<--- befs_readdir() ERROR");
 			return result;
 		}
-		result = filldir(dirent, nlsname, nlsnamelen, filp->f_pos,
-				 (ino_t) value, d_type);
+		if (!dir_emit(ctx, nlsname, nlsnamelen,
+				 (ino_t) value, d_type)) {
+			kfree(nlsname);
+			return 0;
+		}
 		kfree(nlsname);
-
 	} else {
-		result = filldir(dirent, keybuf, keysize, filp->f_pos,
-				 (ino_t) value, d_type);
+		if (!dir_emit(ctx, keybuf, keysize,
+				 (ino_t) value, d_type))
+			return 0;
 	}
+	ctx->pos++;
+	goto more;
 
-	filp->f_pos++;
-
-	befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos);
+	befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos);
 
 	return 0;
 }
@@ -951,6 +955,7 @@ static struct file_system_type befs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,	
 };
+MODULE_ALIAS_FS("befs");
 
 static int __init
 init_befs_fs(void)
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
index c2336c62024..3728a6479c6 100644
--- a/fs/bfs/Kconfig
+++ b/fs/bfs/Kconfig
@@ -1,6 +1,6 @@
 config BFS_FS
-	tristate "BFS file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
+	tristate "BFS file system support"
+	depends on BLOCK
 	help
 	  Boot File System (BFS) is a file system used under SCO UnixWare to
 	  allow the bootloader access to the kernel image and other important
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 2785ef91191..a399e6d9dc7 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -26,58 +26,51 @@ static struct buffer_head *bfs_find_entry(struct inode *dir,
 				const unsigned char *name, int namelen,
 				struct bfs_dirent **res_dir);
 
-static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
+static int bfs_readdir(struct file *f, struct dir_context *ctx)
 {
-	struct inode *dir = f->f_path.dentry->d_inode;
+	struct inode *dir = file_inode(f);
 	struct buffer_head *bh;
 	struct bfs_dirent *de;
-	struct bfs_sb_info *info = BFS_SB(dir->i_sb);
 	unsigned int offset;
 	int block;
 
-	mutex_lock(&info->bfs_lock);
-
-	if (f->f_pos & (BFS_DIRENT_SIZE - 1)) {
+	if (ctx->pos & (BFS_DIRENT_SIZE - 1)) {
 		printf("Bad f_pos=%08lx for %s:%08lx\n",
-					(unsigned long)f->f_pos,
+					(unsigned long)ctx->pos,
 					dir->i_sb->s_id, dir->i_ino);
-		mutex_unlock(&info->bfs_lock);
-		return -EBADF;
+		return -EINVAL;
 	}
 
-	while (f->f_pos < dir->i_size) {
-		offset = f->f_pos & (BFS_BSIZE - 1);
-		block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS);
+	while (ctx->pos < dir->i_size) {
+		offset = ctx->pos & (BFS_BSIZE - 1);
+		block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS);
 		bh = sb_bread(dir->i_sb, block);
 		if (!bh) {
-			f->f_pos += BFS_BSIZE - offset;
+			ctx->pos += BFS_BSIZE - offset;
 			continue;
 		}
 		do {
 			de = (struct bfs_dirent *)(bh->b_data + offset);
 			if (de->ino) {
 				int size = strnlen(de->name, BFS_NAMELEN);
-				if (filldir(dirent, de->name, size, f->f_pos,
+				if (!dir_emit(ctx, de->name, size,
 						le16_to_cpu(de->ino),
-						DT_UNKNOWN) < 0) {
+						DT_UNKNOWN)) {
 					brelse(bh);
-					mutex_unlock(&info->bfs_lock);
 					return 0;
 				}
 			}
 			offset += BFS_DIRENT_SIZE;
-			f->f_pos += BFS_DIRENT_SIZE;
-		} while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size));
+			ctx->pos += BFS_DIRENT_SIZE;
+		} while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size));
 		brelse(bh);
 	}
-
-	mutex_unlock(&info->bfs_lock);
-	return 0;	
+	return 0;
 }
 
 const struct file_operations bfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= bfs_readdir,
+	.iterate	= bfs_readdir,
 	.fsync		= generic_file_fsync,
 	.llseek		= generic_file_llseek,
 };
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 737aaa3f709..5e376bb9341 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -473,6 +473,7 @@ static struct file_system_type bfs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("bfs");
 
 static int __init init_bfs_fs(void)
 {
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 6043567b95c..89dec7f789a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -62,7 +62,6 @@ static int aout_core_dump(struct coredump_params *cprm)
 	fs = get_fs();
 	set_fs(KERNEL_DS);
 	has_dumped = 1;
-	current->flags |= PF_DUMPCORE;
        	strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
 	dump.u_ar0 = offsetof(struct user, regs);
 	dump.signal = cprm->siginfo->si_signo;
@@ -214,7 +213,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
 	if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
 	     N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
 	    N_TRSIZE(ex) || N_DRSIZE(ex) ||
-	    i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
+	    i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
 		return -ENOEXEC;
 	}
 
@@ -256,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm)
 		(current->mm->start_data = N_DATADDR(ex));
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
-	current->mm->free_area_cache = current->mm->mmap_base;
-	current->mm->cached_hole_size = 0;
 
 	retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
 	if (retval < 0) {
@@ -287,15 +284,12 @@ static int load_aout_binary(struct linux_binprm * bprm)
 			return error;
 		}
 
-		error = bprm->file->f_op->read(bprm->file,
-			  (char __user *)text_addr,
-			  ex.a_text+ex.a_data, &pos);
+		error = read_code(bprm->file, text_addr, pos,
+				  ex.a_text+ex.a_data);
 		if ((signed long)error < 0) {
 			send_sig(SIGKILL, current, 0);
 			return error;
 		}
-			 
-		flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data);
 	} else {
 		if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
 		    (N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
@@ -311,14 +305,9 @@ static int load_aout_binary(struct linux_binprm * bprm)
 		}
 
 		if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
-			loff_t pos = fd_offset;
 			vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
-			bprm->file->f_op->read(bprm->file,
-					(char __user *)N_TXTADDR(ex),
-					ex.a_text+ex.a_data, &pos);
-			flush_icache_range((unsigned long) N_TXTADDR(ex),
-					   (unsigned long) N_TXTADDR(ex) +
-					   ex.a_text+ex.a_data);
+			read_code(bprm->file, N_TXTADDR(ex), fd_offset,
+				  ex.a_text + ex.a_data);
 			goto beyond_if;
 		}
 
@@ -367,7 +356,7 @@ static int load_aout_library(struct file *file)
 	int retval;
 	struct exec ex;
 
-	inode = file->f_path.dentry->d_inode;
+	inode = file_inode(file);
 
 	retval = -ENOEXEC;
 	error = kernel_read(file, 0, (char *) &ex, sizeof(ex));
@@ -397,8 +386,6 @@ static int load_aout_library(struct file *file)
 	start_addr =  ex.a_entry & 0xfffff000;
 
 	if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
-		loff_t pos = N_TXTOFF(ex);
-
 		if (printk_ratelimit())
 		{
 			printk(KERN_WARNING 
@@ -407,11 +394,8 @@ static int load_aout_library(struct file *file)
 		}
 		vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
 		
-		file->f_op->read(file, (char __user *)start_addr,
-			ex.a_text + ex.a_data, &pos);
-		flush_icache_range((unsigned long) start_addr,
-				   (unsigned long) start_addr + ex.a_text + ex.a_data);
-
+		read_code(file, start_addr, N_TXTOFF(ex),
+			  ex.a_text + ex.a_data);
 		retval = 0;
 		goto out;
 	}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 0c42cdbabec..100edcc5e31 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -33,6 +33,7 @@
 #include <linux/elf.h>
 #include <linux/utsname.h>
 #include <linux/coredump.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
@@ -239,6 +240,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid));
  	NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
 	NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
+#ifdef ELF_HWCAP2
+	NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
+#endif
 	NEW_AUX_ENT(AT_EXECFN, bprm->exec);
 	if (k_platform) {
 		NEW_AUX_ENT(AT_PLATFORM,
@@ -321,6 +325,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	return 0;
 }
 
+#ifndef elf_map
+
 static unsigned long elf_map(struct file *filep, unsigned long addr,
 		struct elf_phdr *eppnt, int prot, int type,
 		unsigned long total_size)
@@ -355,6 +361,8 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
 	return(map_addr);
 }
 
+#endif /* !elf_map */
+
 static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
 {
 	int i, first_idx = -1, last_idx = -1;
@@ -730,8 +738,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
 
 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
-	current->mm->free_area_cache = current->mm->mmap_base;
-	current->mm->cached_hole_size = 0;
 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
 				 executable_stack);
 	if (retval < 0) {
@@ -798,7 +804,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
 			 * follow the loader, and is not movable.  */
 #ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
 			/* Memory randomization might have been switched off
-			 * in runtime via sysctl.
+			 * in runtime via sysctl or explicit setting of
+			 * personality flags.
 			 * If that is the case, retain the original non-zero
 			 * load_bias value in order to establish proper
 			 * non-randomized mappings.
@@ -1132,6 +1139,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
 			goto whole;
 		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
 			goto whole;
+		return 0;
 	}
 
 	/* Do not dump I/O mapped devices or special mappings */
@@ -1140,7 +1148,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
 
 	/* By default, dump shared memory if mapped from an anonymous file. */
 	if (vma->vm_flags & VM_SHARED) {
-		if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ?
+		if (file_inode(vma->vm_file)->i_nlink == 0 ?
 		    FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
 			goto whole;
 		return 0;
@@ -1248,7 +1256,7 @@ static int writenote(struct memelfnote *men, struct file *file,
 #undef DUMP_WRITE
 
 static void fill_elf_header(struct elfhdr *elf, int segs,
-			    u16 machine, u32 flags, u8 osabi)
+			    u16 machine, u32 flags)
 {
 	memset(elf, 0, sizeof(*elf));
 
@@ -1320,8 +1328,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
 		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
 	} else {
-		cputime_to_timeval(p->utime, &prstatus->pr_utime);
-		cputime_to_timeval(p->stime, &prstatus->pr_stime);
+		cputime_t utime, stime;
+
+		task_cputime(p, &utime, &stime);
+		cputime_to_timeval(utime, &prstatus->pr_utime);
+		cputime_to_timeval(stime, &prstatus->pr_stime);
 	}
 	cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
 	cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
@@ -1630,7 +1641,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	 * Initialize the ELF file header.
 	 */
 	fill_elf_header(elf, phdrs,
-			view->e_machine, view->e_flags, view->ei_osabi);
+			view->e_machine, view->e_flags);
 
 	/*
 	 * Allocate a structure for each thread.
@@ -1870,7 +1881,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	elf_core_copy_regs(&info->prstatus->pr_reg, regs);
 
 	/* Set up header */
-	fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI);
+	fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
 
 	/*
 	 * Set up the notes in similar form to SVR4 core dumps made
@@ -2082,8 +2093,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 		goto cleanup;
 
 	has_dumped = 1;
-	current->flags |= PF_DUMPCORE;
-  
+
 	fs = get_fs();
 	set_fs(KERNEL_DS);
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index dc84732e554..c166f325a18 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -483,7 +483,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	size_t platform_len = 0, len;
 	char *k_platform, *k_base_platform;
 	char __user *u_platform, *u_base_platform, *p;
-	long hwcap;
 	int loop;
 	int nr;	/* reset for each csp adjustment */
 
@@ -502,8 +501,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 		return -EFAULT;
 #endif
 
-	hwcap = ELF_HWCAP;
-
 	/*
 	 * If this architecture has a platform capability string, copy it
 	 * to userspace.  In some cases (Sparc), this info is impossible
@@ -617,7 +614,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 
 	nr = 0;
 	csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long);
-	NEW_AUX_ENT(AT_HWCAP,	hwcap);
+	NEW_AUX_ENT(AT_HWCAP,	ELF_HWCAP);
+#ifdef ELF_HWCAP2
+	NEW_AUX_ENT(AT_HWCAP2,	ELF_HWCAP2);
+#endif
 	NEW_AUX_ENT(AT_PAGESZ,	PAGE_SIZE);
 	NEW_AUX_ENT(AT_CLKTCK,	CLOCKS_PER_SEC);
 	NEW_AUX_ENT(AT_PHDR,	exec_params->ph_addr);
@@ -909,7 +909,7 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
 
 dynamic_error:
 	printk("ELF FDPIC %s with invalid DYNAMIC section (inode=%lu)\n",
-	       what, file->f_path.dentry->d_inode->i_ino);
+	       what, file_inode(file)->i_ino);
 	return -ELIBBAD;
 }
 
@@ -926,7 +926,6 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 	struct elf32_fdpic_loadseg *seg;
 	struct elf32_phdr *phdr;
 	unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags;
-	loff_t fpos;
 	int loop, ret;
 
 	load_addr = params->load_addr;
@@ -964,14 +963,12 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 		if (params->phdrs[loop].p_type != PT_LOAD)
 			continue;
 
-		fpos = phdr->p_offset;
-
 		seg->addr = maddr + (phdr->p_vaddr - base);
 		seg->p_vaddr = phdr->p_vaddr;
 		seg->p_memsz = phdr->p_memsz;
 
-		ret = file->f_op->read(file, (void *) seg->addr,
-				       phdr->p_filesz, &fpos);
+		ret = read_code(file, seg->addr, phdr->p_offset,
+				       phdr->p_filesz);
 		if (ret < 0)
 			return ret;
 
@@ -1219,7 +1216,7 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
 
 	/* By default, dump shared memory if mapped from an anonymous file. */
 	if (vma->vm_flags & VM_SHARED) {
-		if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0) {
+		if (file_inode(vma->vm_file)->i_nlink == 0) {
 			dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags);
 			kdcore("%08lx: %08lx: %s (share)", vma->vm_start,
 			       vma->vm_flags, dump_ok ? "yes" : "no");
@@ -1375,8 +1372,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
 		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
 	} else {
-		cputime_to_timeval(p->utime, &prstatus->pr_utime);
-		cputime_to_timeval(p->stime, &prstatus->pr_stime);
+		cputime_t utime, stime;
+
+		task_cputime(p, &utime, &stime);
+		cputime_to_timeval(utime, &prstatus->pr_utime);
+		cputime_to_timeval(stime, &prstatus->pr_stime);
 	}
 	cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
 	cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
@@ -1684,8 +1684,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 	fill_elf_fdpic_header(elf, e_phnum);
 
 	has_dumped = 1;
-	current->flags |= PF_DUMPCORE;
-
 	/*
 	 * Set up the notes in similar form to SVR4 core dumps made
 	 * with info from their /proc.
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b56371981d1..d50bbe59da1 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -207,11 +207,12 @@ static int decompress_exec(
 
 	/* Read in first chunk of data and parse gzip header. */
 	fpos = offset;
-	ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos);
+	ret = kernel_read(bprm->file, offset, buf, LBUFSIZE);
 
 	strm.next_in = buf;
 	strm.avail_in = ret;
 	strm.total_in = 0;
+	fpos += ret;
 
 	retval = -ENOEXEC;
 
@@ -277,7 +278,7 @@ static int decompress_exec(
 	}
 
 	while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK) {
-		ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos);
+		ret = kernel_read(bprm->file, fpos, buf, LBUFSIZE);
 		if (ret <= 0)
 			break;
 		len -= ret;
@@ -285,6 +286,7 @@ static int decompress_exec(
 		strm.next_in = buf;
 		strm.avail_in = ret;
 		strm.total_in = 0;
+		fpos += ret;
 	}
 
 	if (ret < 0) {
@@ -428,6 +430,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 	unsigned long textpos = 0, datapos = 0, result;
 	unsigned long realdatastart = 0;
 	unsigned long text_len, data_len, bss_len, stack_len, flags;
+	unsigned long full_data;
 	unsigned long len, memp = 0;
 	unsigned long memp_size, extra, rlim;
 	unsigned long *reloc = 0, *rp;
@@ -438,7 +441,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 	int ret;
 
 	hdr = ((struct flat_hdr *) bprm->buf);		/* exec-header */
-	inode = bprm->file->f_path.dentry->d_inode;
+	inode = file_inode(bprm->file);
 
 	text_len  = ntohl(hdr->data_start);
 	data_len  = ntohl(hdr->data_end) - ntohl(hdr->data_start);
@@ -451,6 +454,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 	relocs    = ntohl(hdr->reloc_count);
 	flags     = ntohl(hdr->flags);
 	rev       = ntohl(hdr->rev);
+	full_data = data_len + relocs * sizeof(unsigned long);
 
 	if (strncmp(hdr->magic, "bFLT", 4)) {
 		/*
@@ -577,12 +581,12 @@ static int load_flat_file(struct linux_binprm * bprm,
 #ifdef CONFIG_BINFMT_ZFLAT
 		if (flags & FLAT_FLAG_GZDATA) {
 			result = decompress_exec(bprm, fpos, (char *) datapos, 
-						 data_len + (relocs * sizeof(unsigned long)), 0);
+						 full_data, 0);
 		} else
 #endif
 		{
-			result = bprm->file->f_op->read(bprm->file, (char *) datapos,
-					data_len + (relocs * sizeof(unsigned long)), &fpos);
+			result = read_code(bprm->file, datapos, fpos,
+					full_data);
 		}
 		if (IS_ERR_VALUE(result)) {
 			printk("Unable to read data+bss, errno %d\n", (int)-result);
@@ -627,30 +631,25 @@ static int load_flat_file(struct linux_binprm * bprm,
 		if (flags & FLAT_FLAG_GZIP) {
 			result = decompress_exec(bprm, sizeof (struct flat_hdr),
 					 (((char *) textpos) + sizeof (struct flat_hdr)),
-					 (text_len + data_len + (relocs * sizeof(unsigned long))
+					 (text_len + full_data
 						  - sizeof (struct flat_hdr)),
 					 0);
 			memmove((void *) datapos, (void *) realdatastart,
-					data_len + (relocs * sizeof(unsigned long)));
+					full_data);
 		} else if (flags & FLAT_FLAG_GZDATA) {
-			fpos = 0;
-			result = bprm->file->f_op->read(bprm->file,
-					(char *) textpos, text_len, &fpos);
+			result = read_code(bprm->file, textpos, 0, text_len);
 			if (!IS_ERR_VALUE(result))
 				result = decompress_exec(bprm, text_len, (char *) datapos,
-						 data_len + (relocs * sizeof(unsigned long)), 0);
+						 full_data, 0);
 		}
 		else
 #endif
 		{
-			fpos = 0;
-			result = bprm->file->f_op->read(bprm->file,
-					(char *) textpos, text_len, &fpos);
-			if (!IS_ERR_VALUE(result)) {
-				fpos = ntohl(hdr->data_start);
-				result = bprm->file->f_op->read(bprm->file, (char *) datapos,
-					data_len + (relocs * sizeof(unsigned long)), &fpos);
-			}
+			result = read_code(bprm->file, textpos, 0, text_len);
+			if (!IS_ERR_VALUE(result))
+				result = read_code(bprm->file, datapos,
+						   ntohl(hdr->data_start),
+						   full_data);
 		}
 		if (IS_ERR_VALUE(result)) {
 			printk("Unable to read code+data+bss, errno %d\n",(int)-result);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 0c8869fdd14..1c740e152f3 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -23,6 +23,7 @@
 #include <linux/binfmts.h>
 #include <linux/slab.h>
 #include <linux/ctype.h>
+#include <linux/string_helpers.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
 #include <linux/namei.h>
@@ -234,24 +235,6 @@ static char *scanarg(char *s, char del)
 	return s;
 }
 
-static int unquote(char *from)
-{
-	char c = 0, *s = from, *p = from;
-
-	while ((c = *s++) != '\0') {
-		if (c == '\\' && *s == 'x') {
-			s++;
-			c = toupper(*s++);
-			*p = (c - (isdigit(c) ? '0' : 'A' - 10)) << 4;
-			c = toupper(*s++);
-			*p++ |= c - (isdigit(c) ? '0' : 'A' - 10);
-			continue;
-		}
-		*p++ = c;
-	}
-	return p - from;
-}
-
 static char * check_special_flags (char * sfs, Node * e)
 {
 	char * p = sfs;
@@ -354,8 +337,9 @@ static Node *create_entry(const char __user *buffer, size_t count)
 		p[-1] = '\0';
 		if (!e->mask[0])
 			e->mask = NULL;
-		e->size = unquote(e->magic);
-		if (e->mask && unquote(e->mask) != e->size)
+		e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX);
+		if (e->mask &&
+		    string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size)
 			goto Einval;
 		if (e->size + e->offset > BINPRM_BUF_SIZE)
 			goto Einval;
@@ -531,7 +515,7 @@ static void kill_node(Node *e)
 static ssize_t
 bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos)
 {
-	Node *e = file->f_path.dentry->d_inode->i_private;
+	Node *e = file_inode(file)->i_private;
 	ssize_t res;
 	char *page;
 
@@ -550,7 +534,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 				size_t count, loff_t *ppos)
 {
 	struct dentry *root;
-	Node *e = file->f_path.dentry->d_inode->i_private;
+	Node *e = file_inode(file)->i_private;
 	int res = parse_command(buffer, count);
 
 	switch (res) {
@@ -720,6 +704,7 @@ static struct file_system_type bm_fs_type = {
 	.mount		= bm_mount,
 	.kill_sb	= kill_litter_super,
 };
+MODULE_ALIAS_FS("binfmt_misc");
 
 static int __init init_misc_binfmt(void)
 {
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index a3f28f331b2..8fb42916d8a 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -27,48 +27,11 @@
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 
-struct integrity_slab {
-	struct kmem_cache *slab;
-	unsigned short nr_vecs;
-	char name[8];
-};
-
-#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) }
-struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = {
-	IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES),
-};
-#undef IS
+#define BIP_INLINE_VECS	4
 
+static struct kmem_cache *bip_slab;
 static struct workqueue_struct *kintegrityd_wq;
 
-static inline unsigned int vecs_to_idx(unsigned int nr)
-{
-	switch (nr) {
-	case 1:
-		return 0;
-	case 2 ... 4:
-		return 1;
-	case 5 ... 16:
-		return 2;
-	case 17 ... 64:
-		return 3;
-	case 65 ... 128:
-		return 4;
-	case 129 ... BIO_MAX_PAGES:
-		return 5;
-	default:
-		BUG();
-	}
-}
-
-static inline int use_bip_pool(unsigned int idx)
-{
-	if (idx == BIOVEC_MAX_IDX)
-		return 1;
-
-	return 0;
-}
-
 /**
  * bio_integrity_alloc - Allocate integrity payload and attach it to bio
  * @bio:	bio to attach integrity metadata to
@@ -84,37 +47,41 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 						  unsigned int nr_vecs)
 {
 	struct bio_integrity_payload *bip;
-	unsigned int idx = vecs_to_idx(nr_vecs);
 	struct bio_set *bs = bio->bi_pool;
-
-	if (!bs)
-		bs = fs_bio_set;
-
-	BUG_ON(bio == NULL);
-	bip = NULL;
-
-	/* Lower order allocations come straight from slab */
-	if (!use_bip_pool(idx))
-		bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask);
-
-	/* Use mempool if lower order alloc failed or max vecs were requested */
-	if (bip == NULL) {
-		idx = BIOVEC_MAX_IDX;  /* so we free the payload properly later */
+	unsigned long idx = BIO_POOL_NONE;
+	unsigned inline_vecs;
+
+	if (!bs) {
+		bip = kmalloc(sizeof(struct bio_integrity_payload) +
+			      sizeof(struct bio_vec) * nr_vecs, gfp_mask);
+		inline_vecs = nr_vecs;
+	} else {
 		bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
-
-		if (unlikely(bip == NULL)) {
-			printk(KERN_ERR "%s: could not alloc bip\n", __func__);
-			return NULL;
-		}
+		inline_vecs = BIP_INLINE_VECS;
 	}
 
+	if (unlikely(!bip))
+		return NULL;
+
 	memset(bip, 0, sizeof(*bip));
 
+	if (nr_vecs > inline_vecs) {
+		bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
+					  bs->bvec_integrity_pool);
+		if (!bip->bip_vec)
+			goto err;
+	} else {
+		bip->bip_vec = bip->bip_inline_vecs;
+	}
+
 	bip->bip_slab = idx;
 	bip->bip_bio = bio;
 	bio->bi_integrity = bip;
 
 	return bip;
+err:
+	mempool_free(bip, bs->bio_integrity_pool);
+	return NULL;
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
 
@@ -130,20 +97,18 @@ void bio_integrity_free(struct bio *bio)
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct bio_set *bs = bio->bi_pool;
 
-	if (!bs)
-		bs = fs_bio_set;
-
-	BUG_ON(bip == NULL);
-
-	/* A cloned bio doesn't own the integrity metadata */
-	if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
-	    && bip->bip_buf != NULL)
+	if (bip->bip_owns_buf)
 		kfree(bip->bip_buf);
 
-	if (use_bip_pool(bip->bip_slab))
+	if (bs) {
+		if (bip->bip_slab != BIO_POOL_NONE)
+			bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
+				  bip->bip_slab);
+
 		mempool_free(bip, bs->bio_integrity_pool);
-	else
-		kmem_cache_free(bip_slab[bip->bip_slab].slab, bip);
+	} else {
+		kfree(bip);
+	}
 
 	bio->bi_integrity = NULL;
 }
@@ -419,6 +384,7 @@ int bio_integrity_prep(struct bio *bio)
 		return -EIO;
 	}
 
+	bip->bip_owns_buf = 1;
 	bip->bip_buf = buf;
 	bip->bip_size = len;
 	bip->bip_sector = bio->bi_sector;
@@ -694,11 +660,11 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
 	bp->bio1.bi_integrity = &bp->bip1;
 	bp->bio2.bi_integrity = &bp->bip2;
 
-	bp->iv1 = bip->bip_vec[0];
-	bp->iv2 = bip->bip_vec[0];
+	bp->iv1 = bip->bip_vec[bip->bip_idx];
+	bp->iv2 = bip->bip_vec[bip->bip_idx];
 
-	bp->bip1.bip_vec[0] = bp->iv1;
-	bp->bip2.bip_vec[0] = bp->iv2;
+	bp->bip1.bip_vec = &bp->iv1;
+	bp->bip2.bip_vec = &bp->iv2;
 
 	bp->iv1.bv_len = sectors * bi->tuple_size;
 	bp->iv2.bv_offset += sectors * bi->tuple_size;
@@ -746,13 +712,14 @@ EXPORT_SYMBOL(bio_integrity_clone);
 
 int bioset_integrity_create(struct bio_set *bs, int pool_size)
 {
-	unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
-
 	if (bs->bio_integrity_pool)
 		return 0;
 
-	bs->bio_integrity_pool =
-		mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
+	bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
+
+	bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
+	if (!bs->bvec_integrity_pool)
+		return -1;
 
 	if (!bs->bio_integrity_pool)
 		return -1;
@@ -765,13 +732,14 @@ void bioset_integrity_free(struct bio_set *bs)
 {
 	if (bs->bio_integrity_pool)
 		mempool_destroy(bs->bio_integrity_pool);
+
+	if (bs->bvec_integrity_pool)
+		mempool_destroy(bs->bio_integrity_pool);
 }
 EXPORT_SYMBOL(bioset_integrity_free);
 
 void __init bio_integrity_init(void)
 {
-	unsigned int i;
-
 	/*
 	 * kintegrityd won't block much but may burn a lot of CPU cycles.
 	 * Make it highpri CPU intensive wq with max concurrency of 1.
@@ -781,14 +749,10 @@ void __init bio_integrity_init(void)
 	if (!kintegrityd_wq)
 		panic("Failed to create kintegrityd\n");
 
-	for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) {
-		unsigned int size;
-
-		size = sizeof(struct bio_integrity_payload)
-			+ bip_slab[i].nr_vecs * sizeof(struct bio_vec);
-
-		bip_slab[i].slab =
-			kmem_cache_create(bip_slab[i].name, size, 0,
-					  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-	}
+	bip_slab = kmem_cache_create("bio_integrity_payload",
+				     sizeof(struct bio_integrity_payload) +
+				     sizeof(struct bio_vec) * BIP_INLINE_VECS,
+				     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+	if (!bip_slab)
+		panic("Failed to create slab\n");
 }
diff --git a/fs/bio.c b/fs/bio.c
index b96fc6ce485..94bbc04dba7 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -19,6 +19,7 @@
 #include <linux/swap.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/uio.h>
 #include <linux/iocontext.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -160,12 +161,12 @@ unsigned int bvec_nr_vecs(unsigned short idx)
 	return bvec_slabs[idx].nr_vecs;
 }
 
-void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
+void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
 {
 	BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
 
 	if (idx == BIOVEC_MAX_IDX)
-		mempool_free(bv, bs->bvec_pool);
+		mempool_free(bv, pool);
 	else {
 		struct biovec_slab *bvs = bvec_slabs + idx;
 
@@ -173,8 +174,8 @@ void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
 	}
 }
 
-struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
-			      struct bio_set *bs)
+struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
+			   mempool_t *pool)
 {
 	struct bio_vec *bvl;
 
@@ -210,7 +211,7 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
 	 */
 	if (*idx == BIOVEC_MAX_IDX) {
 fallback:
-		bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
+		bvl = mempool_alloc(pool, gfp_mask);
 	} else {
 		struct biovec_slab *bvs = bvec_slabs + *idx;
 		gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
@@ -252,8 +253,8 @@ static void bio_free(struct bio *bio)
 	__bio_free(bio);
 
 	if (bs) {
-		if (bio_has_allocated_vec(bio))
-			bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
+		if (bio_flagged(bio, BIO_OWNS_VEC))
+			bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
 
 		/*
 		 * If we have front padding, adjust the bio pointer before freeing
@@ -297,6 +298,54 @@ void bio_reset(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_reset);
 
+static void bio_alloc_rescue(struct work_struct *work)
+{
+	struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
+	struct bio *bio;
+
+	while (1) {
+		spin_lock(&bs->rescue_lock);
+		bio = bio_list_pop(&bs->rescue_list);
+		spin_unlock(&bs->rescue_lock);
+
+		if (!bio)
+			break;
+
+		generic_make_request(bio);
+	}
+}
+
+static void punt_bios_to_rescuer(struct bio_set *bs)
+{
+	struct bio_list punt, nopunt;
+	struct bio *bio;
+
+	/*
+	 * In order to guarantee forward progress we must punt only bios that
+	 * were allocated from this bio_set; otherwise, if there was a bio on
+	 * there for a stacking driver higher up in the stack, processing it
+	 * could require allocating bios from this bio_set, and doing that from
+	 * our own rescuer would be bad.
+	 *
+	 * Since bio lists are singly linked, pop them all instead of trying to
+	 * remove from the middle of the list:
+	 */
+
+	bio_list_init(&punt);
+	bio_list_init(&nopunt);
+
+	while ((bio = bio_list_pop(current->bio_list)))
+		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+
+	*current->bio_list = nopunt;
+
+	spin_lock(&bs->rescue_lock);
+	bio_list_merge(&bs->rescue_list, &punt);
+	spin_unlock(&bs->rescue_lock);
+
+	queue_work(bs->rescue_workqueue, &bs->rescue_work);
+}
+
 /**
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask:   the GFP_ mask given to the slab allocator
@@ -314,11 +363,27 @@ EXPORT_SYMBOL(bio_reset);
  *   previously allocated bio for IO before attempting to allocate a new one.
  *   Failure to do so can cause deadlocks under memory pressure.
  *
+ *   Note that when running under generic_make_request() (i.e. any block
+ *   driver), bios are not submitted until after you return - see the code in
+ *   generic_make_request() that converts recursion into iteration, to prevent
+ *   stack overflows.
+ *
+ *   This would normally mean allocating multiple bios under
+ *   generic_make_request() would be susceptible to deadlocks, but we have
+ *   deadlock avoidance code that resubmits any blocked bios from a rescuer
+ *   thread.
+ *
+ *   However, we do not guarantee forward progress for allocations from other
+ *   mempools. Doing multiple allocations from the same mempool under
+ *   generic_make_request() should be avoided - instead, use bio_set's front_pad
+ *   for per bio allocations.
+ *
  *   RETURNS:
  *   Pointer to new bio on success, NULL on failure.
  */
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
+	gfp_t saved_gfp = gfp_mask;
 	unsigned front_pad;
 	unsigned inline_vecs;
 	unsigned long idx = BIO_POOL_NONE;
@@ -336,7 +401,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 		front_pad = 0;
 		inline_vecs = nr_iovecs;
 	} else {
+		/*
+		 * generic_make_request() converts recursion to iteration; this
+		 * means if we're running beneath it, any bios we allocate and
+		 * submit will not be submitted (and thus freed) until after we
+		 * return.
+		 *
+		 * This exposes us to a potential deadlock if we allocate
+		 * multiple bios from the same bio_set() while running
+		 * underneath generic_make_request(). If we were to allocate
+		 * multiple bios (say a stacking block driver that was splitting
+		 * bios), we would deadlock if we exhausted the mempool's
+		 * reserve.
+		 *
+		 * We solve this, and guarantee forward progress, with a rescuer
+		 * workqueue per bio_set. If we go to allocate and there are
+		 * bios on current->bio_list, we first try the allocation
+		 * without __GFP_WAIT; if that fails, we punt those bios we
+		 * would be blocking to the rescuer workqueue before we retry
+		 * with the original gfp_flags.
+		 */
+
+		if (current->bio_list && !bio_list_empty(current->bio_list))
+			gfp_mask &= ~__GFP_WAIT;
+
 		p = mempool_alloc(bs->bio_pool, gfp_mask);
+		if (!p && gfp_mask != saved_gfp) {
+			punt_bios_to_rescuer(bs);
+			gfp_mask = saved_gfp;
+			p = mempool_alloc(bs->bio_pool, gfp_mask);
+		}
+
 		front_pad = bs->front_pad;
 		inline_vecs = BIO_INLINE_VECS;
 	}
@@ -348,9 +443,17 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 	bio_init(bio);
 
 	if (nr_iovecs > inline_vecs) {
-		bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
+		bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+		if (!bvl && gfp_mask != saved_gfp) {
+			punt_bios_to_rescuer(bs);
+			gfp_mask = saved_gfp;
+			bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+		}
+
 		if (unlikely(!bvl))
 			goto err_free;
+
+		bio->bi_flags |= 1 << BIO_OWNS_VEC;
 	} else if (nr_iovecs) {
 		bvl = bio->bi_inline_vecs;
 	}
@@ -652,6 +755,181 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 }
 EXPORT_SYMBOL(bio_add_page);
 
+struct submit_bio_ret {
+	struct completion event;
+	int error;
+};
+
+static void submit_bio_wait_endio(struct bio *bio, int error)
+{
+	struct submit_bio_ret *ret = bio->bi_private;
+
+	ret->error = error;
+	complete(&ret->event);
+}
+
+/**
+ * submit_bio_wait - submit a bio, and wait until it completes
+ * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
+ * @bio: The &struct bio which describes the I/O
+ *
+ * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
+ * bio_endio() on failure.
+ */
+int submit_bio_wait(int rw, struct bio *bio)
+{
+	struct submit_bio_ret ret;
+
+	rw |= REQ_SYNC;
+	init_completion(&ret.event);
+	bio->bi_private = &ret;
+	bio->bi_end_io = submit_bio_wait_endio;
+	submit_bio(rw, bio);
+	wait_for_completion(&ret.event);
+
+	return ret.error;
+}
+EXPORT_SYMBOL(submit_bio_wait);
+
+/**
+ * bio_advance - increment/complete a bio by some number of bytes
+ * @bio:	bio to advance
+ * @bytes:	number of bytes to complete
+ *
+ * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
+ * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
+ * be updated on the last bvec as well.
+ *
+ * @bio will then represent the remaining, uncompleted portion of the io.
+ */
+void bio_advance(struct bio *bio, unsigned bytes)
+{
+	if (bio_integrity(bio))
+		bio_integrity_advance(bio, bytes);
+
+	bio->bi_sector += bytes >> 9;
+	bio->bi_size -= bytes;
+
+	if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK)
+		return;
+
+	while (bytes) {
+		if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
+			WARN_ONCE(1, "bio idx %d >= vcnt %d\n",
+				  bio->bi_idx, bio->bi_vcnt);
+			break;
+		}
+
+		if (bytes >= bio_iovec(bio)->bv_len) {
+			bytes -= bio_iovec(bio)->bv_len;
+			bio->bi_idx++;
+		} else {
+			bio_iovec(bio)->bv_len -= bytes;
+			bio_iovec(bio)->bv_offset += bytes;
+			bytes = 0;
+		}
+	}
+}
+EXPORT_SYMBOL(bio_advance);
+
+/**
+ * bio_alloc_pages - allocates a single page for each bvec in a bio
+ * @bio: bio to allocate pages for
+ * @gfp_mask: flags for allocation
+ *
+ * Allocates pages up to @bio->bi_vcnt.
+ *
+ * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
+ * freed.
+ */
+int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+{
+	int i;
+	struct bio_vec *bv;
+
+	bio_for_each_segment_all(bv, bio, i) {
+		bv->bv_page = alloc_page(gfp_mask);
+		if (!bv->bv_page) {
+			while (--bv >= bio->bi_io_vec)
+				__free_page(bv->bv_page);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(bio_alloc_pages);
+
+/**
+ * bio_copy_data - copy contents of data buffers from one chain of bios to
+ * another
+ * @src: source bio list
+ * @dst: destination bio list
+ *
+ * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
+ * @src and @dst as linked lists of bios.
+ *
+ * Stops when it reaches the end of either @src or @dst - that is, copies
+ * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+ */
+void bio_copy_data(struct bio *dst, struct bio *src)
+{
+	struct bio_vec *src_bv, *dst_bv;
+	unsigned src_offset, dst_offset, bytes;
+	void *src_p, *dst_p;
+
+	src_bv = bio_iovec(src);
+	dst_bv = bio_iovec(dst);
+
+	src_offset = src_bv->bv_offset;
+	dst_offset = dst_bv->bv_offset;
+
+	while (1) {
+		if (src_offset == src_bv->bv_offset + src_bv->bv_len) {
+			src_bv++;
+			if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) {
+				src = src->bi_next;
+				if (!src)
+					break;
+
+				src_bv = bio_iovec(src);
+			}
+
+			src_offset = src_bv->bv_offset;
+		}
+
+		if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) {
+			dst_bv++;
+			if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) {
+				dst = dst->bi_next;
+				if (!dst)
+					break;
+
+				dst_bv = bio_iovec(dst);
+			}
+
+			dst_offset = dst_bv->bv_offset;
+		}
+
+		bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset,
+			    src_bv->bv_offset + src_bv->bv_len - src_offset);
+
+		src_p = kmap_atomic(src_bv->bv_page);
+		dst_p = kmap_atomic(dst_bv->bv_page);
+
+		memcpy(dst_p + dst_bv->bv_offset,
+		       src_p + src_bv->bv_offset,
+		       bytes);
+
+		kunmap_atomic(dst_p);
+		kunmap_atomic(src_p);
+
+		src_offset += bytes;
+		dst_offset += bytes;
+	}
+}
+EXPORT_SYMBOL(bio_copy_data);
+
 struct bio_map_data {
 	struct bio_vec *iovecs;
 	struct sg_iovec *sgvecs;
@@ -714,7 +992,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
 	int iov_idx = 0;
 	unsigned int iov_off = 0;
 
-	__bio_for_each_segment(bvec, bio, i, 0) {
+	bio_for_each_segment_all(bvec, bio, i) {
 		char *bv_addr = page_address(bvec->bv_page);
 		unsigned int bv_len = iovecs[i].bv_len;
 
@@ -896,7 +1174,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	return bio;
 cleanup:
 	if (!map_data)
-		bio_for_each_segment(bvec, bio, i)
+		bio_for_each_segment_all(bvec, bio, i)
 			__free_page(bvec->bv_page);
 
 	bio_put(bio);
@@ -1110,7 +1388,7 @@ static void __bio_unmap_user(struct bio *bio)
 	/*
 	 * make sure we dirty pages we wrote to
 	 */
-	__bio_for_each_segment(bvec, bio, i, 0) {
+	bio_for_each_segment_all(bvec, bio, i) {
 		if (bio_data_dir(bio) == READ)
 			set_page_dirty_lock(bvec->bv_page);
 
@@ -1216,7 +1494,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
 	int i;
 	char *p = bmd->sgvecs[0].iov_base;
 
-	__bio_for_each_segment(bvec, bio, i, 0) {
+	bio_for_each_segment_all(bvec, bio, i) {
 		char *addr = page_address(bvec->bv_page);
 		int len = bmd->iovecs[i].bv_len;
 
@@ -1256,7 +1534,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 	if (!reading) {
 		void *p = data;
 
-		bio_for_each_segment(bvec, bio, i) {
+		bio_for_each_segment_all(bvec, bio, i) {
 			char *addr = page_address(bvec->bv_page);
 
 			memcpy(addr, p, bvec->bv_len);
@@ -1301,11 +1579,11 @@ EXPORT_SYMBOL(bio_copy_kern);
  */
 void bio_set_pages_dirty(struct bio *bio)
 {
-	struct bio_vec *bvec = bio->bi_io_vec;
+	struct bio_vec *bvec;
 	int i;
 
-	for (i = 0; i < bio->bi_vcnt; i++) {
-		struct page *page = bvec[i].bv_page;
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
 
 		if (page && !PageCompound(page))
 			set_page_dirty_lock(page);
@@ -1314,11 +1592,11 @@ void bio_set_pages_dirty(struct bio *bio)
 
 static void bio_release_pages(struct bio *bio)
 {
-	struct bio_vec *bvec = bio->bi_io_vec;
+	struct bio_vec *bvec;
 	int i;
 
-	for (i = 0; i < bio->bi_vcnt; i++) {
-		struct page *page = bvec[i].bv_page;
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
 
 		if (page)
 			put_page(page);
@@ -1367,16 +1645,16 @@ static void bio_dirty_fn(struct work_struct *work)
 
 void bio_check_pages_dirty(struct bio *bio)
 {
-	struct bio_vec *bvec = bio->bi_io_vec;
+	struct bio_vec *bvec;
 	int nr_clean_pages = 0;
 	int i;
 
-	for (i = 0; i < bio->bi_vcnt; i++) {
-		struct page *page = bvec[i].bv_page;
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
 
 		if (PageDirty(page) || PageCompound(page)) {
 			page_cache_release(page);
-			bvec[i].bv_page = NULL;
+			bvec->bv_page = NULL;
 		} else {
 			nr_clean_pages++;
 		}
@@ -1477,8 +1755,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 	trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
 				bi->bi_sector + first_sectors);
 
-	BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0);
-	BUG_ON(bi->bi_idx != 0);
+	BUG_ON(bio_segments(bi) > 1);
 	atomic_set(&bp->cnt, 3);
 	bp->error = 0;
 	bp->bio1 = *bi;
@@ -1488,8 +1765,8 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 	bp->bio1.bi_size = first_sectors << 9;
 
 	if (bi->bi_vcnt != 0) {
-		bp->bv1 = bi->bi_io_vec[0];
-		bp->bv2 = bi->bi_io_vec[0];
+		bp->bv1 = *bio_iovec(bi);
+		bp->bv2 = *bio_iovec(bi);
 
 		if (bio_is_rw(bi)) {
 			bp->bv2.bv_offset += first_sectors << 9;
@@ -1541,7 +1818,7 @@ sector_t bio_sector_offset(struct bio *bio, unsigned short index,
 	if (index >= bio->bi_idx)
 		index = bio->bi_vcnt - 1;
 
-	__bio_for_each_segment(bv, bio, i, 0) {
+	bio_for_each_segment_all(bv, bio, i) {
 		if (i == index) {
 			if (offset > bv->bv_offset)
 				sectors += (offset - bv->bv_offset) / sector_sz;
@@ -1559,29 +1836,25 @@ EXPORT_SYMBOL(bio_sector_offset);
  * create memory pools for biovec's in a bio_set.
  * use the global biovec slabs created for general use.
  */
-static int biovec_create_pools(struct bio_set *bs, int pool_entries)
+mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries)
 {
 	struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
 
-	bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
-	if (!bs->bvec_pool)
-		return -ENOMEM;
-
-	return 0;
-}
-
-static void biovec_free_pools(struct bio_set *bs)
-{
-	mempool_destroy(bs->bvec_pool);
+	return mempool_create_slab_pool(pool_entries, bp->slab);
 }
 
 void bioset_free(struct bio_set *bs)
 {
+	if (bs->rescue_workqueue)
+		destroy_workqueue(bs->rescue_workqueue);
+
 	if (bs->bio_pool)
 		mempool_destroy(bs->bio_pool);
 
+	if (bs->bvec_pool)
+		mempool_destroy(bs->bvec_pool);
+
 	bioset_integrity_free(bs);
-	biovec_free_pools(bs);
 	bio_put_slab(bs);
 
 	kfree(bs);
@@ -1612,6 +1885,10 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 
 	bs->front_pad = front_pad;
 
+	spin_lock_init(&bs->rescue_lock);
+	bio_list_init(&bs->rescue_list);
+	INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
+
 	bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
 	if (!bs->bio_slab) {
 		kfree(bs);
@@ -1622,9 +1899,15 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 	if (!bs->bio_pool)
 		goto bad;
 
-	if (!biovec_create_pools(bs, pool_size))
-		return bs;
+	bs->bvec_pool = biovec_create_pool(bs, pool_size);
+	if (!bs->bvec_pool)
+		goto bad;
+
+	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
+	if (!bs->rescue_workqueue)
+		goto bad;
 
+	return bs;
 bad:
 	bioset_free(bs);
 	return NULL;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 172f8491a2b..bb43ce081d6 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -27,6 +27,7 @@
 #include <linux/namei.h>
 #include <linux/log2.h>
 #include <linux/cleancache.h>
+#include <linux/aio.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -318,37 +319,16 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
 
 /*
  * private llseek:
- * for a block special file file->f_path.dentry->d_inode->i_size is zero
+ * for a block special file file_inode(file)->i_size is zero
  * so we compute the size by hand (just as in block_read/write above)
  */
 static loff_t block_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *bd_inode = file->f_mapping->host;
-	loff_t size;
 	loff_t retval;
 
 	mutex_lock(&bd_inode->i_mutex);
-	size = i_size_read(bd_inode);
-
-	retval = -EINVAL;
-	switch (whence) {
-		case SEEK_END:
-			offset += size;
-			break;
-		case SEEK_CUR:
-			offset += file->f_pos;
-		case SEEK_SET:
-			break;
-		default:
-			goto out;
-	}
-	if (offset >= 0 && offset <= size) {
-		if (offset != file->f_pos) {
-			file->f_pos = offset;
-		}
-		retval = offset;
-	}
-out:
+	retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
 	mutex_unlock(&bd_inode->i_mutex);
 	return retval;
 }
@@ -551,6 +531,7 @@ struct block_device *bdgrab(struct block_device *bdev)
 	ihold(bdev->bd_inode);
 	return bdev;
 }
+EXPORT_SYMBOL(bdgrab);
 
 long nr_blockdev_pages(void)
 {
@@ -616,11 +597,9 @@ void bd_forget(struct inode *inode)
 	struct block_device *bdev = NULL;
 
 	spin_lock(&bdev_lock);
-	if (inode->i_bdev) {
-		if (!sb_is_blkdev_sb(inode->i_sb))
-			bdev = inode->i_bdev;
-		__bd_forget(inode);
-	}
+	if (!sb_is_blkdev_sb(inode->i_sb))
+		bdev = inode->i_bdev;
+	__bd_forget(inode);
 	spin_unlock(&bdev_lock);
 
 	if (bdev)
@@ -994,6 +973,7 @@ int revalidate_disk(struct gendisk *disk)
 
 	mutex_lock(&bdev->bd_mutex);
 	check_disk_size_change(disk, bdev);
+	bdev->bd_invalidated = 0;
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
 	return ret;
@@ -1032,7 +1012,9 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 {
 	unsigned bsize = bdev_logical_block_size(bdev);
 
-	bdev->bd_inode->i_size = size;
+	mutex_lock(&bdev->bd_inode->i_mutex);
+	i_size_write(bdev->bd_inode, size);
+	mutex_unlock(&bdev->bd_inode->i_mutex);
 	while (bsize < PAGE_CACHE_SIZE) {
 		if (size & bsize)
 			break;
@@ -1043,7 +1025,7 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 }
 EXPORT_SYMBOL(bd_set_size);
 
-static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
+static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
 /*
  * bd_mutex locking:
@@ -1117,7 +1099,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				}
 			}
 
-			if (!ret && !bdev->bd_openers) {
+			if (!ret) {
 				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
 				bdi = blk_get_backing_dev_info(bdev);
 				if (bdi == NULL)
@@ -1398,9 +1380,8 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	return blkdev_get(bdev, filp->f_mode, filp);
 }
 
-static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
+static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 {
-	int ret = 0;
 	struct gendisk *disk = bdev->bd_disk;
 	struct block_device *victim = NULL;
 
@@ -1420,7 +1401,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 	}
 	if (bdev->bd_contains == bdev) {
 		if (disk->fops->release)
-			ret = disk->fops->release(disk, mode);
+			disk->fops->release(disk, mode);
 	}
 	if (!bdev->bd_openers) {
 		struct module *owner = disk->fops->owner;
@@ -1439,10 +1420,9 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 	bdput(bdev);
 	if (victim)
 		__blkdev_put(victim, mode, 1);
-	return ret;
 }
 
-int blkdev_put(struct block_device *bdev, fmode_t mode)
+void blkdev_put(struct block_device *bdev, fmode_t mode)
 {
 	mutex_lock(&bdev->bd_mutex);
 
@@ -1486,15 +1466,15 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
 
 	mutex_unlock(&bdev->bd_mutex);
 
-	return __blkdev_put(bdev, mode, 0);
+	__blkdev_put(bdev, mode, 0);
 }
 EXPORT_SYMBOL(blkdev_put);
 
 static int blkdev_close(struct inode * inode, struct file * filp)
 {
 	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
-
-	return blkdev_put(bdev, filp->f_mode);
+	blkdev_put(bdev, filp->f_mode);
+	return 0;
 }
 
 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
@@ -1555,7 +1535,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		return 0;
 
 	size -= pos;
-	if (size < INT_MAX)
+	if (size < iocb->ki_left)
 		nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
 	return generic_file_aio_read(iocb, iov, nr_segs, pos);
 }
@@ -1582,6 +1562,7 @@ static const struct address_space_operations def_blk_aops = {
 	.writepages	= generic_writepages,
 	.releasepage	= blkdev_releasepage,
 	.direct_IO	= blkdev_direct_IO,
+	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
 const struct file_operations def_blk_fops = {
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index d33f01c08b6..2b3b8329697 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -1,11 +1,13 @@
 config BTRFS_FS
-	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
-	depends on EXPERIMENTAL
+	tristate "Btrfs filesystem support"
 	select LIBCRC32C
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
+	select RAID6_PQ
+	select XOR_BLOCKS
+
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
@@ -50,3 +52,23 @@ config BTRFS_FS_CHECK_INTEGRITY
 	  In most cases, unless you are a btrfs developer who needs
 	  to verify the integrity of (super)-block write requests
 	  during the run of a regression test, say N
+
+config BTRFS_FS_RUN_SANITY_TESTS
+	bool "Btrfs will run sanity tests upon loading"
+	depends on BTRFS_FS
+	help
+	  This will run some basic sanity tests on the free space cache
+	  code to make sure it is acting as it should.  These are mostly
+	  regression tests and are only really interesting to btrfs devlopers.
+
+	  If unsure, say N.
+
+config BTRFS_DEBUG
+	bool "Btrfs debugging support"
+	depends on BTRFS_FS
+	help
+	  Enable run-time debugging support for the btrfs filesystem. This may
+	  enable additional and expensive checks with negative impact on
+	  performance, or export extra information via sysfs.
+
+	  If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7df3e0f0ee5..3932224f99e 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 04edf69be87..eaf133384a8 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -255,13 +255,11 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
  * to a logical address
  */
 static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
-					int search_commit_root,
-					u64 time_seq,
-					struct __prelim_ref *ref,
-					struct ulist *parents,
-					const u64 *extent_item_pos)
+				  struct btrfs_path *path, u64 time_seq,
+				  struct __prelim_ref *ref,
+				  struct ulist *parents,
+				  const u64 *extent_item_pos)
 {
-	struct btrfs_path *path;
 	struct btrfs_root *root;
 	struct btrfs_key root_key;
 	struct extent_buffer *eb;
@@ -269,11 +267,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 	int root_level;
 	int level = ref->level;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	path->search_commit_root = !!search_commit_root;
-
 	root_key.objectid = ref->root_id;
 	root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root_key.offset = (u64)-1;
@@ -314,7 +307,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 				time_seq, ref->wanted_disk_byte,
 				extent_item_pos);
 out:
-	btrfs_free_path(path);
+	path->lowest_level = 0;
+	btrfs_release_path(path);
 	return ret;
 }
 
@@ -322,7 +316,7 @@ out:
  * resolve all indirect backrefs from the list
  */
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
-				   int search_commit_root, u64 time_seq,
+				   struct btrfs_path *path, u64 time_seq,
 				   struct list_head *head,
 				   const u64 *extent_item_pos)
 {
@@ -349,14 +343,12 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			continue;
 		if (ref->count == 0)
 			continue;
-		err = __resolve_indirect_ref(fs_info, search_commit_root,
-					     time_seq, ref, parents,
-					     extent_item_pos);
-		if (err) {
-			if (ret == 0)
-				ret = err;
+		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
+					     parents, extent_item_pos);
+		if (err == -ENOMEM)
+			goto out;
+		if (err)
 			continue;
-		}
 
 		/* we put the first parent into the ref at hand */
 		ULIST_ITER_INIT(&uiter);
@@ -370,7 +362,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
 			if (!new_ref) {
 				ret = -ENOMEM;
-				break;
+				goto out;
 			}
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
@@ -380,7 +372,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		}
 		ulist_reinit(parents);
 	}
-
+out:
 	ulist_free(parents);
 	return ret;
 }
@@ -424,7 +416,10 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
 		BUG_ON(!ref->wanted_disk_byte);
 		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
 				     fs_info->tree_root->leafsize, 0);
-		BUG_ON(!eb);
+		if (!eb || !extent_buffer_uptodate(eb)) {
+			free_extent_buffer(eb);
+			return -EIO;
+		}
 		btrfs_tree_read_lock(eb);
 		if (btrfs_header_level(eb) == 0)
 			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
@@ -446,7 +441,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
  *           having a parent).
  * mode = 2: merge identical parents
  */
-static int __merge_refs(struct list_head *head, int mode)
+static void __merge_refs(struct list_head *head, int mode)
 {
 	struct list_head *pos1;
 
@@ -492,7 +487,6 @@ static int __merge_refs(struct list_head *head, int mode)
 		}
 
 	}
-	return 0;
 }
 
 /*
@@ -585,7 +579,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 		default:
 			WARN_ON(1);
 		}
-		BUG_ON(ret);
+		if (ret)
+			return ret;
 	}
 
 	return 0;
@@ -602,6 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 	int slot;
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
+	struct btrfs_key found_key;
 	unsigned long ptr;
 	unsigned long end;
 	struct btrfs_extent_item *ei;
@@ -619,17 +615,21 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 	flags = btrfs_extent_flags(leaf, ei);
+	btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 	ptr = (unsigned long)(ei + 1);
 	end = (unsigned long)ei + item_size;
 
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+	if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+	    flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		struct btrfs_tree_block_info *info;
 
 		info = (struct btrfs_tree_block_info *)ptr;
 		*info_level = btrfs_tree_block_level(leaf, info);
 		ptr += sizeof(struct btrfs_tree_block_info);
 		BUG_ON(ptr > end);
+	} else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+		*info_level = found_key.offset;
 	} else {
 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
 	}
@@ -683,7 +683,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 		default:
 			WARN_ON(1);
 		}
-		BUG_ON(ret);
+		if (ret)
+			return ret;
 		ptr += btrfs_extent_inline_ref_size(type);
 	}
 
@@ -765,7 +766,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 		default:
 			WARN_ON(1);
 		}
-		BUG_ON(ret);
+		if (ret)
+			return ret;
+
 	}
 
 	return ret;
@@ -790,7 +793,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_head *head;
 	int info_level = 0;
 	int ret;
-	int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
 	struct list_head prefs_delayed;
 	struct list_head prefs;
 	struct __prelim_ref *ref;
@@ -799,13 +801,17 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	INIT_LIST_HEAD(&prefs_delayed);
 
 	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
 	key.offset = (u64)-1;
+	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+		key.type = BTRFS_METADATA_ITEM_KEY;
+	else
+		key.type = BTRFS_EXTENT_ITEM_KEY;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->search_commit_root = !!search_commit_root;
+	if (!trans)
+		path->search_commit_root = 1;
 
 	/*
 	 * grab both a lock on the path and a lock on the delayed ref head.
@@ -820,7 +826,7 @@ again:
 		goto out;
 	BUG_ON(ret == 0);
 
-	if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) {
+	if (trans) {
 		/*
 		 * look if there are updates for this ref queued and lock the
 		 * head
@@ -864,7 +870,8 @@ again:
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.objectid == bytenr &&
-		    key.type == BTRFS_EXTENT_ITEM_KEY) {
+		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
+		     key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = __add_inline_refs(fs_info, path, bytenr,
 						&info_level, &prefs);
 			if (ret)
@@ -883,18 +890,14 @@ again:
 	if (ret)
 		goto out;
 
-	ret = __merge_refs(&prefs, 1);
-	if (ret)
-		goto out;
+	__merge_refs(&prefs, 1);
 
-	ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
-				      &prefs, extent_item_pos);
+	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+				      extent_item_pos);
 	if (ret)
 		goto out;
 
-	ret = __merge_refs(&prefs, 2);
-	if (ret)
-		goto out;
+	__merge_refs(&prefs, 2);
 
 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
@@ -903,7 +906,8 @@ again:
 		if (ref->count && ref->root_id && ref->parent == 0) {
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
-			BUG_ON(ret < 0);
+			if (ret < 0)
+				goto out;
 		}
 		if (ref->count && ref->parent) {
 			struct extent_inode_elem *eie = NULL;
@@ -914,7 +918,11 @@ again:
 							info_level);
 				eb = read_tree_block(fs_info->extent_root,
 							   ref->parent, bsz, 0);
-				BUG_ON(!eb);
+				if (!eb || !extent_buffer_uptodate(eb)) {
+					free_extent_buffer(eb);
+					ret = -EIO;
+					goto out;
+				}
 				ret = find_extent_in_eb(eb, bytenr,
 							*extent_item_pos, &eie);
 				ref->inode_list = eie;
@@ -923,6 +931,8 @@ again:
 			ret = ulist_add_merge(refs, ref->parent,
 					      (uintptr_t)ref->inode_list,
 					      (u64 *)&eie, GFP_NOFS);
+			if (ret < 0)
+				goto out;
 			if (!ret && extent_item_pos) {
 				/*
 				 * we've recorded that parent, so we must extend
@@ -933,7 +943,6 @@ again:
 					eie = eie->next;
 				eie->next = ref->inode_list;
 			}
-			BUG_ON(ret < 0);
 		}
 		kfree(ref);
 	}
@@ -1183,6 +1192,20 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 	return ret;
 }
 
+/*
+ * this iterates to turn a name (from iref/extref) into a full filesystem path.
+ * Elements of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
 char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			u32 name_len, unsigned long name_off,
 			struct extent_buffer *eb_in, u64 parent,
@@ -1252,32 +1275,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 }
 
 /*
- * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
- * of the path are separated by '/' and the path is guaranteed to be
- * 0-terminated. the path is only given within the current file system.
- * Therefore, it never starts with a '/'. the caller is responsible to provide
- * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
- * the start point of the resulting string is returned. this pointer is within
- * dest, normally.
- * in case the path buffer would overflow, the pointer is decremented further
- * as if output was written to the buffer, though no more output is actually
- * generated. that way, the caller can determine how much space would be
- * required for the path to fit into the buffer. in that case, the returned
- * value will be smaller than dest. callers must check this!
- */
-char *btrfs_iref_to_path(struct btrfs_root *fs_root,
-			 struct btrfs_path *path,
-			 struct btrfs_inode_ref *iref,
-			 struct extent_buffer *eb_in, u64 parent,
-			 char *dest, u32 size)
-{
-	return btrfs_ref_to_path(fs_root, path,
-				 btrfs_inode_ref_name_len(eb_in, iref),
-				 (unsigned long)(iref + 1),
-				 eb_in, parent, dest, size);
-}
-
-/*
  * this makes the path point to (logical EXTENT_ITEM *)
  * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
  * tree blocks and <0 on error.
@@ -1288,12 +1285,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 {
 	int ret;
 	u64 flags;
+	u64 size = 0;
 	u32 item_size;
 	struct extent_buffer *eb;
 	struct btrfs_extent_item *ei;
 	struct btrfs_key key;
 
-	key.type = BTRFS_EXTENT_ITEM_KEY;
+	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+		key.type = BTRFS_METADATA_ITEM_KEY;
+	else
+		key.type = BTRFS_EXTENT_ITEM_KEY;
 	key.objectid = logical;
 	key.offset = (u64)-1;
 
@@ -1306,9 +1307,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 		return ret;
 
 	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
-	if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
+	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
+		size = fs_info->extent_root->leafsize;
+	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
+		size = found_key->offset;
+
+	if ((found_key->type != BTRFS_EXTENT_ITEM_KEY &&
+	     found_key->type != BTRFS_METADATA_ITEM_KEY) ||
 	    found_key->objectid > logical ||
-	    found_key->objectid + found_key->offset <= logical) {
+	    found_key->objectid + size <= logical) {
 		pr_debug("logical %llu is not within any extent\n",
 			 (unsigned long long)logical);
 		return -ENOENT;
@@ -1464,9 +1471,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 				iterate_extent_inodes_t *iterate, void *ctx)
 {
 	int ret;
-	struct list_head data_refs = LIST_HEAD_INIT(data_refs);
-	struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
-	struct btrfs_trans_handle *trans;
+	struct btrfs_trans_handle *trans = NULL;
 	struct ulist *refs = NULL;
 	struct ulist *roots = NULL;
 	struct ulist_node *ref_node = NULL;
@@ -1478,9 +1483,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	pr_debug("resolving all inodes for extent %llu\n",
 			extent_item_objectid);
 
-	if (search_commit_root) {
-		trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
-	} else {
+	if (!search_commit_root) {
 		trans = btrfs_join_transaction(fs_info->extent_root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
@@ -1511,11 +1514,9 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 						iterate, ctx);
 		}
 		ulist_free(roots);
-		roots = NULL;
 	}
 
 	free_leaf_list(refs);
-	ulist_free(roots);
 out:
 	if (!search_commit_root) {
 		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index d61feca7945..8f2e7670293 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -19,12 +19,10 @@
 #ifndef __BTRFS_BACKREF__
 #define __BTRFS_BACKREF__
 
-#include "ioctl.h"
+#include <linux/btrfs.h>
 #include "ulist.h"
 #include "extent_io.h"
 
-#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
-
 struct inode_fs_paths {
 	struct btrfs_path		*btrfs_path;
 	struct btrfs_root		*fs_root;
@@ -59,9 +57,6 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 				struct btrfs_fs_info *fs_info, u64 bytenr,
 				u64 time_seq, struct ulist **roots);
-char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
-			 struct btrfs_inode_ref *iref, struct extent_buffer *eb,
-			 u64 parent, char *dest, u32 size);
 char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			u32 name_len, unsigned long name_off,
 			struct extent_buffer *eb_in, u64 parent,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2a8c242bc4f..08b286b2a2c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -40,6 +40,8 @@
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
 #define BTRFS_INODE_NEEDS_FULL_SYNC		7
 #define BTRFS_INODE_COPY_EVERYTHING		8
+#define BTRFS_INODE_IN_DELALLOC_LIST		9
+#define BTRFS_INODE_READDIO_NEED_LOCK		10
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -91,7 +93,7 @@ struct btrfs_inode {
 
 	unsigned long runtime_flags;
 
-	/* Keep track of who's O_SYNC/fsycing currently */
+	/* Keep track of who's O_SYNC/fsyncing currently */
 	atomic_t sync_writers;
 
 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
@@ -216,4 +218,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	return 0;
 }
 
+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+	set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
+	smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+	smp_mb__before_clear_bit();
+	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+		  &BTRFS_I(inode)->runtime_flags);
+}
+
 #endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 11d47bfb62b..1431a696501 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -813,8 +813,7 @@ static int btrfsic_process_superblock_dev_mirror(
 	    (bh->b_data + (dev_bytenr & 4095));
 
 	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
-	    strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
-		    sizeof(super_tmp->magic)) ||
+	    super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) ||
 	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
 	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
 	    btrfs_super_leafsize(super_tmp) != state->metablock_size ||
@@ -1701,7 +1700,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 		unsigned int j;
 		DECLARE_COMPLETION_ONSTACK(complete);
 
-		bio = bio_alloc(GFP_NOFS, num_pages - i);
+		bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
 		if (!bio) {
 			printk(KERN_INFO
 			       "btrfsic: bio_alloc() for %u pages failed!\n",
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 94ab2f80e7e..b189bd1e7a3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -82,6 +82,10 @@ struct compressed_bio {
 	u32 sums;
 };
 
+static int btrfs_decompress_biovec(int type, struct page **pages_in,
+				   u64 disk_start, struct bio_vec *bvec,
+				   int vcnt, size_t srclen);
+
 static inline int compressed_bio_size(struct btrfs_root *root,
 				      unsigned long disk_size)
 {
@@ -106,7 +110,6 @@ static int check_compressed_csum(struct inode *inode,
 				 u64 disk_start)
 {
 	int ret;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page *page;
 	unsigned long i;
 	char *kaddr;
@@ -121,7 +124,7 @@ static int check_compressed_csum(struct inode *inode,
 		csum = ~(u32)0;
 
 		kaddr = kmap_atomic(page);
-		csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
+		csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE);
 		btrfs_csum_final(csum, (char *)&csum);
 		kunmap_atomic(kaddr);
 
@@ -372,7 +375,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		page = compressed_pages[pg_index];
 		page->mapping = inode->i_mapping;
 		if (bio->bi_size)
-			ret = io_tree->ops->merge_bio_hook(page, 0,
+			ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
 							   PAGE_CACHE_SIZE,
 							   bio, 0);
 		else
@@ -655,7 +658,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		page->index = em_start >> PAGE_CACHE_SHIFT;
 
 		if (comp_bio->bi_size)
-			ret = tree->ops->merge_bio_hook(page, 0,
+			ret = tree->ops->merge_bio_hook(READ, page, 0,
 							PAGE_CACHE_SIZE,
 							comp_bio, 0);
 		else
@@ -739,7 +742,7 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
 static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
 static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
 
-struct btrfs_compress_op *btrfs_compress_op[] = {
+static struct btrfs_compress_op *btrfs_compress_op[] = {
 	&btrfs_zlib_compress,
 	&btrfs_lzo_compress,
 };
@@ -910,8 +913,9 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
  * be contiguous.  They all correspond to the range of bytes covered by
  * the compressed extent.
  */
-int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
-			    struct bio_vec *bvec, int vcnt, size_t srclen)
+static int btrfs_decompress_biovec(int type, struct page **pages_in,
+				   u64 disk_start, struct bio_vec *bvec,
+				   int vcnt, size_t srclen)
 {
 	struct list_head *workspace;
 	int ret;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 9afb0a62ae8..0c803b4fbf9 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -30,8 +30,6 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
 			 unsigned long *total_in,
 			 unsigned long *total_out,
 			 unsigned long max_out);
-int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
-			    struct bio_vec *bvec, int vcnt, size_t srclen);
 int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
 		     unsigned long start_byte, size_t srclen, size_t destlen);
 int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index eea5da7a2b9..5bf4c39e2ad 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -37,16 +37,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct extent_buffer *dst_buf,
 			      struct extent_buffer *src_buf);
-static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		    struct btrfs_path *path, int level, int slot);
+static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
+		    int level, int slot);
 static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
 				 struct extent_buffer *eb);
-struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
-					  u32 blocksize, u64 parent_transid,
-					  u64 time_seq);
-struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
-						u64 bytenr, u32 blocksize,
-						u64 time_seq);
+static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 
 struct btrfs_path *btrfs_alloc_path(void)
 {
@@ -208,7 +203,7 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
  * tree until you end up with a lock on the root.  A locked buffer
  * is returned, with a reference held.
  */
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 {
 	struct extent_buffer *eb;
 
@@ -361,6 +356,44 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
 }
 
 /*
+ * Increment the upper half of tree_mod_seq, set lower half zero.
+ *
+ * Must be called with fs_info->tree_mod_seq_lock held.
+ */
+static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
+{
+	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
+	seq &= 0xffffffff00000000ull;
+	seq += 1ull << 32;
+	atomic64_set(&fs_info->tree_mod_seq, seq);
+	return seq;
+}
+
+/*
+ * Increment the lower half of tree_mod_seq.
+ *
+ * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
+ * are generated should not technically require a spin lock here. (Rationale:
+ * incrementing the minor while incrementing the major seq number is between its
+ * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
+ * just returns a unique sequence number as usual.) We have decided to leave
+ * that requirement in here and rethink it once we notice it really imposes a
+ * problem on some workload.
+ */
+static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
+{
+	return atomic64_inc_return(&fs_info->tree_mod_seq);
+}
+
+/*
+ * return the last minor in the previous major tree_mod_seq number
+ */
+u64 btrfs_tree_mod_seq_prev(u64 seq)
+{
+	return (seq & 0xffffffff00000000ull) - 1ull;
+}
+
+/*
  * This adds a new blocker to the tree mod log's blocker list if the @elem
  * passed does not already have a sequence number set. So when a caller expects
  * to record tree modifications, it should ensure to set elem->seq to zero
@@ -376,10 +409,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 	tree_mod_log_write_lock(fs_info);
 	spin_lock(&fs_info->tree_mod_seq_lock);
 	if (!elem->seq) {
-		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+		elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
 		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
 	}
-	seq = btrfs_inc_tree_mod_seq(fs_info);
+	seq = btrfs_inc_tree_mod_seq_minor(fs_info);
 	spin_unlock(&fs_info->tree_mod_seq_lock);
 	tree_mod_log_write_unlock(fs_info);
 
@@ -524,7 +557,10 @@ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
 	if (!tm)
 		return -ENOMEM;
 
-	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+
 	return tm->seq;
 }
 
@@ -643,7 +679,8 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 static noinline int
 tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 			 struct extent_buffer *old_root,
-			 struct extent_buffer *new_root, gfp_t flags)
+			 struct extent_buffer *new_root, gfp_t flags,
+			 int log_removal)
 {
 	struct tree_mod_elem *tm;
 	int ret;
@@ -651,6 +688,9 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 	if (tree_mod_dont_log(fs_info, NULL))
 		return 0;
 
+	if (log_removal)
+		__tree_mod_log_free_eb(fs_info, old_root);
+
 	ret = tree_mod_alloc(fs_info, flags, &tm);
 	if (ret < 0)
 		goto out;
@@ -751,8 +791,8 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 
 	for (i = 0; i < nr_items; i++) {
 		ret = tree_mod_log_insert_key_locked(fs_info, src,
-						     i + src_offset,
-						     MOD_LOG_KEY_REMOVE);
+						i + src_offset,
+						MOD_LOG_KEY_REMOVE);
 		BUG_ON(ret < 0);
 		ret = tree_mod_log_insert_key_locked(fs_info, dst,
 						     i + dst_offset,
@@ -798,11 +838,12 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 
 static noinline void
 tree_mod_log_set_root_pointer(struct btrfs_root *root,
-			      struct extent_buffer *new_root_node)
+			      struct extent_buffer *new_root_node,
+			      int log_removal)
 {
 	int ret;
 	ret = tree_mod_log_insert_root(root->fs_info, root->node,
-				       new_root_node, GFP_NOFS);
+				       new_root_node, GFP_NOFS, log_removal);
 	BUG_ON(ret < 0);
 }
 
@@ -863,7 +904,8 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 
 	if (btrfs_block_can_be_shared(root, buf)) {
 		ret = btrfs_lookup_extent_info(trans, root, buf->start,
-					       buf->len, &refs, &flags);
+					       btrfs_header_level(buf), 1,
+					       &refs, &flags);
 		if (ret)
 			return ret;
 		if (refs == 0) {
@@ -909,10 +951,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			BUG_ON(ret); /* -ENOMEM */
 		}
 		if (new_flags != 0) {
+			int level = btrfs_header_level(buf);
+
 			ret = btrfs_set_disk_extent_flags(trans, root,
 							  buf->start,
 							  buf->len,
-							  new_flags, 0);
+							  new_flags, level, 0);
 			if (ret)
 				return ret;
 		}
@@ -927,7 +971,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			ret = btrfs_dec_ref(trans, root, buf, 1, 1);
 			BUG_ON(ret); /* -ENOMEM */
 		}
-		tree_mod_log_free_eb(root->fs_info, buf);
 		clean_tree_block(trans, root, buf);
 		*last_ref = 1;
 	}
@@ -1025,7 +1068,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			parent_start = 0;
 
 		extent_buffer_get(cow);
-		tree_mod_log_set_root_pointer(root, cow);
+		tree_mod_log_set_root_pointer(root, cow, 1);
 		rcu_assign_pointer(root->node, cow);
 
 		btrfs_free_tree_block(trans, root, buf, parent_start,
@@ -1046,6 +1089,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		btrfs_set_node_ptr_generation(parent, parent_slot,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
+		if (last_ref)
+			tree_mod_log_free_eb(root->fs_info, buf);
 		btrfs_free_tree_block(trans, root, buf, parent_start,
 				      last_ref);
 	}
@@ -1063,11 +1108,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
  */
 static struct tree_mod_elem *
 __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
-			   struct btrfs_root *root, u64 time_seq)
+			   struct extent_buffer *eb_root, u64 time_seq)
 {
 	struct tree_mod_elem *tm;
 	struct tree_mod_elem *found = NULL;
-	u64 root_logical = root->node->start;
+	u64 root_logical = eb_root->start;
 	int looped = 0;
 
 	if (!time_seq)
@@ -1101,7 +1146,6 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
 
 		found = tm;
 		root_logical = tm->old_root.logical;
-		BUG_ON(root_logical == root->node->start);
 		looped = 1;
 	}
 
@@ -1118,8 +1162,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
  * time_seq).
  */
 static void
-__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
-		      struct tree_mod_elem *first_tm)
+__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+		      u64 time_seq, struct tree_mod_elem *first_tm)
 {
 	u32 n;
 	struct rb_node *next;
@@ -1129,6 +1173,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 	unsigned long p_size = sizeof(struct btrfs_key_ptr);
 
 	n = btrfs_header_nritems(eb);
+	tree_mod_log_read_lock(fs_info);
 	while (tm && tm->seq >= time_seq) {
 		/*
 		 * all the operations are recorded with the operator used for
@@ -1138,6 +1183,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 		switch (tm->op) {
 		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
 			BUG_ON(tm->slot < n);
+			/* Fallthrough */
 		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
 		case MOD_LOG_KEY_REMOVE:
 			btrfs_set_node_key(eb, &tm->key, tm->slot);
@@ -1182,9 +1228,17 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 		if (tm->index != first_tm->index)
 			break;
 	}
+	tree_mod_log_read_unlock(fs_info);
 	btrfs_set_header_nritems(eb, n);
 }
 
+/*
+ * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
+ * is returned. If rewind operations happen, a fresh buffer is returned. The
+ * returned buffer is always read-locked. If the returned buffer is not the
+ * input buffer, the lock on the input buffer is released and the input buffer
+ * is freed (its refcount is decremented).
+ */
 static struct extent_buffer *
 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 		    u64 time_seq)
@@ -1218,11 +1272,14 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 	}
 
 	extent_buffer_get(eb_rewin);
+	btrfs_tree_read_unlock(eb);
 	free_extent_buffer(eb);
 
-	__tree_mod_log_rewind(eb_rewin, time_seq, tm);
+	extent_buffer_get(eb_rewin);
+	btrfs_tree_read_lock(eb_rewin);
+	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
 	WARN_ON(btrfs_header_nritems(eb_rewin) >
-		BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root));
+		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
 
 	return eb_rewin;
 }
@@ -1238,33 +1295,35 @@ static inline struct extent_buffer *
 get_old_root(struct btrfs_root *root, u64 time_seq)
 {
 	struct tree_mod_elem *tm;
-	struct extent_buffer *eb;
+	struct extent_buffer *eb = NULL;
+	struct extent_buffer *eb_root;
 	struct extent_buffer *old;
 	struct tree_mod_root *old_root = NULL;
 	u64 old_generation = 0;
 	u64 logical;
 	u32 blocksize;
 
-	eb = btrfs_read_lock_root_node(root);
-	tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
+	eb_root = btrfs_read_lock_root_node(root);
+	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
 	if (!tm)
-		return root->node;
+		return eb_root;
 
 	if (tm->op == MOD_LOG_ROOT_REPLACE) {
 		old_root = &tm->old_root;
 		old_generation = tm->generation;
 		logical = old_root->logical;
 	} else {
-		logical = root->node->start;
+		logical = eb_root->start;
 	}
 
 	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
 	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
-		btrfs_tree_read_unlock(root->node);
-		free_extent_buffer(root->node);
+		btrfs_tree_read_unlock(eb_root);
+		free_extent_buffer(eb_root);
 		blocksize = btrfs_level_size(root, old_root->level);
 		old = read_tree_block(root, logical, blocksize, 0);
-		if (!old) {
+		if (!old || !extent_buffer_uptodate(old)) {
+			free_extent_buffer(old);
 			pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
 				logical);
 			WARN_ON(1);
@@ -1273,13 +1332,13 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 			free_extent_buffer(old);
 		}
 	} else if (old_root) {
-		btrfs_tree_read_unlock(root->node);
-		free_extent_buffer(root->node);
+		btrfs_tree_read_unlock(eb_root);
+		free_extent_buffer(eb_root);
 		eb = alloc_dummy_extent_buffer(logical, root->nodesize);
 	} else {
-		eb = btrfs_clone_extent_buffer(root->node);
-		btrfs_tree_read_unlock(root->node);
-		free_extent_buffer(root->node);
+		eb = btrfs_clone_extent_buffer(eb_root);
+		btrfs_tree_read_unlock(eb_root);
+		free_extent_buffer(eb_root);
 	}
 
 	if (!eb)
@@ -1289,12 +1348,12 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 	if (old_root) {
 		btrfs_set_header_bytenr(eb, eb->start);
 		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
-		btrfs_set_header_owner(eb, root->root_key.objectid);
+		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
 		btrfs_set_header_level(eb, old_root->level);
 		btrfs_set_header_generation(eb, old_generation);
 	}
 	if (tm)
-		__tree_mod_log_rewind(eb, time_seq, tm);
+		__tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
 	else
 		WARN_ON(btrfs_header_level(eb) != 0);
 	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
@@ -1306,15 +1365,15 @@ int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
 {
 	struct tree_mod_elem *tm;
 	int level;
+	struct extent_buffer *eb_root = btrfs_root_node(root);
 
-	tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
+	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
 	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
 		level = tm->old_root.level;
 	} else {
-		rcu_read_lock();
-		level = btrfs_header_level(root->node);
-		rcu_read_unlock();
+		level = btrfs_header_level(eb_root);
 	}
+	free_extent_buffer(eb_root);
 
 	return level;
 }
@@ -1441,7 +1500,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
  */
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, int cache_only, u64 *last_ret,
+		       int start_slot, u64 *last_ret,
 		       struct btrfs_key *progress)
 {
 	struct extent_buffer *cur;
@@ -1461,8 +1520,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	struct btrfs_disk_key disk_key;
 
 	parent_level = btrfs_header_level(parent);
-	if (cache_only && parent_level != 1)
-		return 0;
 
 	WARN_ON(trans->transaction != root->fs_info->running_transaction);
 	WARN_ON(trans->transid != root->fs_info->generation);
@@ -1508,15 +1565,13 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		else
 			uptodate = 0;
 		if (!cur || !uptodate) {
-			if (cache_only) {
-				free_extent_buffer(cur);
-				continue;
-			}
 			if (!cur) {
 				cur = read_tree_block(root, blocknr,
 							 blocksize, gen);
-				if (!cur)
+				if (!cur || !extent_buffer_uptodate(cur)) {
+					free_extent_buffer(cur);
 					return -EIO;
+				}
 			} else if (!uptodate) {
 				err = btrfs_read_buffer(cur, gen);
 				if (err) {
@@ -1681,6 +1736,8 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
 				   struct extent_buffer *parent, int slot)
 {
 	int level = btrfs_header_level(parent);
+	struct extent_buffer *eb;
+
 	if (slot < 0)
 		return NULL;
 	if (slot >= btrfs_header_nritems(parent))
@@ -1688,9 +1745,15 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
 
 	BUG_ON(level == 0);
 
-	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
-		       btrfs_level_size(root, level - 1),
-		       btrfs_node_ptr_generation(parent, slot));
+	eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
+			     btrfs_level_size(root, level - 1),
+			     btrfs_node_ptr_generation(parent, slot));
+	if (eb && !extent_buffer_uptodate(eb)) {
+		free_extent_buffer(eb);
+		eb = NULL;
+	}
+
+	return eb;
 }
 
 /*
@@ -1755,8 +1818,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			goto enospc;
 		}
 
-		tree_mod_log_free_eb(root->fs_info, root->node);
-		tree_mod_log_set_root_pointer(root, child);
+		tree_mod_log_set_root_pointer(root, child, 1);
 		rcu_assign_pointer(root->node, child);
 
 		add_root_to_dirty_list(root);
@@ -1820,7 +1882,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		if (btrfs_header_nritems(right) == 0) {
 			clean_tree_block(trans, root, right);
 			btrfs_tree_unlock(right);
-			del_ptr(trans, root, path, level + 1, pslot + 1);
+			del_ptr(root, path, level + 1, pslot + 1);
 			root_sub_used(root, right->len);
 			btrfs_free_tree_block(trans, root, right, 0, 1);
 			free_extent_buffer_stale(right);
@@ -1864,7 +1926,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	if (btrfs_header_nritems(mid) == 0) {
 		clean_tree_block(trans, root, mid);
 		btrfs_tree_unlock(mid);
-		del_ptr(trans, root, path, level + 1, pslot);
+		del_ptr(root, path, level + 1, pslot);
 		root_sub_used(root, mid->len);
 		btrfs_free_tree_block(trans, root, mid, 0, 1);
 		free_extent_buffer_stale(mid);
@@ -2119,12 +2181,8 @@ static void reada_for_search(struct btrfs_root *root,
 	}
 }
 
-/*
- * returns -EAGAIN if it had to drop the path, or zero if everything was in
- * cache
- */
-static noinline int reada_for_balance(struct btrfs_root *root,
-				      struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_root *root,
+				       struct btrfs_path *path, int level)
 {
 	int slot;
 	int nritems;
@@ -2133,12 +2191,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 	u64 gen;
 	u64 block1 = 0;
 	u64 block2 = 0;
-	int ret = 0;
 	int blocksize;
 
 	parent = path->nodes[level + 1];
 	if (!parent)
-		return 0;
+		return;
 
 	nritems = btrfs_header_nritems(parent);
 	slot = path->slots[level + 1];
@@ -2165,28 +2222,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 			block2 = 0;
 		free_extent_buffer(eb);
 	}
-	if (block1 || block2) {
-		ret = -EAGAIN;
-
-		/* release the whole path */
-		btrfs_release_path(path);
-
-		/* read the blocks */
-		if (block1)
-			readahead_tree_block(root, block1, blocksize, 0);
-		if (block2)
-			readahead_tree_block(root, block2, blocksize, 0);
 
-		if (block1) {
-			eb = read_tree_block(root, block1, blocksize, 0);
-			free_extent_buffer(eb);
-		}
-		if (block2) {
-			eb = read_tree_block(root, block2, blocksize, 0);
-			free_extent_buffer(eb);
-		}
-	}
-	return ret;
+	if (block1)
+		readahead_tree_block(root, block1, blocksize, 0);
+	if (block2)
+		readahead_tree_block(root, block2, blocksize, 0);
 }
 
 
@@ -2212,9 +2252,6 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 	int no_skips = 0;
 	struct extent_buffer *t;
 
-	if (path->really_keep_locks)
-		return;
-
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
 		if (!path->nodes[i])
 			break;
@@ -2262,7 +2299,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 {
 	int i;
 
-	if (path->keep_locks || path->really_keep_locks)
+	if (path->keep_locks)
 		return;
 
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -2303,35 +2340,28 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
 	if (tmp) {
 		/* first we do an atomic uptodate check */
-		if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
-			if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
-				/*
-				 * we found an up to date block without
-				 * sleeping, return
-				 * right away
-				 */
-				*eb_ret = tmp;
-				return 0;
-			}
-			/* the pages were up to date, but we failed
-			 * the generation number check.  Do a full
-			 * read for the generation number that is correct.
-			 * We must do this without dropping locks so
-			 * we can trust our generation number
-			 */
-			free_extent_buffer(tmp);
-			btrfs_set_path_blocking(p);
+		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+			*eb_ret = tmp;
+			return 0;
+		}
 
-			/* now we're allowed to do a blocking uptodate check */
-			tmp = read_tree_block(root, blocknr, blocksize, gen);
-			if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
-				*eb_ret = tmp;
-				return 0;
-			}
-			free_extent_buffer(tmp);
-			btrfs_release_path(p);
-			return -EIO;
+		/* the pages were up to date, but we failed
+		 * the generation number check.  Do a full
+		 * read for the generation number that is correct.
+		 * We must do this without dropping locks so
+		 * we can trust our generation number
+		 */
+		btrfs_set_path_blocking(p);
+
+		/* now we're allowed to do a blocking uptodate check */
+		ret = btrfs_read_buffer(tmp, gen);
+		if (!ret) {
+			*eb_ret = tmp;
+			return 0;
 		}
+		free_extent_buffer(tmp);
+		btrfs_release_path(p);
+		return -EIO;
 	}
 
 	/*
@@ -2392,11 +2422,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 			goto again;
 		}
 
-		sret = reada_for_balance(root, p, level);
-		if (sret)
-			goto again;
-
 		btrfs_set_path_blocking(p);
+		reada_for_balance(root, p, level);
 		sret = split_node(trans, root, p, level);
 		btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -2416,11 +2443,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 			goto again;
 		}
 
-		sret = reada_for_balance(root, p, level);
-		if (sret)
-			goto again;
-
 		btrfs_set_path_blocking(p);
+		reada_for_balance(root, p, level);
 		sret = balance_level(trans, root, p, level);
 		btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -2495,7 +2519,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!cow)
 		write_lock_level = -1;
 
-	if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
+	if (cow && (p->keep_locks || p->lowest_level))
 		write_lock_level = BTRFS_MAX_LEVEL;
 
 	min_write_lock_level = write_lock_level;
@@ -2797,15 +2821,9 @@ again:
 				btrfs_clear_path_blocking(p, b,
 							  BTRFS_READ_LOCK);
 			}
+			b = tree_mod_log_rewind(root->fs_info, b, time_seq);
 			p->locks[level] = BTRFS_READ_LOCK;
 			p->nodes[level] = b;
-			b = tree_mod_log_rewind(root->fs_info, b, time_seq);
-			if (b != p->nodes[level]) {
-				btrfs_tree_unlock_rw(p->nodes[level],
-						     p->locks[level]);
-				p->locks[level] = 0;
-				p->nodes[level] = b;
-			}
 		} else {
 			p->slots[level] = slot;
 			unlock_up(p, level, lowest_unlock, 0, NULL);
@@ -2904,8 +2922,7 @@ again:
  * higher levels
  *
  */
-static void fixup_low_keys(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct btrfs_path *path,
+static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
 			   struct btrfs_disk_key *key, int level)
 {
 	int i;
@@ -2930,8 +2947,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
  * This function isn't completely safe. It's the caller's responsibility
  * that the new key won't break the order
  */
-void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
 			     struct btrfs_key *new_key)
 {
 	struct btrfs_disk_key disk_key;
@@ -2953,7 +2969,7 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
 	btrfs_set_item_key(eb, &disk_key, slot);
 	btrfs_mark_buffer_dirty(eb);
 	if (slot == 0)
-		fixup_low_keys(trans, root, path, &disk_key, 1);
+		fixup_low_keys(root, path, &disk_key, 1);
 }
 
 /*
@@ -3146,7 +3162,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(c);
 
 	old = root->node;
-	tree_mod_log_set_root_pointer(root, c);
+	tree_mod_log_set_root_pointer(root, c, 0);
 	rcu_assign_pointer(root->node, c);
 
 	/* the super has an extra ref to root->node */
@@ -3227,7 +3243,16 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	c = path->nodes[level];
 	WARN_ON(btrfs_header_generation(c) != trans->transid);
 	if (c == root->node) {
-		/* trying to split the root, lets make a new one */
+		/*
+		 * trying to split the root, lets make a new one
+		 *
+		 * tree mod log: We don't log_removal old root in
+		 * insert_new_root, because that root buffer will be kept as a
+		 * normal node. We are going to log removal of half of the
+		 * elements below with tree_mod_log_eb_copy. We're holding a
+		 * tree lock on the buffer, which is why we cannot race with
+		 * other tree_mod_log users.
+		 */
 		ret = insert_new_root(trans, root, path, level + 1);
 		if (ret)
 			return ret;
@@ -3682,7 +3707,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 		clean_tree_block(trans, root, right);
 
 	btrfs_item_key(right, &disk_key, 0);
-	fixup_low_keys(trans, root, path, &disk_key, 1);
+	fixup_low_keys(root, path, &disk_key, 1);
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
@@ -3929,7 +3954,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 		return -EOVERFLOW;
 
 	/* first try to make some room by pushing left and right */
-	if (data_size) {
+	if (data_size && path->nodes[1]) {
 		wret = push_leaf_right(trans, root, path, data_size,
 				       data_size, 0, 0);
 		if (wret < 0)
@@ -4042,8 +4067,7 @@ again:
 			path->nodes[0] = right;
 			path->slots[0] = 0;
 			if (path->slots[1] == 0)
-				fixup_low_keys(trans, root, path,
-					       &disk_key, 1);
+				fixup_low_keys(root, path, &disk_key, 1);
 		}
 		btrfs_mark_buffer_dirty(right);
 		return ret;
@@ -4259,7 +4283,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
 		return ret;
 
 	path->slots[0]++;
-	setup_items_for_insert(trans, root, path, new_key, &item_size,
+	setup_items_for_insert(root, path, new_key, &item_size,
 			       item_size, item_size +
 			       sizeof(struct btrfs_item), 1);
 	leaf = path->nodes[0];
@@ -4276,9 +4300,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
  * off the end of the item or if we shift the item to chop bytes off
  * the front.
  */
-void btrfs_truncate_item(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root,
-			 struct btrfs_path *path,
+void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 			 u32 new_size, int from_end)
 {
 	int slot;
@@ -4362,7 +4384,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
 		btrfs_set_item_key(leaf, &disk_key, slot);
 		if (slot == 0)
-			fixup_low_keys(trans, root, path, &disk_key, 1);
+			fixup_low_keys(root, path, &disk_key, 1);
 	}
 
 	item = btrfs_item_nr(leaf, slot);
@@ -4376,10 +4398,9 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
 }
 
 /*
- * make the item pointed to by the path bigger, data_size is the new size.
+ * make the item pointed to by the path bigger, data_size is the added size.
  */
-void btrfs_extend_item(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
 		       u32 data_size)
 {
 	int slot;
@@ -4449,8 +4470,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
  * to save stack depth by doing the bulk of the work in a function
  * that doesn't call btrfs_search_slot
  */
-void setup_items_for_insert(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, struct btrfs_path *path,
+void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 			    struct btrfs_key *cpu_key, u32 *data_size,
 			    u32 total_data, u32 total_size, int nr)
 {
@@ -4526,7 +4546,7 @@ void setup_items_for_insert(struct btrfs_trans_handle *trans,
 
 	if (slot == 0) {
 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
-		fixup_low_keys(trans, root, path, &disk_key, 1);
+		fixup_low_keys(root, path, &disk_key, 1);
 	}
 	btrfs_unlock_up_safe(path, 1);
 	btrfs_mark_buffer_dirty(leaf);
@@ -4566,7 +4586,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 	slot = path->slots[0];
 	BUG_ON(slot < 0);
 
-	setup_items_for_insert(trans, root, path, cpu_key, data_size,
+	setup_items_for_insert(root, path, cpu_key, data_size,
 			       total_data, total_size, nr);
 	return 0;
 }
@@ -4604,8 +4624,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
  * the tree should have been previously balanced so the deletion does not
  * empty a node.
  */
-static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		    struct btrfs_path *path, int level, int slot)
+static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
+		    int level, int slot)
 {
 	struct extent_buffer *parent = path->nodes[level];
 	u32 nritems;
@@ -4637,7 +4657,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		struct btrfs_disk_key disk_key;
 
 		btrfs_node_key(parent, &disk_key, 0);
-		fixup_low_keys(trans, root, path, &disk_key, level + 1);
+		fixup_low_keys(root, path, &disk_key, level + 1);
 	}
 	btrfs_mark_buffer_dirty(parent);
 }
@@ -4658,7 +4678,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
 				    struct extent_buffer *leaf)
 {
 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
-	del_ptr(trans, root, path, 1, path->slots[1]);
+	del_ptr(root, path, 1, path->slots[1]);
 
 	/*
 	 * btrfs_free_extent is expensive, we want to make sure we
@@ -4739,7 +4759,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			struct btrfs_disk_key disk_key;
 
 			btrfs_item_key(leaf, &disk_key, 0);
-			fixup_low_keys(trans, root, path, &disk_key, 1);
+			fixup_low_keys(root, path, &disk_key, 1);
 		}
 
 		/* delete the leaf if it is mostly empty */
@@ -4825,8 +4845,8 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 
 /*
  * A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are either in cache or have a minimum
- * transaction id.  This is used by the btree defrag code, and tree logging
+ * for nodes or leaves that are have a minimum transaction id.
+ * This is used by the btree defrag code, and tree logging
  *
  * This does not cow, but it does stuff the starting key it finds back
  * into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -4847,7 +4867,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  */
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_key *max_key,
-			 struct btrfs_path *path, int cache_only,
+			 struct btrfs_path *path,
 			 u64 min_trans)
 {
 	struct extent_buffer *cur;
@@ -4887,15 +4907,12 @@ again:
 		if (sret && slot > 0)
 			slot--;
 		/*
-		 * check this node pointer against the cache_only and
-		 * min_trans parameters.  If it isn't in cache or is too
-		 * old, skip to the next one.
+		 * check this node pointer against the min_trans parameters.
+		 * If it is too old, old, skip to the next one.
 		 */
 		while (slot < nritems) {
 			u64 blockptr;
 			u64 gen;
-			struct extent_buffer *tmp;
-			struct btrfs_disk_key disk_key;
 
 			blockptr = btrfs_node_blockptr(cur, slot);
 			gen = btrfs_node_ptr_generation(cur, slot);
@@ -4903,27 +4920,7 @@ again:
 				slot++;
 				continue;
 			}
-			if (!cache_only)
-				break;
-
-			if (max_key) {
-				btrfs_node_key(cur, &disk_key, slot);
-				if (comp_keys(&disk_key, max_key) >= 0) {
-					ret = 1;
-					goto out;
-				}
-			}
-
-			tmp = btrfs_find_tree_block(root, blockptr,
-					    btrfs_level_size(root, level - 1));
-
-			if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
-				free_extent_buffer(tmp);
-				break;
-			}
-			if (tmp)
-				free_extent_buffer(tmp);
-			slot++;
+			break;
 		}
 find_next_key:
 		/*
@@ -4934,7 +4931,7 @@ find_next_key:
 			path->slots[level] = slot;
 			btrfs_set_path_blocking(path);
 			sret = btrfs_find_next_key(root, path, min_key, level,
-						  cache_only, min_trans);
+						  min_trans);
 			if (sret == 0) {
 				btrfs_release_path(path);
 				goto again;
@@ -5399,8 +5396,7 @@ out:
 /*
  * this is similar to btrfs_next_leaf, but does not try to preserve
  * and fixup the path.  It looks for and returns the next key in the
- * tree based on the current path and the cache_only and min_trans
- * parameters.
+ * tree based on the current path and the min_trans parameters.
  *
  * 0 is returned if another key is found, < 0 if there are any errors
  * and 1 is returned if there are no higher keys in the tree
@@ -5409,8 +5405,7 @@ out:
  * calling this function.
  */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-			struct btrfs_key *key, int level,
-			int cache_only, u64 min_trans)
+			struct btrfs_key *key, int level, u64 min_trans)
 {
 	int slot;
 	struct extent_buffer *c;
@@ -5461,22 +5456,8 @@ next:
 		if (level == 0)
 			btrfs_item_key_to_cpu(c, key, slot);
 		else {
-			u64 blockptr = btrfs_node_blockptr(c, slot);
 			u64 gen = btrfs_node_ptr_generation(c, slot);
 
-			if (cache_only) {
-				struct extent_buffer *cur;
-				cur = btrfs_find_tree_block(root, blockptr,
-					    btrfs_level_size(root, level - 1));
-				if (!cur ||
-				    btrfs_buffer_uptodate(cur, gen, 1) <= 0) {
-					slot++;
-					if (cur)
-						free_extent_buffer(cur);
-					goto next;
-				}
-				free_extent_buffer(cur);
-			}
 			if (gen < min_trans) {
 				slot++;
 				goto next;
@@ -5498,139 +5479,6 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	return btrfs_next_old_leaf(root, path, 0);
 }
 
-/* Release the path up to but not including the given level */
-static void btrfs_release_level(struct btrfs_path *path, int level)
-{
-	int i;
-
-	for (i = 0; i < level; i++) {
-		path->slots[i] = 0;
-		if (!path->nodes[i])
-			continue;
-		if (path->locks[i]) {
-			btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
-			path->locks[i] = 0;
-		}
-		free_extent_buffer(path->nodes[i]);
-		path->nodes[i] = NULL;
-	}
-}
-
-/*
- * This function assumes 2 things
- *
- * 1) You are using path->keep_locks
- * 2) You are not inserting items.
- *
- * If either of these are not true do not use this function. If you need a next
- * leaf with either of these not being true then this function can be easily
- * adapted to do that, but at the moment these are the limitations.
- */
-int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, struct btrfs_path *path,
-			  int del)
-{
-	struct extent_buffer *b;
-	struct btrfs_key key;
-	u32 nritems;
-	int level = 1;
-	int slot;
-	int ret = 1;
-	int write_lock_level = BTRFS_MAX_LEVEL;
-	int ins_len = del ? -1 : 0;
-
-	WARN_ON(!(path->keep_locks || path->really_keep_locks));
-
-	nritems = btrfs_header_nritems(path->nodes[0]);
-	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
-
-	while (path->nodes[level]) {
-		nritems = btrfs_header_nritems(path->nodes[level]);
-		if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
-search:
-			btrfs_release_path(path);
-			ret = btrfs_search_slot(trans, root, &key, path,
-						ins_len, 1);
-			if (ret < 0)
-				goto out;
-			level = 1;
-			continue;
-		}
-
-		if (path->slots[level] >= nritems - 1) {
-			level++;
-			continue;
-		}
-
-		btrfs_release_level(path, level);
-		break;
-	}
-
-	if (!path->nodes[level]) {
-		ret = 1;
-		goto out;
-	}
-
-	path->slots[level]++;
-	b = path->nodes[level];
-
-	while (b) {
-		level = btrfs_header_level(b);
-
-		if (!should_cow_block(trans, root, b))
-			goto cow_done;
-
-		btrfs_set_path_blocking(path);
-		ret = btrfs_cow_block(trans, root, b,
-				      path->nodes[level + 1],
-				      path->slots[level + 1], &b);
-		if (ret)
-			goto out;
-cow_done:
-		path->nodes[level] = b;
-		btrfs_clear_path_blocking(path, NULL, 0);
-		if (level != 0) {
-			ret = setup_nodes_for_search(trans, root, path, b,
-						     level, ins_len,
-						     &write_lock_level);
-			if (ret == -EAGAIN)
-				goto search;
-			if (ret)
-				goto out;
-
-			b = path->nodes[level];
-			slot = path->slots[level];
-
-			ret = read_block_for_search(trans, root, path,
-						    &b, level, slot, &key, 0);
-			if (ret == -EAGAIN)
-				goto search;
-			if (ret)
-				goto out;
-			level = btrfs_header_level(b);
-			if (!btrfs_try_tree_write_lock(b)) {
-				btrfs_set_path_blocking(path);
-				btrfs_tree_lock(b);
-				btrfs_clear_path_blocking(path, b,
-							  BTRFS_WRITE_LOCK);
-			}
-			path->locks[level] = BTRFS_WRITE_LOCK;
-			path->nodes[level] = b;
-			path->slots[level] = 0;
-		} else {
-			path->slots[level] = 0;
-			ret = 0;
-			break;
-		}
-	}
-
-out:
-	if (ret)
-		btrfs_release_path(path);
-
-	return ret;
-}
-
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
 			u64 time_seq)
 {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 547b7b05727..e795bf135e8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -31,10 +31,10 @@
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
 #include <linux/pagemap.h>
+#include <linux/btrfs.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
-#include "ioctl.h"
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
 
 #define BTRFS_MAX_MIRRORS 3
 
@@ -88,12 +88,12 @@ struct btrfs_ordered_sum;
 /* holds checksums of all the data extents */
 #define BTRFS_CSUM_TREE_OBJECTID 7ULL
 
-/* for storing balance parameters in the root tree */
-#define BTRFS_BALANCE_OBJECTID -4ULL
-
 /* holds quota configuration and tracking */
 #define BTRFS_QUOTA_TREE_OBJECTID 8ULL
 
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 /* ioprio of readahead is set to idle */
 #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
 
+#define BTRFS_DIRTY_METADATA_THRESH	(32 * 1024 * 1024)
+
 /*
  * The key defines the order in the tree, and so it also defines (optimal)
  * block layout.
@@ -336,7 +338,11 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 /*
  * File system states
  */
+#define BTRFS_FS_STATE_ERROR		0
+#define BTRFS_FS_STATE_REMOUNTING	1
+#define BTRFS_FS_STATE_TRANS_ABORTED	2
 
+/* Super block flags */
 /* Errors detected */
 #define BTRFS_SUPER_FLAG_ERROR		(1ULL << 2)
 
@@ -502,6 +508,8 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)
 
 #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)
+#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA	(1ULL << 8)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
@@ -511,7 +519,9 @@ struct btrfs_super_block {
 	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
 	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
 	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
-	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
+	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\
+	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -576,7 +586,6 @@ struct btrfs_path {
 	unsigned int skip_locking:1;
 	unsigned int leave_spinning:1;
 	unsigned int search_commit_root:1;
-	unsigned int really_keep_locks:1;
 };
 
 /*
@@ -952,8 +961,20 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
 #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5         (1ULL << 7)
+#define BTRFS_BLOCK_GROUP_RAID6         (1ULL << 8)
 #define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
-#define BTRFS_NR_RAID_TYPES		5
+
+enum btrfs_raid_types {
+	BTRFS_RAID_RAID10,
+	BTRFS_RAID_RAID1,
+	BTRFS_RAID_DUP,
+	BTRFS_RAID_RAID0,
+	BTRFS_RAID_SINGLE,
+	BTRFS_RAID_RAID5,
+	BTRFS_RAID_RAID6,
+	BTRFS_NR_RAID_TYPES
+};
 
 #define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \
 					 BTRFS_BLOCK_GROUP_SYSTEM |  \
@@ -961,6 +982,8 @@ struct btrfs_dev_replace_item {
 
 #define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \
 					 BTRFS_BLOCK_GROUP_RAID1 |   \
+					 BTRFS_BLOCK_GROUP_RAID5 |   \
+					 BTRFS_BLOCK_GROUP_RAID6 |   \
 					 BTRFS_BLOCK_GROUP_DUP |     \
 					 BTRFS_BLOCK_GROUP_RAID10)
 /*
@@ -998,9 +1021,9 @@ struct btrfs_block_group_item {
  */
 #define BTRFS_QGROUP_STATUS_FLAG_ON		(1ULL << 0)
 /*
- * SCANNING is set during the initialization phase
+ * RESCAN is set during the initialization phase
  */
-#define BTRFS_QGROUP_STATUS_FLAG_SCANNING	(1ULL << 1)
+#define BTRFS_QGROUP_STATUS_FLAG_RESCAN		(1ULL << 1)
 /*
  * Some qgroup entries are known to be out of date,
  * either because the configuration has changed in a way that
@@ -1029,7 +1052,7 @@ struct btrfs_qgroup_status_item {
 	 * only used during scanning to record the progress
 	 * of the scan. It contains a logical address
 	 */
-	__le64 scan;
+	__le64 rescan;
 } __attribute__ ((__packed__));
 
 struct btrfs_qgroup_info_item {
@@ -1079,6 +1102,18 @@ struct btrfs_space_info {
 				   account */
 
 	/*
+	 * bytes_pinned is kept in line with what is actually pinned, as in
+	 * we've called update_block_group and dropped the bytes_used counter
+	 * and increased the bytes_pinned counter.  However this means that
+	 * bytes_pinned does not reflect the bytes that will be pinned once the
+	 * delayed refs are flushed, so this counter is inc'ed everytime we call
+	 * btrfs_free_extent so it is a realtime count of what will be freed
+	 * once the transaction is committed.  It will be zero'ed everytime the
+	 * transaction commits.
+	 */
+	struct percpu_counter total_bytes_pinned;
+
+	/*
 	 * we bump reservation progress every time we decrement
 	 * bytes_reserved.  This way people waiting for reservations
 	 * know something good has happened and they can check
@@ -1185,6 +1220,10 @@ struct btrfs_block_group_cache {
 	u64 flags;
 	u64 sectorsize;
 	u64 cache_generation;
+
+	/* for raid56, this is a full stripe, without parity */
+	unsigned long full_stripe_len;
+
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -1225,6 +1264,28 @@ struct seq_list {
 	u64 seq;
 };
 
+enum btrfs_orphan_cleanup_state {
+	ORPHAN_CLEANUP_STARTED	= 1,
+	ORPHAN_CLEANUP_DONE	= 2,
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+	struct list_head hash_list;
+	wait_queue_head_t wait;
+	spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+	struct list_head stripe_cache;
+	spinlock_t cache_lock;
+	int cache_size;
+	struct btrfs_stripe_hash table[];
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
 /* fs_info */
 struct reloc_control;
 struct btrfs_device;
@@ -1250,6 +1311,7 @@ struct btrfs_fs_info {
 
 	/* block group cache stuff */
 	spinlock_t block_group_cache_lock;
+	u64 first_logical_byte;
 	struct rb_root block_group_cache_tree;
 
 	/* keep track of unallocated space */
@@ -1288,7 +1350,23 @@ struct btrfs_fs_info {
 	u64 last_trans_log_full_commit;
 	unsigned long mount_opt;
 	unsigned long compress_type:4;
+	/*
+	 * It is a suggestive number, the read side is safe even it gets a
+	 * wrong number because we will write out the data into a regular
+	 * extent. The write side(mount/remount) is under ->s_umount lock,
+	 * so it is also safe.
+	 */
 	u64 max_inline;
+	/*
+	 * Protected by ->chunk_mutex and sb->s_umount.
+	 *
+	 * The reason that we use two lock to protect it is because only
+	 * remount and mount operations can change it and these two operations
+	 * are under sb->s_umount, but the read side (chunk allocation) can not
+	 * acquire sb->s_umount or the deadlock would happen. So we use two
+	 * locks to protect it. On the write side, we must acquire two locks,
+	 * and on the read side, we just need acquire one of them.
+	 */
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
@@ -1296,6 +1374,17 @@ struct btrfs_fs_info {
 	wait_queue_head_t transaction_blocked_wait;
 	wait_queue_head_t async_submit_wait;
 
+	/*
+	 * Used to protect the incompat_flags, compat_flags, compat_ro_flags
+	 * when they are updated.
+	 *
+	 * Because we do not clear the flags for ever, so we needn't use
+	 * the lock on the read side.
+	 *
+	 * We also needn't use the lock when we mount the fs, because
+	 * there is no other task which will update the flag.
+	 */
+	spinlock_t super_lock;
 	struct btrfs_super_block *super_copy;
 	struct btrfs_super_block *super_for_commit;
 	struct block_device *__bdev;
@@ -1307,6 +1396,13 @@ struct btrfs_fs_info {
 	struct mutex cleaner_mutex;
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
+
+	/* this is used during read/modify/write to make sure
+	 * no two ios are trying to mod the same stripe at the same
+	 * time
+	 */
+	struct btrfs_stripe_hash_table *stripe_hash_table;
+
 	/*
 	 * this protects the ordered operations list only while we are
 	 * processing all of the entries on it.  This way we make
@@ -1338,7 +1434,7 @@ struct btrfs_fs_info {
 
 	/* this protects tree_mod_seq_list */
 	spinlock_t tree_mod_seq_lock;
-	atomic_t tree_mod_seq;
+	atomic64_t tree_mod_seq;
 	struct list_head tree_mod_seq_list;
 	struct seq_list tree_mod_seq_elem;
 
@@ -1353,31 +1449,22 @@ struct btrfs_fs_info {
 	atomic_t open_ioctl_trans;
 
 	/*
-	 * this is used by the balancing code to wait for all the pending
-	 * ordered extents
+	 * this is used to protect the following list -- ordered_roots.
 	 */
-	spinlock_t ordered_extent_lock;
+	spinlock_t ordered_root_lock;
 
 	/*
-	 * all of the data=ordered extents pending writeback
+	 * all fs/file tree roots in which there are data=ordered extents
+	 * pending writeback are added into this list.
+	 *
 	 * these can span multiple transactions and basically include
 	 * every dirty data page that isn't from nodatacow
 	 */
-	struct list_head ordered_extents;
+	struct list_head ordered_roots;
 
-	/*
-	 * all of the inodes that have delalloc bytes.  It is possible for
-	 * this list to be empty even when there is still dirty data=ordered
-	 * extents waiting to finish IO.
-	 */
-	struct list_head delalloc_inodes;
-
-	/*
-	 * special rename and truncate targets that must be on disk before
-	 * we're allowed to commit.  This is basically the ext3 style
-	 * data=ordered list.
-	 */
-	struct list_head ordered_operations;
+	spinlock_t delalloc_root_lock;
+	/* all fs/file tree roots that have delalloc inodes. */
+	struct list_head delalloc_roots;
 
 	/*
 	 * there is a pool of worker threads for checksumming during writes
@@ -1395,6 +1482,8 @@ struct btrfs_fs_info {
 	struct btrfs_workers flush_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_meta_workers;
+	struct btrfs_workers endio_raid56_workers;
+	struct btrfs_workers rmw_workers;
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers endio_freespace_worker;
@@ -1418,15 +1507,15 @@ struct btrfs_fs_info {
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
-	int enospc_unlink;
-	int trans_no_join;
 
 	u64 total_pinned;
 
-	/* protected by the delalloc lock, used to keep from writing
-	 * metadata until there is a nice batch
-	 */
-	u64 dirty_metadata_bytes;
+	/* used to keep from writing metadata until there is a nice batch */
+	struct percpu_counter dirty_metadata_bytes;
+	struct percpu_counter delalloc_bytes;
+	s32 dirty_metadata_batch;
+	s32 delalloc_batch;
+
 	struct list_head dirty_cowonly_roots;
 
 	struct btrfs_fs_devices *fs_devices;
@@ -1442,9 +1531,6 @@ struct btrfs_fs_info {
 
 	struct reloc_control *reloc_ctl;
 
-	spinlock_t delalloc_lock;
-	u64 delalloc_bytes;
-
 	/* data_alloc_cluster is only used in ssd mode */
 	struct btrfs_free_cluster data_alloc_cluster;
 
@@ -1456,6 +1542,8 @@ struct btrfs_fs_info {
 	struct rb_root defrag_inodes;
 	atomic_t defrag_running;
 
+	/* Used to protect avail_{data, metadata, system}_alloc_bits */
+	seqlock_t profiles_lock;
 	/*
 	 * these three are in extended format (availability of single
 	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1513,14 +1601,30 @@ struct btrfs_fs_info {
 	struct rb_root qgroup_tree;
 	spinlock_t qgroup_lock;
 
+	/*
+	 * used to avoid frequently calling ulist_alloc()/ulist_free()
+	 * when doing qgroup accounting, it must be protected by qgroup_lock.
+	 */
+	struct ulist *qgroup_ulist;
+
+	/* protect user change for quota operations */
+	struct mutex qgroup_ioctl_lock;
+
 	/* list of dirty qgroups to be written at next commit */
 	struct list_head dirty_qgroups;
 
 	/* used by btrfs_qgroup_record_ref for an efficient tree traversal */
 	u64 qgroup_seq;
 
+	/* qgroup rescan items */
+	struct mutex qgroup_rescan_lock; /* protects the progress item */
+	struct btrfs_key qgroup_rescan_progress;
+	struct btrfs_workers qgroup_rescan_workers;
+	struct completion qgroup_rescan_completion;
+	struct btrfs_work qgroup_rescan_work;
+
 	/* filesystem state */
-	u64 fs_state;
+	unsigned long fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
 
@@ -1623,6 +1727,9 @@ struct btrfs_root {
 
 	struct list_head root_list;
 
+	spinlock_t log_extents_lock[2];
+	struct list_head logged_list[2];
+
 	spinlock_t orphan_lock;
 	atomic_t orphan_inodes;
 	struct btrfs_block_rsv *orphan_block_rsv;
@@ -1647,6 +1754,31 @@ struct btrfs_root {
 	int force_cow;
 
 	spinlock_t root_item_lock;
+	atomic_t refs;
+
+	spinlock_t delalloc_lock;
+	/*
+	 * all of the inodes that have delalloc bytes.  It is possible for
+	 * this list to be empty even when there is still dirty data=ordered
+	 * extents waiting to finish IO.
+	 */
+	struct list_head delalloc_inodes;
+	struct list_head delalloc_root;
+	u64 nr_delalloc_inodes;
+	/*
+	 * this is used by the balancing code to wait for all the pending
+	 * ordered extents
+	 */
+	spinlock_t ordered_extent_lock;
+
+	/*
+	 * all of the data=ordered extents pending writeback
+	 * these can span multiple transactions and basically include
+	 * every dirty data page that isn't from nodatacow
+	 */
+	struct list_head ordered_extents;
+	struct list_head ordered_root;
+	u64 nr_ordered_extents;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -1737,6 +1869,12 @@ struct btrfs_ioctl_defrag_range_args {
  */
 #define BTRFS_EXTENT_ITEM_KEY	168
 
+/*
+ * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know
+ * the length, so we save the level in key->offset instead of the length.
+ */
+#define BTRFS_METADATA_ITEM_KEY	169
+
 #define BTRFS_TREE_BLOCK_REF_KEY	176
 
 #define BTRFS_EXTENT_DATA_REF_KEY	178
@@ -1832,6 +1970,7 @@ struct btrfs_ioctl_defrag_range_args {
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt)	((o) & BTRFS_MOUNT_##opt)
 #define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \
 					 BTRFS_MOUNT_##opt)
 /*
@@ -2694,8 +2833,10 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
 
 static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
 {
-	int t = btrfs_super_csum_type(s);
-	BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes));
+	u16 t = btrfs_super_csum_type(s);
+	/*
+	 * csum type is validated at mount time
+	 */
 	return btrfs_csum_sizes[t];
 }
 
@@ -2792,8 +2933,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
 		   version, 64);
 BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
 		   flags, 64);
-BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item,
-		   scan, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
+		   rescan, 64);
 
 /* btrfs_qgroup_info_item */
 BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
@@ -2927,18 +3068,21 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
 		num_items;
 }
 
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root);
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u64 *refs, u64 *flags);
+			     u64 offset, int metadata, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
 				    u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct btrfs_root *root,
+				 struct extent_buffer *eb);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset, u64 bytenr);
@@ -2946,8 +3090,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
 						 struct btrfs_fs_info *info,
 						 u64 bytenr);
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
-u64 btrfs_find_block_group(struct btrfs_root *root,
-			   u64 search_start, u64 search_hint, int owner);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root, u32 blocksize,
 					u64 parent, u64 root_objectid,
@@ -2957,10 +3099,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct extent_buffer *buf,
 			   u64 parent, int last_ref);
-struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
-					    struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize,
-					    int level);
 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     u64 root_objectid, u64 owner,
@@ -2973,7 +3111,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
 				  u64 num_bytes, u64 min_alloc_size,
 				  u64 empty_size, u64 hint_byte,
-				  struct btrfs_key *ins, u64 data);
+				  struct btrfs_key *ins, int is_data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2981,7 +3119,7 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 flags,
-				int is_data);
+				int level, int is_data);
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
@@ -3013,7 +3151,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root);
-u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 
@@ -3035,8 +3172,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 				  struct inode *inode);
 void btrfs_orphan_release_metadata(struct inode *inode);
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending);
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+				     struct btrfs_block_rsv *rsv,
+				     int nitems,
+				     u64 *qgroup_reserved);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+				      struct btrfs_block_rsv *rsv,
+				      u64 qgroup_reserved);
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
@@ -3057,6 +3199,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_rsv *dest, u64 num_bytes,
+			     int min_factor);
 void btrfs_block_rsv_release(struct btrfs_root *root,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes);
@@ -3085,17 +3230,16 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
 			int type);
-void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
 			     struct btrfs_key *new_key);
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 			struct btrfs_key *key, int lowest_level,
-			int cache_only, u64 min_trans);
+			u64 min_trans);
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_key *max_key,
-			 struct btrfs_path *path, int cache_only,
+			 struct btrfs_path *path,
 			 u64 min_trans);
 enum btrfs_compare_tree_result {
 	BTRFS_COMPARE_TREE_NEW,
@@ -3122,12 +3266,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct extent_buffer **cow_ret, u64 new_root_objectid);
 int btrfs_block_can_be_shared(struct btrfs_root *root,
 			      struct extent_buffer *buf);
-void btrfs_extend_item(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
 		       u32 data_size);
-void btrfs_truncate_item(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root,
-			 struct btrfs_path *path,
+void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 			 u32 new_size, int from_end);
 int btrfs_split_item(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root,
@@ -3148,7 +3289,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
 			       int find_higher, int return_any);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, int cache_only, u64 *last_ret,
+		       int start_slot, u64 *last_ret,
 		       struct btrfs_key *progress);
 void btrfs_release_path(struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
@@ -3167,8 +3308,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
 	return btrfs_del_items(trans, root, path, path->slots[0], 1);
 }
 
-void setup_items_for_insert(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, struct btrfs_path *path,
+void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 			    struct btrfs_key *cpu_key, u32 *data_size,
 			    u32 total_data, u32 total_size, int nr);
 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
@@ -3188,9 +3328,6 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
-int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, struct btrfs_path *path,
-			  int del);
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
 			u64 time_seq);
 static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3205,7 +3342,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
 {
 	return btrfs_next_old_item(root, p, 0);
 }
-int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
 int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
 				     struct btrfs_block_rsv *block_rsv,
@@ -3222,6 +3358,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 	smp_mb();
 	return fs_info->closing;
 }
+
+/*
+ * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
+ * anything except sleeping. This function is used to check the status of
+ * the fs.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
+{
+	return (root->fs_info->sb->s_flags & MS_RDONLY ||
+		btrfs_fs_closing(root->fs_info));
+}
+
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
 	kfree(fs_info->balance_ctl);
@@ -3242,10 +3390,7 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 			   struct seq_list *elem);
 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 			    struct seq_list *elem);
-static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
-{
-	return atomic_inc_return(&fs_info->tree_mod_seq);
-}
+u64 btrfs_tree_mod_seq_prev(u64 seq);
 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
 
 /* root-item.c */
@@ -3269,12 +3414,11 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct btrfs_key *key,
 				   struct btrfs_root_item *item);
-void btrfs_read_root_item(struct btrfs_root *root,
-			 struct extent_buffer *eb, int slot,
-			 struct btrfs_root_item *item);
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
-			 btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+void btrfs_read_root_item(struct extent_buffer *eb, int slot,
+			  struct btrfs_root_item *item);
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+		    struct btrfs_path *path, struct btrfs_root_item *root_item,
+		    struct btrfs_key *root_key);
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 void btrfs_set_root_node(struct btrfs_root_item *item,
 			 struct extent_buffer *node);
@@ -3304,9 +3448,6 @@ struct btrfs_dir_item *
 btrfs_search_dir_index_item(struct btrfs_root *root,
 			    struct btrfs_path *path, u64 dirid,
 			    const char *name, int name_len);
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
-			      struct btrfs_path *path,
-			      const char *name, int name_len);
 int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct btrfs_path *path,
@@ -3384,16 +3525,11 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
 			     u64 bytenr, int mod);
-u64 btrfs_file_extent_length(struct btrfs_path *path);
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio, u64 file_start, int contig);
-struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
-					  struct btrfs_root *root,
-					  struct btrfs_path *path,
-					  u64 bytenr, int cow);
 int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
@@ -3416,6 +3552,10 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
 					   size_t pg_offset, u64 start, u64 len,
 					   int create);
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+			      struct inode *inode, u64 offset, u64 *len,
+			      u64 *orig_start, u64 *orig_block_len,
+			      u64 *ram_bytes);
 
 /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
 #if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -3453,20 +3593,19 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+				    int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
-int btrfs_writepages(struct address_space *mapping,
-		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *new_root, u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio, unsigned long bio_flags);
-
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+			 size_t size, struct bio *bio,
+			 unsigned long bio_flags);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-int btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
@@ -3484,7 +3623,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, struct inode *inode);
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
-int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_cleanup(struct btrfs_root *root);
 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root);
@@ -3535,7 +3673,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
-void btrfs_drop_pages(struct page **pages, size_t num_pages);
 int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 		      struct page **pages, size_t num_pages,
 		      loff_t pos, size_t write_bytes,
@@ -3543,7 +3680,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root, int cache_only);
+			struct btrfs_root *root);
 
 /* sysfs.c */
 int btrfs_init_sysfs(void);
@@ -3558,14 +3695,31 @@ int btrfs_sync_fs(struct super_block *sb, int wait);
 
 #ifdef CONFIG_PRINTK
 __printf(2, 3)
-void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...);
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
 #else
 static inline __printf(2, 3)
-void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 {
 }
 #endif
 
+#define btrfs_emerg(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+#define btrfs_debug(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+
 __printf(5, 6)
 void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 		     unsigned int line, int errno, const char *fmt, ...);
@@ -3587,11 +3741,28 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
 	disk_super = fs_info->super_copy;
 	features = btrfs_super_incompat_flags(disk_super);
 	if (!(features & flag)) {
-		features |= flag;
-		btrfs_set_super_incompat_flags(disk_super, features);
+		spin_lock(&fs_info->super_lock);
+		features = btrfs_super_incompat_flags(disk_super);
+		if (!(features & flag)) {
+			features |= flag;
+			btrfs_set_super_incompat_flags(disk_super, features);
+			printk(KERN_INFO "btrfs: setting %llu feature flag\n",
+					 flag);
+		}
+		spin_unlock(&fs_info->super_lock);
 	}
 }
 
+#define btrfs_fs_incompat(fs_info, opt) \
+	__btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+{
+	struct btrfs_super_block *disk_super;
+	disk_super = fs_info->super_copy;
+	return !!(btrfs_super_incompat_flags(disk_super) & flag);
+}
+
 /*
  * Call btrfs_abort_transaction as early as possible when an error condition is
  * detected, that way the exact line number is reported.
@@ -3620,11 +3791,14 @@ __printf(5, 6)
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 		   unsigned int line, int errno, const char *fmt, ...);
 
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic().  Otherwise we BUG() here.
+ */
 #define btrfs_panic(fs_info, errno, fmt, args...)			\
 do {									\
-	struct btrfs_fs_info *_i = (fs_info);				\
-	__btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args);	\
-	BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR));	\
+	__btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args);	\
+	BUG();								\
 } while (0)
 
 /* acl.c */
@@ -3674,7 +3848,6 @@ void btrfs_scrub_continue_super(struct btrfs_root *root);
 int btrfs_scrub_cancel(struct btrfs_fs_info *info);
 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
 			   struct btrfs_device *dev);
-int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 			 struct btrfs_scrub_progress *progress);
 
@@ -3705,7 +3878,9 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
 		       struct btrfs_fs_info *fs_info);
 int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 			struct btrfs_fs_info *fs_info);
-int btrfs_quota_rescan(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
@@ -3745,4 +3920,11 @@ static inline int is_fstree(u64 rootid)
 		return 1;
 	return 0;
 }
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+	return signal_pending(current);
+}
+
+
 #endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 34836036f01..375510913fe 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -22,8 +22,9 @@
 #include "disk-io.h"
 #include "transaction.h"
 
-#define BTRFS_DELAYED_WRITEBACK		400
-#define BTRFS_DELAYED_BACKGROUND	100
+#define BTRFS_DELAYED_WRITEBACK		512
+#define BTRFS_DELAYED_BACKGROUND	128
+#define BTRFS_DELAYED_BATCH		16
 
 static struct kmem_cache *delayed_node_cache;
 
@@ -201,7 +202,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
 	spin_unlock(&root->lock);
 }
 
-struct btrfs_delayed_node *btrfs_first_delayed_node(
+static struct btrfs_delayed_node *btrfs_first_delayed_node(
 			struct btrfs_delayed_root *delayed_root)
 {
 	struct list_head *p;
@@ -220,7 +221,7 @@ out:
 	return node;
 }
 
-struct btrfs_delayed_node *btrfs_next_delayed_node(
+static struct btrfs_delayed_node *btrfs_next_delayed_node(
 						struct btrfs_delayed_node *node)
 {
 	struct btrfs_delayed_root *delayed_root;
@@ -281,7 +282,7 @@ static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
 	__btrfs_release_delayed_node(node, 0);
 }
 
-struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
+static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
 					struct btrfs_delayed_root *delayed_root)
 {
 	struct list_head *p;
@@ -307,7 +308,7 @@ static inline void btrfs_release_prepared_delayed_node(
 	__btrfs_release_delayed_node(node, 1);
 }
 
-struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
 {
 	struct btrfs_delayed_item *item;
 	item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
@@ -382,7 +383,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
 	return NULL;
 }
 
-struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
+static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
 					struct btrfs_delayed_node *delayed_node,
 					struct btrfs_key *key)
 {
@@ -393,45 +394,6 @@ struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
 	return item;
 }
 
-struct btrfs_delayed_item *__btrfs_lookup_delayed_deletion_item(
-					struct btrfs_delayed_node *delayed_node,
-					struct btrfs_key *key)
-{
-	struct btrfs_delayed_item *item;
-
-	item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key,
-					   NULL, NULL);
-	return item;
-}
-
-struct btrfs_delayed_item *__btrfs_search_delayed_insertion_item(
-					struct btrfs_delayed_node *delayed_node,
-					struct btrfs_key *key)
-{
-	struct btrfs_delayed_item *item, *next;
-
-	item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
-					   NULL, &next);
-	if (!item)
-		item = next;
-
-	return item;
-}
-
-struct btrfs_delayed_item *__btrfs_search_delayed_deletion_item(
-					struct btrfs_delayed_node *delayed_node,
-					struct btrfs_key *key)
-{
-	struct btrfs_delayed_item *item, *next;
-
-	item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key,
-					   NULL, &next);
-	if (!item)
-		item = next;
-
-	return item;
-}
-
 static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
 				    struct btrfs_delayed_item *ins,
 				    int action)
@@ -494,6 +456,15 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
 					BTRFS_DELAYED_DELETION_ITEM);
 }
 
+static void finish_one_item(struct btrfs_delayed_root *delayed_root)
+{
+	int seq = atomic_inc_return(&delayed_root->items_seq);
+	if ((atomic_dec_return(&delayed_root->items) <
+	    BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
+	    waitqueue_active(&delayed_root->wait))
+		wake_up(&delayed_root->wait);
+}
+
 static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
 {
 	struct rb_root *root;
@@ -512,10 +483,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
 
 	rb_erase(&delayed_item->rb_node, root);
 	delayed_item->delayed_node->count--;
-	if (atomic_dec_return(&delayed_root->items) <
-	    BTRFS_DELAYED_BACKGROUND &&
-	    waitqueue_active(&delayed_root->wait))
-		wake_up(&delayed_root->wait);
+
+	finish_one_item(delayed_root);
 }
 
 static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
@@ -527,7 +496,7 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
 	}
 }
 
-struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
+static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
 					struct btrfs_delayed_node *delayed_node)
 {
 	struct rb_node *p;
@@ -540,7 +509,7 @@ struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
 	return item;
 }
 
-struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
+static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
 					struct btrfs_delayed_node *delayed_node)
 {
 	struct rb_node *p;
@@ -553,7 +522,7 @@ struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
 	return item;
 }
 
-struct btrfs_delayed_item *__btrfs_next_delayed_item(
+static struct btrfs_delayed_item *__btrfs_next_delayed_item(
 						struct btrfs_delayed_item *item)
 {
 	struct rb_node *p;
@@ -566,20 +535,6 @@ struct btrfs_delayed_item *__btrfs_next_delayed_item(
 	return next;
 }
 
-static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
-						   u64 root_id)
-{
-	struct btrfs_key root_key;
-
-	if (root->objectid == root_id)
-		return root;
-
-	root_key.objectid = root_id;
-	root_key.type = BTRFS_ROOT_ITEM_KEY;
-	root_key.offset = (u64)-1;
-	return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
-}
-
 static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
 					       struct btrfs_root *root,
 					       struct btrfs_delayed_item *item)
@@ -758,10 +713,9 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
  * This helper will insert some continuous items into the same leaf according
  * to the free space of the leaf.
  */
-static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_path *path,
-				struct btrfs_delayed_item *item)
+static int btrfs_batch_insert_items(struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    struct btrfs_delayed_item *item)
 {
 	struct btrfs_delayed_item *curr, *next;
 	int free_space;
@@ -840,7 +794,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
 	btrfs_clear_path_blocking(path, NULL, 0);
 
 	/* insert the keys of the items */
-	setup_items_for_insert(trans, root, path, keys, data_size,
+	setup_items_for_insert(root, path, keys, data_size,
 			       total_data_size, total_size, nitems);
 
 	/* insert the dir index items */
@@ -875,7 +829,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 				     struct btrfs_delayed_item *delayed_item)
 {
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
 	char *ptr;
 	int ret;
 
@@ -886,7 +839,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 
 	leaf = path->nodes[0];
 
-	item = btrfs_item_nr(leaf, path->slots[0]);
 	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
 
 	write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
@@ -926,7 +878,7 @@ do_again:
 	if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
 		/* insert the continuous items into the same leaf */
 		path->slots[0]++;
-		btrfs_batch_insert_items(trans, root, path, curr);
+		btrfs_batch_insert_items(root, path, curr);
 	}
 	btrfs_release_delayed_item(prev);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
@@ -1058,39 +1010,29 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
 		delayed_node->count--;
 
 		delayed_root = delayed_node->root->fs_info->delayed_root;
-		if (atomic_dec_return(&delayed_root->items) <
-		    BTRFS_DELAYED_BACKGROUND &&
-		    waitqueue_active(&delayed_root->wait))
-			wake_up(&delayed_root->wait);
+		finish_one_item(delayed_root);
 	}
 }
 
-static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
-				      struct btrfs_root *root,
-				      struct btrfs_path *path,
-				      struct btrfs_delayed_node *node)
+static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct btrfs_delayed_node *node)
 {
 	struct btrfs_key key;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
 	int ret;
 
-	mutex_lock(&node->mutex);
-	if (!node->inode_dirty) {
-		mutex_unlock(&node->mutex);
-		return 0;
-	}
-
 	key.objectid = node->inode_id;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
+
 	ret = btrfs_lookup_inode(trans, root, path, &key, 1);
 	if (ret > 0) {
 		btrfs_release_path(path);
-		mutex_unlock(&node->mutex);
 		return -ENOENT;
 	} else if (ret < 0) {
-		mutex_unlock(&node->mutex);
 		return ret;
 	}
 
@@ -1105,11 +1047,47 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 
 	btrfs_delayed_inode_release_metadata(root, node);
 	btrfs_release_delayed_inode(node);
-	mutex_unlock(&node->mutex);
 
 	return 0;
 }
 
+static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     struct btrfs_path *path,
+					     struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	mutex_lock(&node->mutex);
+	if (!node->inode_dirty) {
+		mutex_unlock(&node->mutex);
+		return 0;
+	}
+
+	ret = __btrfs_update_delayed_inode(trans, root, path, node);
+	mutex_unlock(&node->mutex);
+	return ret;
+}
+
+static inline int
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				   struct btrfs_path *path,
+				   struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_delete_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+	return ret;
+}
+
 /*
  * Called when committing the transaction.
  * Returns 0 on success.
@@ -1119,7 +1097,6 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root, int nr)
 {
-	struct btrfs_root *curr_root = root;
 	struct btrfs_delayed_root *delayed_root;
 	struct btrfs_delayed_node *curr_node, *prev_node;
 	struct btrfs_path *path;
@@ -1142,15 +1119,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 
 	curr_node = btrfs_first_delayed_node(delayed_root);
 	while (curr_node && (!count || (count && nr--))) {
-		curr_root = curr_node->root;
-		ret = btrfs_insert_delayed_items(trans, path, curr_root,
-						 curr_node);
-		if (!ret)
-			ret = btrfs_delete_delayed_items(trans, path,
-						curr_root, curr_node);
-		if (!ret)
-			ret = btrfs_update_delayed_inode(trans, curr_root,
-						path, curr_node);
+		ret = __btrfs_commit_inode_delayed_items(trans, path,
+							 curr_node);
 		if (ret) {
 			btrfs_release_delayed_node(curr_node);
 			curr_node = NULL;
@@ -1183,51 +1153,93 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
 	return __btrfs_run_delayed_items(trans, root, nr);
 }
 
-static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-					      struct btrfs_delayed_node *node)
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				     struct inode *inode)
 {
+	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
 	struct btrfs_path *path;
 	struct btrfs_block_rsv *block_rsv;
 	int ret;
 
+	if (!delayed_node)
+		return 0;
+
+	mutex_lock(&delayed_node->mutex);
+	if (!delayed_node->count) {
+		mutex_unlock(&delayed_node->mutex);
+		btrfs_release_delayed_node(delayed_node);
+		return 0;
+	}
+	mutex_unlock(&delayed_node->mutex);
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
 
-	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
-	if (!ret)
-		ret = btrfs_delete_delayed_items(trans, path, node->root, node);
-	if (!ret)
-		ret = btrfs_update_delayed_inode(trans, node->root, path, node);
-	btrfs_free_path(path);
+	ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 
+	btrfs_release_delayed_node(delayed_node);
+	btrfs_free_path(path);
 	trans->block_rsv = block_rsv;
+
 	return ret;
 }
 
-int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-				     struct inode *inode)
+int btrfs_commit_inode_delayed_inode(struct inode *inode)
 {
+	struct btrfs_trans_handle *trans;
 	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+	struct btrfs_path *path;
+	struct btrfs_block_rsv *block_rsv;
 	int ret;
 
 	if (!delayed_node)
 		return 0;
 
 	mutex_lock(&delayed_node->mutex);
-	if (!delayed_node->count) {
+	if (!delayed_node->inode_dirty) {
 		mutex_unlock(&delayed_node->mutex);
 		btrfs_release_delayed_node(delayed_node);
 		return 0;
 	}
 	mutex_unlock(&delayed_node->mutex);
 
-	ret = __btrfs_commit_inode_delayed_items(trans, delayed_node);
+	trans = btrfs_join_transaction(delayed_node->root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto trans_out;
+	}
+	path->leave_spinning = 1;
+
+	block_rsv = trans->block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+	mutex_lock(&delayed_node->mutex);
+	if (delayed_node->inode_dirty)
+		ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
+						   path, delayed_node);
+	else
+		ret = 0;
+	mutex_unlock(&delayed_node->mutex);
+
+	btrfs_free_path(path);
+	trans->block_rsv = block_rsv;
+trans_out:
+	btrfs_end_transaction(trans, delayed_node->root);
+	btrfs_btree_balance_dirty(delayed_node->root);
+out:
 	btrfs_release_delayed_node(delayed_node);
+
 	return ret;
 }
 
@@ -1243,48 +1255,49 @@ void btrfs_remove_delayed_node(struct inode *inode)
 	btrfs_release_delayed_node(delayed_node);
 }
 
-struct btrfs_async_delayed_node {
-	struct btrfs_root *root;
-	struct btrfs_delayed_node *delayed_node;
+struct btrfs_async_delayed_work {
+	struct btrfs_delayed_root *delayed_root;
+	int nr;
 	struct btrfs_work work;
 };
 
-static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
+static void btrfs_async_run_delayed_root(struct btrfs_work *work)
 {
-	struct btrfs_async_delayed_node *async_node;
+	struct btrfs_async_delayed_work *async_work;
+	struct btrfs_delayed_root *delayed_root;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_path *path;
 	struct btrfs_delayed_node *delayed_node = NULL;
 	struct btrfs_root *root;
 	struct btrfs_block_rsv *block_rsv;
-	int need_requeue = 0;
-	int ret;
+	int total_done = 0;
 
-	async_node = container_of(work, struct btrfs_async_delayed_node, work);
+	async_work = container_of(work, struct btrfs_async_delayed_work, work);
+	delayed_root = async_work->delayed_root;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		goto out;
-	path->leave_spinning = 1;
 
-	delayed_node = async_node->delayed_node;
+again:
+	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2)
+		goto free_path;
+
+	delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+	if (!delayed_node)
+		goto free_path;
+
+	path->leave_spinning = 1;
 	root = delayed_node->root;
 
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
-		goto free_path;
+		goto release_path;
 
 	block_rsv = trans->block_rsv;
 	trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
-	ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
-	if (!ret)
-		ret = btrfs_delete_delayed_items(trans, path, root,
-						 delayed_node);
-
-	if (!ret)
-		btrfs_update_delayed_inode(trans, root, path, delayed_node);
-
+	__btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 	/*
 	 * Maybe new delayed items have been inserted, so we need requeue
 	 * the work. Besides that, we must dequeue the empty delayed nodes
@@ -1310,57 +1323,47 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 	 * Task1 will sleep until the transaction is commited.
 	 */
 	mutex_lock(&delayed_node->mutex);
-	if (delayed_node->count)
-		need_requeue = 1;
-	else
-		btrfs_dequeue_delayed_node(root->fs_info->delayed_root,
-					   delayed_node);
+	btrfs_dequeue_delayed_node(root->fs_info->delayed_root, delayed_node);
 	mutex_unlock(&delayed_node->mutex);
 
 	trans->block_rsv = block_rsv;
 	btrfs_end_transaction_dmeta(trans, root);
 	btrfs_btree_balance_dirty_nodelay(root);
+
+release_path:
+	btrfs_release_path(path);
+	total_done++;
+
+	btrfs_release_prepared_delayed_node(delayed_node);
+	if (async_work->nr == 0 || total_done < async_work->nr)
+		goto again;
+
 free_path:
 	btrfs_free_path(path);
 out:
-	if (need_requeue)
-		btrfs_requeue_work(&async_node->work);
-	else {
-		btrfs_release_prepared_delayed_node(delayed_node);
-		kfree(async_node);
-	}
+	wake_up(&delayed_root->wait);
+	kfree(async_work);
 }
 
+
 static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
-				     struct btrfs_root *root, int all)
+				     struct btrfs_root *root, int nr)
 {
-	struct btrfs_async_delayed_node *async_node;
-	struct btrfs_delayed_node *curr;
-	int count = 0;
+	struct btrfs_async_delayed_work *async_work;
 
-again:
-	curr = btrfs_first_prepared_delayed_node(delayed_root);
-	if (!curr)
+	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
 		return 0;
 
-	async_node = kmalloc(sizeof(*async_node), GFP_NOFS);
-	if (!async_node) {
-		btrfs_release_prepared_delayed_node(curr);
+	async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
+	if (!async_work)
 		return -ENOMEM;
-	}
-
-	async_node->root = root;
-	async_node->delayed_node = curr;
-
-	async_node->work.func = btrfs_async_run_delayed_node_done;
-	async_node->work.flags = 0;
-
-	btrfs_queue_worker(&root->fs_info->delayed_workers, &async_node->work);
-	count++;
 
-	if (all || count < 4)
-		goto again;
+	async_work->delayed_root = delayed_root;
+	async_work->work.func = btrfs_async_run_delayed_root;
+	async_work->work.flags = 0;
+	async_work->nr = nr;
 
+	btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work);
 	return 0;
 }
 
@@ -1371,30 +1374,55 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root)
 	WARN_ON(btrfs_first_delayed_node(delayed_root));
 }
 
+static int refs_newer(struct btrfs_delayed_root *delayed_root,
+		      int seq, int count)
+{
+	int val = atomic_read(&delayed_root->items_seq);
+
+	if (val < seq || val >= seq + count)
+		return 1;
+	return 0;
+}
+
 void btrfs_balance_delayed_items(struct btrfs_root *root)
 {
 	struct btrfs_delayed_root *delayed_root;
+	int seq;
 
 	delayed_root = btrfs_get_delayed_root(root);
 
 	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
 		return;
 
+	seq = atomic_read(&delayed_root->items_seq);
+
 	if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
 		int ret;
-		ret = btrfs_wq_run_delayed_node(delayed_root, root, 1);
+		DEFINE_WAIT(__wait);
+
+		ret = btrfs_wq_run_delayed_node(delayed_root, root, 0);
 		if (ret)
 			return;
 
-		wait_event_interruptible_timeout(
-				delayed_root->wait,
-				(atomic_read(&delayed_root->items) <
-				 BTRFS_DELAYED_BACKGROUND),
-				HZ);
-		return;
+		while (1) {
+			prepare_to_wait(&delayed_root->wait, &__wait,
+					TASK_INTERRUPTIBLE);
+
+			if (refs_newer(delayed_root, seq,
+				       BTRFS_DELAYED_BATCH) ||
+			    atomic_read(&delayed_root->items) <
+			    BTRFS_DELAYED_BACKGROUND) {
+				break;
+			}
+			if (!signal_pending(current))
+				schedule();
+			else
+				break;
+		}
+		finish_wait(&delayed_root->wait, &__wait);
 	}
 
-	btrfs_wq_run_delayed_node(delayed_root, root, 0);
+	btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH);
 }
 
 /* Will return 0 or -ENOMEM */
@@ -1639,8 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
  * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
  *
  */
-int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
-				    filldir_t filldir,
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
 				    struct list_head *ins_list)
 {
 	struct btrfs_dir_item *di;
@@ -1662,13 +1689,13 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
 	list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
 		list_del(&curr->readdir_list);
 
-		if (curr->key.offset < filp->f_pos) {
+		if (curr->key.offset < ctx->pos) {
 			if (atomic_dec_and_test(&curr->refs))
 				kfree(curr);
 			continue;
 		}
 
-		filp->f_pos = curr->key.offset;
+		ctx->pos = curr->key.offset;
 
 		di = (struct btrfs_dir_item *)curr->data;
 		name = (char *)(di + 1);
@@ -1677,7 +1704,7 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
 		d_type = btrfs_filetype_table[di->type];
 		btrfs_disk_key_to_cpu(&location, &di->location);
 
-		over = filldir(dirent, name, name_len, curr->key.offset,
+		over = !dir_emit(ctx, name, name_len,
 			       location.objectid, d_type);
 
 		if (atomic_dec_and_test(&curr->refs))
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 4f808e1baee..a4b38f934d1 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -43,6 +43,7 @@ struct btrfs_delayed_root {
 	 */
 	struct list_head prepare_list;
 	atomic_t items;		/* for delayed items */
+	atomic_t items_seq;	/* for delayed items */
 	int nodes;		/* for delayed nodes */
 	wait_queue_head_t wait;
 };
@@ -86,6 +87,7 @@ static inline void btrfs_init_delayed_root(
 				struct btrfs_delayed_root *delayed_root)
 {
 	atomic_set(&delayed_root->items, 0);
+	atomic_set(&delayed_root->items_seq, 0);
 	delayed_root->nodes = 0;
 	spin_lock_init(&delayed_root->lock);
 	init_waitqueue_head(&delayed_root->wait);
@@ -117,6 +119,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 /* Used for evicting the inode. */
 void btrfs_remove_delayed_node(struct inode *inode);
 void btrfs_kill_delayed_inode_items(struct inode *inode);
+int btrfs_commit_inode_delayed_inode(struct inode *inode);
 
 
 int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
@@ -136,8 +139,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
 			     struct list_head *del_list);
 int btrfs_should_delete_dir_index(struct list_head *del_list,
 				  u64 index);
-int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
-				    filldir_t filldir,
+int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
 				    struct list_head *ins_list);
 
 /* for init */
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ae941177339..c219463fb1f 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -23,6 +23,10 @@
 #include "delayed-ref.h"
 #include "transaction.h"
 
+struct kmem_cache *btrfs_delayed_ref_head_cachep;
+struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+struct kmem_cache *btrfs_delayed_data_ref_cachep;
+struct kmem_cache *btrfs_delayed_extent_op_cachep;
 /*
  * delayed back reference update tracking.  For subvolume trees
  * we queue up extent allocations and backref maintenance for
@@ -36,16 +40,19 @@
  * compare two delayed tree backrefs with same bytenr and type
  */
 static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
-			  struct btrfs_delayed_tree_ref *ref1)
+			  struct btrfs_delayed_tree_ref *ref1, int type)
 {
-	if (ref1->root < ref2->root)
-		return -1;
-	if (ref1->root > ref2->root)
-		return 1;
-	if (ref1->parent < ref2->parent)
-		return -1;
-	if (ref1->parent > ref2->parent)
-		return 1;
+	if (type == BTRFS_TREE_BLOCK_REF_KEY) {
+		if (ref1->root < ref2->root)
+			return -1;
+		if (ref1->root > ref2->root)
+			return 1;
+	} else {
+		if (ref1->parent < ref2->parent)
+			return -1;
+		if (ref1->parent > ref2->parent)
+			return 1;
+	}
 	return 0;
 }
 
@@ -109,7 +116,8 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
 	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
 	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
 		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
-				      btrfs_delayed_node_to_tree_ref(ref1));
+				      btrfs_delayed_node_to_tree_ref(ref1),
+				      ref1->type);
 	} else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
 		   ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
 		return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
@@ -353,8 +361,10 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 		elem = list_first_entry(&fs_info->tree_mod_seq_list,
 					struct seq_list, list);
 		if (seq >= elem->seq) {
-			pr_debug("holding back delayed_ref %llu, lowest is "
-				 "%llu (%p)\n", seq, elem->seq, delayed_refs);
+			pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n",
+				 (u32)(seq >> 32), (u32)seq,
+				 (u32)(elem->seq >> 32), (u32)elem->seq,
+				 delayed_refs);
 			ret = 1;
 		}
 	}
@@ -422,6 +432,14 @@ again:
 	return 1;
 }
 
+void btrfs_release_ref_cluster(struct list_head *cluster)
+{
+	struct list_head *pos, *q;
+
+	list_for_each_safe(pos, q, cluster)
+		list_del_init(pos);
+}
+
 /*
  * helper function to update an extent delayed ref in the
  * rbtree.  existing and update must both have the same
@@ -511,7 +529,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 					ref->extent_op->flags_to_set;
 				existing_ref->extent_op->update_flags = 1;
 			}
-			kfree(ref->extent_op);
+			btrfs_free_delayed_extent_op(ref->extent_op);
 		}
 	}
 	/*
@@ -592,7 +610,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
-		kfree(head_ref);
+		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
 	} else {
 		delayed_refs->num_heads++;
 		delayed_refs->num_heads_ready++;
@@ -653,7 +671,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
-		kfree(full_ref);
+		kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
 	} else {
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
@@ -714,7 +732,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
-		kfree(full_ref);
+		kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
 	} else {
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
@@ -738,13 +756,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	struct btrfs_delayed_ref_root *delayed_refs;
 
 	BUG_ON(extent_op && extent_op->is_data);
-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
+	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
 	if (!ref)
 		return -ENOMEM;
 
-	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
 	if (!head_ref) {
-		kfree(ref);
+		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
 		return -ENOMEM;
 	}
 
@@ -786,13 +804,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	struct btrfs_delayed_ref_root *delayed_refs;
 
 	BUG_ON(extent_op && !extent_op->is_data);
-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
+	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
 	if (!ref)
 		return -ENOMEM;
 
-	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
 	if (!head_ref) {
-		kfree(ref);
+		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
 		return -ENOMEM;
 	}
 
@@ -826,7 +844,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 
-	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
 	if (!head_ref)
 		return -ENOMEM;
 
@@ -860,3 +878,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
 		return btrfs_delayed_node_to_head(ref);
 	return NULL;
 }
+
+void btrfs_delayed_ref_exit(void)
+{
+	if (btrfs_delayed_ref_head_cachep)
+		kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
+	if (btrfs_delayed_tree_ref_cachep)
+		kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
+	if (btrfs_delayed_data_ref_cachep)
+		kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
+	if (btrfs_delayed_extent_op_cachep)
+		kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
+}
+
+int btrfs_delayed_ref_init(void)
+{
+	btrfs_delayed_ref_head_cachep = kmem_cache_create(
+				"btrfs_delayed_ref_head",
+				sizeof(struct btrfs_delayed_ref_head), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_delayed_ref_head_cachep)
+		goto fail;
+
+	btrfs_delayed_tree_ref_cachep = kmem_cache_create(
+				"btrfs_delayed_tree_ref",
+				sizeof(struct btrfs_delayed_tree_ref), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_delayed_tree_ref_cachep)
+		goto fail;
+
+	btrfs_delayed_data_ref_cachep = kmem_cache_create(
+				"btrfs_delayed_data_ref",
+				sizeof(struct btrfs_delayed_data_ref), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_delayed_data_ref_cachep)
+		goto fail;
+
+	btrfs_delayed_extent_op_cachep = kmem_cache_create(
+				"btrfs_delayed_extent_op",
+				sizeof(struct btrfs_delayed_extent_op), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_delayed_extent_op_cachep)
+		goto fail;
+
+	return 0;
+fail:
+	btrfs_delayed_ref_exit();
+	return -ENOMEM;
+}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c9d703693df..70b962cc177 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -60,6 +60,7 @@ struct btrfs_delayed_ref_node {
 struct btrfs_delayed_extent_op {
 	struct btrfs_disk_key key;
 	u64 flags_to_set;
+	int level;
 	unsigned int update_key:1;
 	unsigned int update_flags:1;
 	unsigned int is_data:1;
@@ -132,6 +133,15 @@ struct btrfs_delayed_ref_root {
 	unsigned long num_heads_ready;
 
 	/*
+	 * bumped when someone is making progress on the delayed
+	 * refs, so that other procs know they are just adding to
+	 * contention intead of helping
+	 */
+	atomic_t procs_running_refs;
+	atomic_t ref_seq;
+	wait_queue_head_t wait;
+
+	/*
 	 * set when the tree is flushing before a transaction commit,
 	 * used by the throttling code to decide if new updates need
 	 * to be run right away
@@ -141,12 +151,47 @@ struct btrfs_delayed_ref_root {
 	u64 run_delayed_start;
 };
 
+extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
+extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
+
+int btrfs_delayed_ref_init(void);
+void btrfs_delayed_ref_exit(void);
+
+static inline struct btrfs_delayed_extent_op *
+btrfs_alloc_delayed_extent_op(void)
+{
+	return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
+}
+
+static inline void
+btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
+{
+	if (op)
+		kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
+}
+
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
 {
 	WARN_ON(atomic_read(&ref->refs) == 0);
 	if (atomic_dec_and_test(&ref->refs)) {
 		WARN_ON(ref->in_tree);
-		kfree(ref);
+		switch (ref->type) {
+		case BTRFS_TREE_BLOCK_REF_KEY:
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY:
+		case BTRFS_SHARED_DATA_REF_KEY:
+			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+			break;
+		case 0:
+			kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
+			break;
+		default:
+			BUG();
+		}
 	}
 }
 
@@ -176,8 +221,14 @@ struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
 			   struct btrfs_delayed_ref_head *head);
+static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
+{
+	mutex_unlock(&head->mutex);
+}
+
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 			   struct list_head *cluster, u64 search_start);
+void btrfs_release_ref_cluster(struct list_head *cluster);
 
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 			    struct btrfs_delayed_ref_root *delayed_refs,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 66dbc8dbddf..4253ad580e3 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -313,6 +313,11 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	struct btrfs_device *tgt_device = NULL;
 	struct btrfs_device *src_device = NULL;
 
+	if (btrfs_fs_incompat(fs_info, RAID56)) {
+		pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n");
+		return -EINVAL;
+	}
+
 	switch (args->start.cont_reading_from_srcdev_mode) {
 	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
 	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
@@ -395,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 	btrfs_dev_replace_unlock(dev_replace);
 
-	btrfs_wait_ordered_extents(root, 0);
+	btrfs_wait_all_ordered_extents(root->fs_info, 0);
 
 	/* force writing the updated state information to disk */
 	trans = btrfs_start_transaction(root, 0);
@@ -465,8 +470,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	 * flush all outstanding I/O and inode extent mappings before the
 	 * copy operation is declared as being finished
 	 */
-	btrfs_start_delalloc_inodes(root, 0);
-	btrfs_wait_ordered_extents(root, 0);
+	ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+	if (ret) {
+		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+		return ret;
+	}
+	btrfs_wait_all_ordered_extents(root->fs_info, 0);
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 502c2158167..79e594e341c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,6 +21,10 @@
 #include "hash.h"
 #include "transaction.h"
 
+static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+			      struct btrfs_path *path,
+			      const char *name, int name_len);
+
 /*
  * insert a name into a directory, doing overflow properly if there is a hash
  * collision.  data_size indicates how big the item inserted should be.  On
@@ -49,7 +53,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 		di = btrfs_match_dir_item_name(root, path, name, name_len);
 		if (di)
 			return ERR_PTR(-EEXIST);
-		btrfs_extend_item(trans, root, path, data_size);
+		btrfs_extend_item(root, path, data_size);
 	} else if (ret < 0)
 		return ERR_PTR(ret);
 	WARN_ON(ret > 0);
@@ -379,7 +383,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
  * this walks through all the entries in a dir item and finds one
  * for a specific name.
  */
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 			      struct btrfs_path *path,
 			      const char *name, int name_len)
 {
@@ -442,8 +446,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 		start = btrfs_item_ptr_offset(leaf, path->slots[0]);
 		memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
 			item_len - (ptr + sub_item_len - start));
-		btrfs_truncate_item(trans, root, path,
-				    item_len - sub_item_len, 1);
+		btrfs_truncate_item(root, path, item_len - sub_item_len, 1);
 	}
 	return ret;
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a8f652dc940..6b092a1c4e3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -30,6 +30,7 @@
 #include <linux/slab.h>
 #include <linux/migrate.h>
 #include <linux/ratelimit.h>
+#include <linux/uuid.h>
 #include <asm/unaligned.h>
 #include "compat.h"
 #include "ctree.h"
@@ -46,6 +47,7 @@
 #include "check-integrity.h"
 #include "rcu-string.h"
 #include "dev-replace.h"
+#include "raid56.h"
 
 #ifdef CONFIG_X86
 #include <asm/cpufeature.h>
@@ -56,17 +58,20 @@ static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 				    int read_only);
-static void btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+					     struct btrfs_root *root);
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 				      struct btrfs_root *root);
-static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t);
 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
 static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 					struct extent_io_tree *dirty_pages,
 					int mark);
 static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
 				       struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);
+static void btrfs_error_commit_super(struct btrfs_root *root);
 
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
@@ -147,7 +152,7 @@ static struct btrfs_lockdep_keyset {
 	{ .id = BTRFS_DEV_TREE_OBJECTID,	.name_stem = "dev"	},
 	{ .id = BTRFS_FS_TREE_OBJECTID,		.name_stem = "fs"	},
 	{ .id = BTRFS_CSUM_TREE_OBJECTID,	.name_stem = "csum"	},
-	{ .id = BTRFS_ORPHAN_OBJECTID,		.name_stem = "orphan"	},
+	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	.name_stem = "quota"	},
 	{ .id = BTRFS_TREE_LOG_OBJECTID,	.name_stem = "log"	},
 	{ .id = BTRFS_TREE_RELOC_OBJECTID,	.name_stem = "treloc"	},
 	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	.name_stem = "dreloc"	},
@@ -220,7 +225,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em);
+	ret = add_extent_mapping(em_tree, em, 0);
 	if (ret == -EEXIST) {
 		free_extent_map(em);
 		em = lookup_extent_mapping(em_tree, start, len);
@@ -236,7 +241,7 @@ out:
 	return em;
 }
 
-u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
+u32 btrfs_csum_data(char *data, u32 seed, size_t len)
 {
 	return crc32c(seed, data, len);
 }
@@ -272,7 +277,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 		if (err)
 			return 1;
 		cur_len = min(len, map_len - (offset - map_start));
-		crc = btrfs_csum_data(root, kaddr + offset - map_start,
+		crc = btrfs_csum_data(kaddr + offset - map_start,
 				      crc, cur_len);
 		len -= cur_len;
 		offset += cur_len;
@@ -352,6 +357,49 @@ out:
 }
 
 /*
+ * Return 0 if the superblock checksum type matches the checksum value of that
+ * algorithm. Pass the raw disk superblock data.
+ */
+static int btrfs_check_super_csum(char *raw_disk_sb)
+{
+	struct btrfs_super_block *disk_sb =
+		(struct btrfs_super_block *)raw_disk_sb;
+	u16 csum_type = btrfs_super_csum_type(disk_sb);
+	int ret = 0;
+
+	if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
+		u32 crc = ~(u32)0;
+		const int csum_size = sizeof(crc);
+		char result[csum_size];
+
+		/*
+		 * The super_block structure does not span the whole
+		 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
+		 * is filled with zeros and is included in the checkum.
+		 */
+		crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
+				crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+		btrfs_csum_final(crc, result);
+
+		if (memcmp(raw_disk_sb, result, csum_size))
+			ret = 1;
+
+		if (ret && btrfs_super_generation(disk_sb) < 10) {
+			printk(KERN_WARNING "btrfs: super block crcs don't match, older mkfs detected\n");
+			ret = 0;
+		}
+	}
+
+	if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
+		printk(KERN_ERR "btrfs: unsupported checksum algorithm %u\n",
+				csum_type);
+		ret = 1;
+	}
+
+	return ret;
+}
+
+/*
  * helper to read a given tree block, doing retries as required when
  * the checksums don't match and we have alternate mirrors to try.
  */
@@ -420,7 +468,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
 	struct extent_io_tree *tree;
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 start = page_offset(page);
 	u64 found_start;
 	struct extent_buffer *eb;
 
@@ -528,41 +576,6 @@ static noinline int check_leaf(struct btrfs_root *root,
 	return 0;
 }
 
-struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree,
-				       struct page *page, int max_walk)
-{
-	struct extent_buffer *eb;
-	u64 start = page_offset(page);
-	u64 target = start;
-	u64 min_start;
-
-	if (start < max_walk)
-		min_start = 0;
-	else
-		min_start = start - max_walk;
-
-	while (start >= min_start) {
-		eb = find_extent_buffer(tree, start, 0);
-		if (eb) {
-			/*
-			 * we found an extent buffer and it contains our page
-			 * horray!
-			 */
-			if (eb->start <= target &&
-			    eb->start + eb->len > target)
-				return eb;
-
-			/* we found an extent buffer that wasn't for us */
-			free_extent_buffer(eb);
-			return NULL;
-		}
-		if (start == 0)
-			break;
-		start -= PAGE_CACHE_SIZE;
-	}
-	return NULL;
-}
-
 static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state, int mirror)
 {
@@ -611,6 +624,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		goto err;
 	}
 	found_level = btrfs_header_level(eb);
+	if (found_level >= BTRFS_MAX_LEVEL) {
+		btrfs_info(root->fs_info, "bad tree block level %d\n",
+			   (int)btrfs_header_level(eb));
+		ret = -EIO;
+		goto err;
+	}
 
 	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
 				       eb, found_level);
@@ -634,13 +653,19 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	if (!ret)
 		set_extent_buffer_uptodate(eb);
 err:
-	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
-		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+	if (reads_done &&
+	    test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
 		btree_readahead_hook(root, eb, eb->start, ret);
-	}
 
-	if (ret)
+	if (ret) {
+		/*
+		 * our io error hook is going to dec the io pages
+		 * again, we have to make sure it has something
+		 * to decrement
+		 */
+		atomic_inc(&eb->io_pages);
 		clear_extent_buffer_uptodate(eb);
+	}
 	free_extent_buffer(eb);
 out:
 	return ret;
@@ -654,6 +679,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
 	eb = (struct extent_buffer *)page->private;
 	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
 	eb->read_mirror = failed_mirror;
+	atomic_dec(&eb->io_pages);
 	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
 		btree_readahead_hook(root, eb, eb->start, -EIO);
 	return -EIO;	/* we fixed nothing */
@@ -670,17 +696,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	end_io_wq->work.flags = 0;
 
 	if (bio->bi_rw & REQ_WRITE) {
-		if (end_io_wq->metadata == 1)
+		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
 			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
 					   &end_io_wq->work);
-		else if (end_io_wq->metadata == 2)
+		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
 			btrfs_queue_worker(&fs_info->endio_freespace_worker,
 					   &end_io_wq->work);
+		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+			btrfs_queue_worker(&fs_info->endio_raid56_workers,
+					   &end_io_wq->work);
 		else
 			btrfs_queue_worker(&fs_info->endio_write_workers,
 					   &end_io_wq->work);
 	} else {
-		if (end_io_wq->metadata)
+		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+			btrfs_queue_worker(&fs_info->endio_raid56_workers,
+					   &end_io_wq->work);
+		else if (end_io_wq->metadata)
 			btrfs_queue_worker(&fs_info->endio_meta_workers,
 					   &end_io_wq->work);
 		else
@@ -695,6 +727,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
  * 0 - if data
  * 1 - if normal metadta
  * 2 - if writing to the free space cache area
+ * 3 - raid parity work
  */
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata)
@@ -946,18 +979,20 @@ static int btree_writepages(struct address_space *mapping,
 			    struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
+	struct btrfs_fs_info *fs_info;
+	int ret;
+
 	tree = &BTRFS_I(mapping->host)->io_tree;
 	if (wbc->sync_mode == WB_SYNC_NONE) {
-		struct btrfs_root *root = BTRFS_I(mapping->host)->root;
-		u64 num_dirty;
-		unsigned long thresh = 32 * 1024 * 1024;
 
 		if (wbc->for_kupdate)
 			return 0;
 
+		fs_info = BTRFS_I(mapping->host)->root->fs_info;
 		/* this is a bit racy, but that's ok */
-		num_dirty = root->fs_info->dirty_metadata_bytes;
-		if (num_dirty < thresh)
+		ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
+					     BTRFS_DIRTY_METADATA_THRESH);
+		if (ret < 0)
 			return 0;
 	}
 	return btree_write_cache_pages(mapping, wbc);
@@ -974,17 +1009,12 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 {
 	if (PageWriteback(page) || PageDirty(page))
 		return 0;
-	/*
-	 * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
-	 * slab allocation from alloc_extent_state down the callchain where
-	 * it'd hit a BUG_ON as those flags are not allowed.
-	 */
-	gfp_flags &= ~GFP_SLAB_BUG_MASK;
 
-	return try_release_extent_buffer(page, gfp_flags);
+	return try_release_extent_buffer(page);
 }
 
-static void btree_invalidatepage(struct page *page, unsigned long offset)
+static void btree_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1125,24 +1155,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		      struct extent_buffer *buf)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
 	if (btrfs_header_generation(buf) ==
-	    root->fs_info->running_transaction->transid) {
+	    fs_info->running_transaction->transid) {
 		btrfs_assert_tree_locked(buf);
 
 		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
-			spin_lock(&root->fs_info->delalloc_lock);
-			if (root->fs_info->dirty_metadata_bytes >= buf->len)
-				root->fs_info->dirty_metadata_bytes -= buf->len;
-			else {
-				spin_unlock(&root->fs_info->delalloc_lock);
-				btrfs_panic(root->fs_info, -EOVERFLOW,
-					  "Can't clear %lu bytes from "
-					  " dirty_mdatadata_bytes (%llu)",
-					  buf->len,
-					  root->fs_info->dirty_metadata_bytes);
-			}
-			spin_unlock(&root->fs_info->delalloc_lock);
-
+			__percpu_counter_add(&fs_info->dirty_metadata_bytes,
+					     -buf->len,
+					     fs_info->dirty_metadata_batch);
 			/* ugh, clear_extent_buffer_dirty needs to lock the page */
 			btrfs_set_lock_blocking(buf);
 			clear_extent_buffer_dirty(buf);
@@ -1170,6 +1192,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->objectid = objectid;
 	root->last_trans = 0;
 	root->highest_objectid = 0;
+	root->nr_delalloc_inodes = 0;
+	root->nr_ordered_extents = 0;
 	root->name = NULL;
 	root->inode_tree = RB_ROOT;
 	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
@@ -1178,9 +1202,19 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->root_list);
+	INIT_LIST_HEAD(&root->delalloc_inodes);
+	INIT_LIST_HEAD(&root->delalloc_root);
+	INIT_LIST_HEAD(&root->ordered_extents);
+	INIT_LIST_HEAD(&root->ordered_root);
+	INIT_LIST_HEAD(&root->logged_list[0]);
+	INIT_LIST_HEAD(&root->logged_list[1]);
 	spin_lock_init(&root->orphan_lock);
 	spin_lock_init(&root->inode_lock);
+	spin_lock_init(&root->delalloc_lock);
+	spin_lock_init(&root->ordered_extent_lock);
 	spin_lock_init(&root->accounting_lock);
+	spin_lock_init(&root->log_extents_lock[0]);
+	spin_lock_init(&root->log_extents_lock[1]);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
 	init_waitqueue_head(&root->log_writer_wait);
@@ -1191,6 +1225,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	atomic_set(&root->log_writers, 0);
 	atomic_set(&root->log_batch, 0);
 	atomic_set(&root->orphan_inodes, 0);
+	atomic_set(&root->refs, 1);
 	root->log_transid = 0;
 	root->last_log_commit = 0;
 	extent_io_tree_init(&root->dirty_log_pages,
@@ -1209,39 +1244,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	spin_lock_init(&root->root_item_lock);
 }
 
-static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
-					    struct btrfs_fs_info *fs_info,
-					    u64 objectid,
-					    struct btrfs_root *root)
-{
-	int ret;
-	u32 blocksize;
-	u64 generation;
-
-	__setup_root(tree_root->nodesize, tree_root->leafsize,
-		     tree_root->sectorsize, tree_root->stripesize,
-		     root, fs_info, objectid);
-	ret = btrfs_find_last_root(tree_root, objectid,
-				   &root->root_item, &root->root_key);
-	if (ret > 0)
-		return -ENOENT;
-	else if (ret < 0)
-		return ret;
-
-	generation = btrfs_root_generation(&root->root_item);
-	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
-	root->commit_root = NULL;
-	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-				     blocksize, generation);
-	if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) {
-		free_extent_buffer(root->node);
-		root->node = NULL;
-		return -EIO;
-	}
-	root->commit_root = btrfs_root_node(root);
-	return 0;
-}
-
 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
@@ -1260,6 +1262,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ret = 0;
 	u64 bytenr;
+	uuid_le uuid;
 
 	root = btrfs_alloc_root(fs_info);
 	if (!root)
@@ -1276,6 +1279,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 				      0, objectid, NULL, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		ret = PTR_ERR(leaf);
+		leaf = NULL;
 		goto fail;
 	}
 
@@ -1308,6 +1312,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 	btrfs_set_root_used(&root->root_item, leaf->len);
 	btrfs_set_root_last_snapshot(&root->root_item, 0);
 	btrfs_set_root_dirid(&root->root_item, 0);
+	uuid_le_gen(&uuid);
+	memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
 	root->root_item.drop_level = 0;
 
 	key.objectid = objectid;
@@ -1319,11 +1325,16 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 
 	btrfs_tree_unlock(leaf);
 
+	return root;
+
 fail:
-	if (ret)
-		return ERR_PTR(ret);
+	if (leaf) {
+		btrfs_tree_unlock(leaf);
+		free_extent_buffer(leaf);
+	}
+	kfree(root);
 
-	return root;
+	return ERR_PTR(ret);
 }
 
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
@@ -1417,63 +1428,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
-					       struct btrfs_key *location)
+struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+					struct btrfs_key *key)
 {
 	struct btrfs_root *root;
 	struct btrfs_fs_info *fs_info = tree_root->fs_info;
 	struct btrfs_path *path;
-	struct extent_buffer *l;
 	u64 generation;
 	u32 blocksize;
-	int ret = 0;
-	int slot;
+	int ret;
 
-	root = btrfs_alloc_root(fs_info);
-	if (!root)
+	path = btrfs_alloc_path();
+	if (!path)
 		return ERR_PTR(-ENOMEM);
-	if (location->offset == (u64)-1) {
-		ret = find_and_setup_root(tree_root, fs_info,
-					  location->objectid, root);
-		if (ret) {
-			kfree(root);
-			return ERR_PTR(ret);
-		}
-		goto out;
+
+	root = btrfs_alloc_root(fs_info);
+	if (!root) {
+		ret = -ENOMEM;
+		goto alloc_fail;
 	}
 
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
 		     tree_root->sectorsize, tree_root->stripesize,
-		     root, fs_info, location->objectid);
+		     root, fs_info, key->objectid);
 
-	path = btrfs_alloc_path();
-	if (!path) {
-		kfree(root);
-		return ERR_PTR(-ENOMEM);
-	}
-	ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
-	if (ret == 0) {
-		l = path->nodes[0];
-		slot = path->slots[0];
-		btrfs_read_root_item(tree_root, l, slot, &root->root_item);
-		memcpy(&root->root_key, location, sizeof(*location));
-	}
-	btrfs_free_path(path);
+	ret = btrfs_find_root(tree_root, key, path,
+			      &root->root_item, &root->root_key);
 	if (ret) {
-		kfree(root);
 		if (ret > 0)
 			ret = -ENOENT;
-		return ERR_PTR(ret);
+		goto find_fail;
 	}
 
 	generation = btrfs_root_generation(&root->root_item);
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
+	if (!root->node) {
+		ret = -ENOMEM;
+		goto find_fail;
+	} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+		ret = -EIO;
+		goto read_fail;
+	}
 	root->commit_root = btrfs_root_node(root);
-	BUG_ON(!root->node); /* -ENOMEM */
 out:
-	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+	btrfs_free_path(path);
+	return root;
+
+read_fail:
+	free_extent_buffer(root->node);
+find_fail:
+	kfree(root);
+alloc_fail:
+	root = ERR_PTR(ret);
+	goto out;
+}
+
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+				      struct btrfs_key *location)
+{
+	struct btrfs_root *root;
+
+	root = btrfs_read_tree_root(tree_root, location);
+	if (IS_ERR(root))
+		return root;
+
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 		root->ref_cows = 1;
 		btrfs_check_and_init_root_item(&root->root_item);
 	}
@@ -1481,6 +1502,66 @@ out:
 	return root;
 }
 
+int btrfs_init_fs_root(struct btrfs_root *root)
+{
+	int ret;
+
+	root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
+	root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
+					GFP_NOFS);
+	if (!root->free_ino_pinned || !root->free_ino_ctl) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	btrfs_init_free_ino_ctl(root);
+	mutex_init(&root->fs_commit_mutex);
+	spin_lock_init(&root->cache_lock);
+	init_waitqueue_head(&root->cache_wait);
+
+	ret = get_anon_bdev(&root->anon_dev);
+	if (ret)
+		goto fail;
+	return 0;
+fail:
+	kfree(root->free_ino_ctl);
+	kfree(root->free_ino_pinned);
+	return ret;
+}
+
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+					u64 root_id)
+{
+	struct btrfs_root *root;
+
+	spin_lock(&fs_info->fs_roots_radix_lock);
+	root = radix_tree_lookup(&fs_info->fs_roots_radix,
+				 (unsigned long)root_id);
+	spin_unlock(&fs_info->fs_roots_radix_lock);
+	return root;
+}
+
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+			 struct btrfs_root *root)
+{
+	int ret;
+
+	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+	if (ret)
+		return ret;
+
+	spin_lock(&fs_info->fs_roots_radix_lock);
+	ret = radix_tree_insert(&fs_info->fs_roots_radix,
+				(unsigned long)root->root_key.objectid,
+				root);
+	if (ret == 0)
+		root->in_radix = 1;
+	spin_unlock(&fs_info->fs_roots_radix_lock);
+	radix_tree_preload_end();
+
+	return ret;
+}
+
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 					      struct btrfs_key *location)
 {
@@ -1501,58 +1582,30 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 		return fs_info->quota_root ? fs_info->quota_root :
 					     ERR_PTR(-ENOENT);
 again:
-	spin_lock(&fs_info->fs_roots_radix_lock);
-	root = radix_tree_lookup(&fs_info->fs_roots_radix,
-				 (unsigned long)location->objectid);
-	spin_unlock(&fs_info->fs_roots_radix_lock);
+	root = btrfs_lookup_fs_root(fs_info, location->objectid);
 	if (root)
 		return root;
 
-	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
+	root = btrfs_read_fs_root(fs_info->tree_root, location);
 	if (IS_ERR(root))
 		return root;
 
-	root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
-	root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
-					GFP_NOFS);
-	if (!root->free_ino_pinned || !root->free_ino_ctl) {
-		ret = -ENOMEM;
+	if (btrfs_root_refs(&root->root_item) == 0) {
+		ret = -ENOENT;
 		goto fail;
 	}
 
-	btrfs_init_free_ino_ctl(root);
-	mutex_init(&root->fs_commit_mutex);
-	spin_lock_init(&root->cache_lock);
-	init_waitqueue_head(&root->cache_wait);
-
-	ret = get_anon_bdev(&root->anon_dev);
+	ret = btrfs_init_fs_root(root);
 	if (ret)
 		goto fail;
 
-	if (btrfs_root_refs(&root->root_item) == 0) {
-		ret = -ENOENT;
-		goto fail;
-	}
-
 	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
 	if (ret < 0)
 		goto fail;
 	if (ret == 0)
 		root->orphan_item_inserted = 1;
 
-	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
-	if (ret)
-		goto fail;
-
-	spin_lock(&fs_info->fs_roots_radix_lock);
-	ret = radix_tree_insert(&fs_info->fs_roots_radix,
-				(unsigned long)root->root_key.objectid,
-				root);
-	if (ret == 0)
-		root->in_radix = 1;
-
-	spin_unlock(&fs_info->fs_roots_radix_lock);
-	radix_tree_preload_end();
+	ret = btrfs_insert_fs_root(fs_info, root);
 	if (ret) {
 		if (ret == -EEXIST) {
 			free_fs_root(root);
@@ -1560,10 +1613,6 @@ again:
 		}
 		goto fail;
 	}
-
-	ret = btrfs_find_dead_roots(fs_info->tree_root,
-				    root->root_key.objectid);
-	WARN_ON(ret);
 	return root;
 fail:
 	free_fs_root(root);
@@ -1635,17 +1684,38 @@ static void end_workqueue_fn(struct btrfs_work *work)
 static int cleaner_kthread(void *arg)
 {
 	struct btrfs_root *root = arg;
+	int again;
 
 	do {
-		if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
-		    mutex_trylock(&root->fs_info->cleaner_mutex)) {
-			btrfs_run_delayed_iputs(root);
-			btrfs_clean_old_snapshots(root);
+		again = 0;
+
+		/* Make the cleaner go to sleep early. */
+		if (btrfs_need_cleaner_sleep(root))
+			goto sleep;
+
+		if (!mutex_trylock(&root->fs_info->cleaner_mutex))
+			goto sleep;
+
+		/*
+		 * Avoid the problem that we change the status of the fs
+		 * during the above check and trylock.
+		 */
+		if (btrfs_need_cleaner_sleep(root)) {
 			mutex_unlock(&root->fs_info->cleaner_mutex);
-			btrfs_run_defrag_inodes(root->fs_info);
+			goto sleep;
 		}
 
-		if (!try_to_freeze()) {
+		btrfs_run_delayed_iputs(root);
+		again = btrfs_clean_one_deleted_snapshot(root);
+		mutex_unlock(&root->fs_info->cleaner_mutex);
+
+		/*
+		 * The defragger has dealt with the R/O remount and umount,
+		 * needn't do anything special here.
+		 */
+		btrfs_run_defrag_inodes(root->fs_info);
+sleep:
+		if (!try_to_freeze() && !again) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (!kthread_should_stop())
 				schedule();
@@ -1678,7 +1748,7 @@ static int transaction_kthread(void *arg)
 		}
 
 		now = get_seconds();
-		if (!cur->blocked &&
+		if (cur->state < TRANS_STATE_BLOCKED &&
 		    (now < cur->start_time || now - cur->start_time < 30)) {
 			spin_unlock(&root->fs_info->trans_lock);
 			delay = HZ * 5;
@@ -1914,35 +1984,60 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
 	return 0;
 }
 
+/* helper to cleanup workers */
+static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
+{
+	btrfs_stop_workers(&fs_info->generic_worker);
+	btrfs_stop_workers(&fs_info->fixup_workers);
+	btrfs_stop_workers(&fs_info->delalloc_workers);
+	btrfs_stop_workers(&fs_info->workers);
+	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->endio_meta_workers);
+	btrfs_stop_workers(&fs_info->endio_raid56_workers);
+	btrfs_stop_workers(&fs_info->rmw_workers);
+	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
+	btrfs_stop_workers(&fs_info->endio_write_workers);
+	btrfs_stop_workers(&fs_info->endio_freespace_worker);
+	btrfs_stop_workers(&fs_info->submit_workers);
+	btrfs_stop_workers(&fs_info->delayed_workers);
+	btrfs_stop_workers(&fs_info->caching_workers);
+	btrfs_stop_workers(&fs_info->readahead_workers);
+	btrfs_stop_workers(&fs_info->flush_workers);
+	btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
+}
+
 /* helper to cleanup tree roots */
 static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 {
 	free_extent_buffer(info->tree_root->node);
 	free_extent_buffer(info->tree_root->commit_root);
-	free_extent_buffer(info->dev_root->node);
-	free_extent_buffer(info->dev_root->commit_root);
-	free_extent_buffer(info->extent_root->node);
-	free_extent_buffer(info->extent_root->commit_root);
-	free_extent_buffer(info->csum_root->node);
-	free_extent_buffer(info->csum_root->commit_root);
-	if (info->quota_root) {
-		free_extent_buffer(info->quota_root->node);
-		free_extent_buffer(info->quota_root->commit_root);
-	}
-
 	info->tree_root->node = NULL;
 	info->tree_root->commit_root = NULL;
-	info->dev_root->node = NULL;
-	info->dev_root->commit_root = NULL;
-	info->extent_root->node = NULL;
-	info->extent_root->commit_root = NULL;
-	info->csum_root->node = NULL;
-	info->csum_root->commit_root = NULL;
+
+	if (info->dev_root) {
+		free_extent_buffer(info->dev_root->node);
+		free_extent_buffer(info->dev_root->commit_root);
+		info->dev_root->node = NULL;
+		info->dev_root->commit_root = NULL;
+	}
+	if (info->extent_root) {
+		free_extent_buffer(info->extent_root->node);
+		free_extent_buffer(info->extent_root->commit_root);
+		info->extent_root->node = NULL;
+		info->extent_root->commit_root = NULL;
+	}
+	if (info->csum_root) {
+		free_extent_buffer(info->csum_root->node);
+		free_extent_buffer(info->csum_root->commit_root);
+		info->csum_root->node = NULL;
+		info->csum_root->commit_root = NULL;
+	}
 	if (info->quota_root) {
+		free_extent_buffer(info->quota_root->node);
+		free_extent_buffer(info->quota_root->commit_root);
 		info->quota_root->node = NULL;
 		info->quota_root->commit_root = NULL;
 	}
-
 	if (chunk_root) {
 		free_extent_buffer(info->chunk_root->node);
 		free_extent_buffer(info->chunk_root->commit_root);
@@ -1951,6 +2046,36 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 	}
 }
 
+static void del_fs_roots(struct btrfs_fs_info *fs_info)
+{
+	int ret;
+	struct btrfs_root *gang[8];
+	int i;
+
+	while (!list_empty(&fs_info->dead_roots)) {
+		gang[0] = list_entry(fs_info->dead_roots.next,
+				     struct btrfs_root, root_list);
+		list_del(&gang[0]->root_list);
+
+		if (gang[0]->in_radix) {
+			btrfs_drop_and_free_fs_root(fs_info, gang[0]);
+		} else {
+			free_extent_buffer(gang[0]->node);
+			free_extent_buffer(gang[0]->commit_root);
+			btrfs_put_fs_root(gang[0]);
+		}
+	}
+
+	while (1) {
+		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+					     (void **)gang, 0,
+					     ARRAY_SIZE(gang));
+		if (!ret)
+			break;
+		for (i = 0; i < ret; i++)
+			btrfs_drop_and_free_fs_root(fs_info, gang[i]);
+	}
+}
 
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
@@ -1980,14 +2105,8 @@ int open_ctree(struct super_block *sb,
 	int backup_index = 0;
 
 	tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
-	extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
-	csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
 	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
-	dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
-	quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
-
-	if (!tree_root || !extent_root || !csum_root ||
-	    !chunk_root || !dev_root || !quota_root) {
+	if (!tree_root || !chunk_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
@@ -2004,10 +2123,24 @@ int open_ctree(struct super_block *sb,
 		goto fail_srcu;
 	}
 
+	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+	if (ret) {
+		err = ret;
+		goto fail_bdi;
+	}
+	fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
+					(1 + ilog2(nr_cpu_ids));
+
+	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+	if (ret) {
+		err = ret;
+		goto fail_dirty_metadata_bytes;
+	}
+
 	fs_info->btree_inode = new_inode(sb);
 	if (!fs_info->btree_inode) {
 		err = -ENOMEM;
-		goto fail_bdi;
+		goto fail_delalloc_bytes;
 	}
 
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2016,18 +2149,19 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->delayed_iputs);
-	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
-	INIT_LIST_HEAD(&fs_info->ordered_operations);
+	INIT_LIST_HEAD(&fs_info->delalloc_roots);
 	INIT_LIST_HEAD(&fs_info->caching_block_groups);
-	spin_lock_init(&fs_info->delalloc_lock);
+	spin_lock_init(&fs_info->delalloc_root_lock);
 	spin_lock_init(&fs_info->trans_lock);
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->delayed_iput_lock);
 	spin_lock_init(&fs_info->defrag_inodes_lock);
 	spin_lock_init(&fs_info->free_chunk_lock);
 	spin_lock_init(&fs_info->tree_mod_seq_lock);
+	spin_lock_init(&fs_info->super_lock);
 	rwlock_init(&fs_info->tree_mod_log_lock);
 	mutex_init(&fs_info->reloc_mutex);
+	seqlock_init(&fs_info->profiles_lock);
 
 	init_completion(&fs_info->kobj_unregister);
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2048,12 +2182,11 @@ int open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	atomic_set(&fs_info->defrag_running, 0);
-	atomic_set(&fs_info->tree_mod_seq, 0);
+	atomic64_set(&fs_info->tree_mod_seq, 0);
 	fs_info->sb = sb;
 	fs_info->max_inline = 8192 * 1024;
 	fs_info->metadata_ratio = 0;
 	fs_info->defrag_inodes = RB_ROOT;
-	fs_info->trans_no_join = 0;
 	fs_info->free_chunk_space = 0;
 	fs_info->tree_mod_log = RB_ROOT;
 
@@ -2064,8 +2197,8 @@ int open_ctree(struct super_block *sb,
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
 
-	INIT_LIST_HEAD(&fs_info->ordered_extents);
-	spin_lock_init(&fs_info->ordered_extent_lock);
+	INIT_LIST_HEAD(&fs_info->ordered_roots);
+	spin_lock_init(&fs_info->ordered_root_lock);
 	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
 					GFP_NOFS);
 	if (!fs_info->delayed_root) {
@@ -2126,6 +2259,7 @@ int open_ctree(struct super_block *sb,
 
 	spin_lock_init(&fs_info->block_group_cache_lock);
 	fs_info->block_group_cache_tree = RB_ROOT;
+	fs_info->first_logical_byte = (u64)-1;
 
 	extent_io_tree_init(&fs_info->freed_extents[0],
 			     fs_info->btree_inode->i_mapping);
@@ -2151,11 +2285,14 @@ int open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->dev_replace.lock);
 
 	spin_lock_init(&fs_info->qgroup_lock);
+	mutex_init(&fs_info->qgroup_ioctl_lock);
 	fs_info->qgroup_tree = RB_ROOT;
 	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
 	fs_info->qgroup_seq = 1;
 	fs_info->quota_enabled = 0;
 	fs_info->pending_quota_state = 0;
+	fs_info->qgroup_ulist = NULL;
+	mutex_init(&fs_info->qgroup_rescan_lock);
 
 	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
 	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -2165,16 +2302,41 @@ int open_ctree(struct super_block *sb,
 	init_waitqueue_head(&fs_info->transaction_blocked_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
 
+	ret = btrfs_alloc_stripe_hash_table(fs_info);
+	if (ret) {
+		err = ret;
+		goto fail_alloc;
+	}
+
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 	invalidate_bdev(fs_devices->latest_bdev);
+
+	/*
+	 * Read super block and check the signature bytes only
+	 */
 	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
 	if (!bh) {
 		err = -EINVAL;
 		goto fail_alloc;
 	}
 
+	/*
+	 * We want to check superblock checksum, the type is stored inside.
+	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
+	 */
+	if (btrfs_check_super_csum(bh->b_data)) {
+		printk(KERN_ERR "btrfs: superblock checksum mismatch\n");
+		err = -EINVAL;
+		goto fail_alloc;
+	}
+
+	/*
+	 * super_copy is zeroed at allocation time and we never touch the
+	 * following bytes up to INFO_SIZE, the checksum is calculated from
+	 * the whole block of INFO_SIZE
+	 */
 	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
 	memcpy(fs_info->super_for_commit, fs_info->super_copy,
 	       sizeof(*fs_info->super_for_commit));
@@ -2182,20 +2344,21 @@ int open_ctree(struct super_block *sb,
 
 	memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
 
-	disk_super = fs_info->super_copy;
-	if (!btrfs_super_root(disk_super))
-		goto fail_alloc;
-
-	/* check FS state, whether FS is broken. */
-	fs_info->fs_state |= btrfs_super_flags(disk_super);
-
 	ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
 	if (ret) {
 		printk(KERN_ERR "btrfs: superblock contains fatal errors\n");
-		err = ret;
+		err = -EINVAL;
 		goto fail_alloc;
 	}
 
+	disk_super = fs_info->super_copy;
+	if (!btrfs_super_root(disk_super))
+		goto fail_alloc;
+
+	/* check FS state, whether FS is broken. */
+	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
+		set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
+
 	/*
 	 * run through our array of backup supers and setup
 	 * our ring pointer to the oldest one
@@ -2247,6 +2410,9 @@ int open_ctree(struct super_block *sb,
 	if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
 		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
 
+	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
+		printk(KERN_ERR "btrfs: has skinny extents\n");
+
 	/*
 	 * flag our filesystem as having big metadata blocks if
 	 * they are bigger than the page size
@@ -2261,6 +2427,8 @@ int open_ctree(struct super_block *sb,
 	leafsize = btrfs_super_leafsize(disk_super);
 	sectorsize = btrfs_super_sectorsize(disk_super);
 	stripesize = btrfs_super_stripesize(disk_super);
+	fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
 
 	/*
 	 * mixed block groups end up with duplicate but slightly offset
@@ -2274,6 +2442,10 @@ int open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
+	/*
+	 * Needn't use the lock because there is no other task which will
+	 * update the flag.
+	 */
 	btrfs_set_super_incompat_flags(disk_super, features);
 
 	features = btrfs_super_compat_ro_flags(disk_super) &
@@ -2332,6 +2504,12 @@ int open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->endio_meta_write_workers,
 			   "endio-meta-write", fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->endio_raid56_workers,
+			   "endio-raid56", fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->rmw_workers,
+			   "rmw", fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
@@ -2343,6 +2521,8 @@ int open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->readahead_workers, "readahead",
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
+			   &fs_info->generic_worker);
 
 	/*
 	 * endios are largely parallel and should have a very
@@ -2350,6 +2530,8 @@ int open_ctree(struct super_block *sb,
 	 */
 	fs_info->endio_workers.idle_thresh = 4;
 	fs_info->endio_meta_workers.idle_thresh = 4;
+	fs_info->endio_raid56_workers.idle_thresh = 4;
+	fs_info->rmw_workers.idle_thresh = 2;
 
 	fs_info->endio_write_workers.idle_thresh = 2;
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2366,6 +2548,8 @@ int open_ctree(struct super_block *sb,
 	ret |= btrfs_start_workers(&fs_info->fixup_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
+	ret |= btrfs_start_workers(&fs_info->rmw_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_write_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
@@ -2373,6 +2557,7 @@ int open_ctree(struct super_block *sb,
 	ret |= btrfs_start_workers(&fs_info->caching_workers);
 	ret |= btrfs_start_workers(&fs_info->readahead_workers);
 	ret |= btrfs_start_workers(&fs_info->flush_workers);
+	ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers);
 	if (ret) {
 		err = -ENOMEM;
 		goto fail_sb_buffer;
@@ -2390,8 +2575,7 @@ int open_ctree(struct super_block *sb,
 	sb->s_blocksize = sectorsize;
 	sb->s_blocksize_bits = blksize_bits(sectorsize);
 
-	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-		    sizeof(disk_super->magic))) {
+	if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) {
 		printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
 		goto fail_sb_buffer;
 	}
@@ -2421,8 +2605,8 @@ int open_ctree(struct super_block *sb,
 	chunk_root->node = read_tree_block(chunk_root,
 					   btrfs_super_chunk_root(disk_super),
 					   blocksize, generation);
-	BUG_ON(!chunk_root->node); /* -ENOMEM */
-	if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+	if (!chunk_root->node ||
+	    !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
 		printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
 		       sb->s_id);
 		goto fail_tree_roots;
@@ -2472,33 +2656,44 @@ retry_root_backup:
 	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
 	tree_root->commit_root = btrfs_root_node(tree_root);
 
-	ret = find_and_setup_root(tree_root, fs_info,
-				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
-	if (ret)
+	location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+	location.type = BTRFS_ROOT_ITEM_KEY;
+	location.offset = 0;
+
+	extent_root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(extent_root)) {
+		ret = PTR_ERR(extent_root);
 		goto recovery_tree_root;
+	}
 	extent_root->track_dirty = 1;
+	fs_info->extent_root = extent_root;
 
-	ret = find_and_setup_root(tree_root, fs_info,
-				  BTRFS_DEV_TREE_OBJECTID, dev_root);
-	if (ret)
+	location.objectid = BTRFS_DEV_TREE_OBJECTID;
+	dev_root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(dev_root)) {
+		ret = PTR_ERR(dev_root);
 		goto recovery_tree_root;
+	}
 	dev_root->track_dirty = 1;
+	fs_info->dev_root = dev_root;
+	btrfs_init_devices_late(fs_info);
 
-	ret = find_and_setup_root(tree_root, fs_info,
-				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
-	if (ret)
+	location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+	csum_root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(csum_root)) {
+		ret = PTR_ERR(csum_root);
 		goto recovery_tree_root;
+	}
 	csum_root->track_dirty = 1;
+	fs_info->csum_root = csum_root;
 
-	ret = find_and_setup_root(tree_root, fs_info,
-				  BTRFS_QUOTA_TREE_OBJECTID, quota_root);
-	if (ret) {
-		kfree(quota_root);
-		quota_root = fs_info->quota_root = NULL;
-	} else {
+	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
+	quota_root = btrfs_read_tree_root(tree_root, &location);
+	if (!IS_ERR(quota_root)) {
 		quota_root->track_dirty = 1;
 		fs_info->quota_enabled = 1;
 		fs_info->pending_quota_state = 1;
+		fs_info->quota_root = quota_root;
 	}
 
 	fs_info->generation = generation;
@@ -2607,6 +2802,13 @@ retry_root_backup:
 		log_tree_root->node = read_tree_block(tree_root, bytenr,
 						      blocksize,
 						      generation + 1);
+		if (!log_tree_root->node ||
+		    !extent_buffer_uptodate(log_tree_root->node)) {
+			printk(KERN_ERR "btrfs: failed to read log tree\n");
+			free_extent_buffer(log_tree_root->node);
+			kfree(log_tree_root);
+			goto fail_trans_kthread;
+		}
 		/* returns with log_tree_root freed on success */
 		ret = btrfs_recover_log_trees(log_tree_root);
 		if (ret) {
@@ -2644,11 +2846,9 @@ retry_root_backup:
 
 	location.objectid = BTRFS_FS_TREE_OBJECTID;
 	location.type = BTRFS_ROOT_ITEM_KEY;
-	location.offset = (u64)-1;
+	location.offset = 0;
 
 	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
-	if (!fs_info->fs_root)
-		goto fail_qgroup;
 	if (IS_ERR(fs_info->fs_root)) {
 		err = PTR_ERR(fs_info->fs_root);
 		goto fail_qgroup;
@@ -2680,12 +2880,16 @@ retry_root_backup:
 		return ret;
 	}
 
+	btrfs_qgroup_rescan_resume(fs_info);
+
 	return 0;
 
 fail_qgroup:
 	btrfs_free_qgroup_config(fs_info);
 fail_trans_kthread:
 	kthread_stop(fs_info->transaction_kthread);
+	btrfs_cleanup_transaction(fs_info->tree_root);
+	del_fs_roots(fs_info);
 fail_cleaner:
 	kthread_stop(fs_info->cleaner_kthread);
 
@@ -2694,40 +2898,32 @@ fail_cleaner:
 	 * kthreads
 	 */
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
-	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
 fail_block_groups:
+	btrfs_put_block_group_cache(fs_info);
 	btrfs_free_block_groups(fs_info);
 
 fail_tree_roots:
 	free_root_pointers(fs_info, 1);
+	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
 fail_sb_buffer:
-	btrfs_stop_workers(&fs_info->generic_worker);
-	btrfs_stop_workers(&fs_info->readahead_workers);
-	btrfs_stop_workers(&fs_info->fixup_workers);
-	btrfs_stop_workers(&fs_info->delalloc_workers);
-	btrfs_stop_workers(&fs_info->workers);
-	btrfs_stop_workers(&fs_info->endio_workers);
-	btrfs_stop_workers(&fs_info->endio_meta_workers);
-	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
-	btrfs_stop_workers(&fs_info->endio_write_workers);
-	btrfs_stop_workers(&fs_info->endio_freespace_worker);
-	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->delayed_workers);
-	btrfs_stop_workers(&fs_info->caching_workers);
-	btrfs_stop_workers(&fs_info->flush_workers);
+	btrfs_stop_all_workers(fs_info);
 fail_alloc:
 fail_iput:
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
-	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
+fail_delalloc_bytes:
+	percpu_counter_destroy(&fs_info->delalloc_bytes);
+fail_dirty_metadata_bytes:
+	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 fail_bdi:
 	bdi_destroy(&fs_info->bdi);
 fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
+	btrfs_free_stripe_hash_table(fs_info);
 	btrfs_close_devices(fs_info->fs_devices);
 	return err;
 
@@ -2795,8 +2991,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 
 		super = (struct btrfs_super_block *)bh->b_data;
 		if (btrfs_super_bytenr(super) != bytenr ||
-		    strncmp((char *)(&super->magic), BTRFS_MAGIC,
-			    sizeof(super->magic))) {
+		    super->magic != cpu_to_le64(BTRFS_MAGIC)) {
 			brelse(bh);
 			continue;
 		}
@@ -2845,7 +3040,10 @@ static int write_dev_supers(struct btrfs_device *device,
 		if (wait) {
 			bh = __find_get_block(device->bdev, bytenr / 4096,
 					      BTRFS_SUPER_INFO_SIZE);
-			BUG_ON(!bh);
+			if (!bh) {
+				errors++;
+				continue;
+			}
 			wait_on_buffer(bh);
 			if (!buffer_uptodate(bh))
 				errors++;
@@ -2860,7 +3058,7 @@ static int write_dev_supers(struct btrfs_device *device,
 			btrfs_set_super_bytenr(sb, bytenr);
 
 			crc = ~(u32)0;
-			crc = btrfs_csum_data(NULL, (char *)sb +
+			crc = btrfs_csum_data((char *)sb +
 					      BTRFS_CSUM_SIZE, crc,
 					      BTRFS_SUPER_INFO_SIZE -
 					      BTRFS_CSUM_SIZE);
@@ -2872,6 +3070,13 @@ static int write_dev_supers(struct btrfs_device *device,
 			 */
 			bh = __getblk(device->bdev, bytenr / 4096,
 				      BTRFS_SUPER_INFO_SIZE);
+			if (!bh) {
+				printk(KERN_ERR "btrfs: couldn't get super "
+				       "buffer head for bytenr %Lu\n", bytenr);
+				errors++;
+				continue;
+			}
+
 			memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
 
 			/* one reference for submit_bh */
@@ -2954,7 +3159,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 	 * caller
 	 */
 	device->flush_bio = NULL;
-	bio = bio_alloc(GFP_NOFS, 0);
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
 	if (!bio)
 		return -ENOMEM;
 
@@ -3076,11 +3281,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
 				     ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
 				      == 0)))
 					num_tolerated_disk_barrier_failures = 0;
-				else if (num_tolerated_disk_barrier_failures > 1
-					 &&
-					 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-						   BTRFS_BLOCK_GROUP_RAID10)))
-					num_tolerated_disk_barrier_failures = 1;
+				else if (num_tolerated_disk_barrier_failures > 1) {
+					if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+					    BTRFS_BLOCK_GROUP_RAID5 |
+					    BTRFS_BLOCK_GROUP_RAID10)) {
+						num_tolerated_disk_barrier_failures = 1;
+					} else if (flags &
+						   BTRFS_BLOCK_GROUP_RAID6) {
+						num_tolerated_disk_barrier_failures = 2;
+					}
+				}
 			}
 		}
 		up_read(&sinfo->groups_sem);
@@ -3089,7 +3299,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
 	return num_tolerated_disk_barrier_failures;
 }
 
-int write_all_supers(struct btrfs_root *root, int max_mirrors)
+static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
 	struct list_head *head;
 	struct btrfs_device *dev;
@@ -3185,7 +3395,9 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+/* Drop a fs root from the radix tree and free it. */
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+				  struct btrfs_root *root)
 {
 	spin_lock(&fs_info->fs_roots_radix_lock);
 	radix_tree_delete(&fs_info->fs_roots_radix,
@@ -3195,6 +3407,11 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 	if (btrfs_root_refs(&root->root_item) == 0)
 		synchronize_srcu(&fs_info->subvol_srcu);
 
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+		btrfs_free_log(NULL, root);
+		btrfs_free_log_root_tree(NULL, fs_info);
+	}
+
 	__btrfs_remove_free_space_cache(root->free_ino_pinned);
 	__btrfs_remove_free_space_cache(root->free_ino_ctl);
 	free_fs_root(root);
@@ -3211,38 +3428,12 @@ static void free_fs_root(struct btrfs_root *root)
 	kfree(root->free_ino_ctl);
 	kfree(root->free_ino_pinned);
 	kfree(root->name);
-	kfree(root);
+	btrfs_put_fs_root(root);
 }
 
-static void del_fs_roots(struct btrfs_fs_info *fs_info)
+void btrfs_free_fs_root(struct btrfs_root *root)
 {
-	int ret;
-	struct btrfs_root *gang[8];
-	int i;
-
-	while (!list_empty(&fs_info->dead_roots)) {
-		gang[0] = list_entry(fs_info->dead_roots.next,
-				     struct btrfs_root, root_list);
-		list_del(&gang[0]->root_list);
-
-		if (gang[0]->in_radix) {
-			btrfs_free_fs_root(fs_info, gang[0]);
-		} else {
-			free_extent_buffer(gang[0]->node);
-			free_extent_buffer(gang[0]->commit_root);
-			kfree(gang[0]);
-		}
-	}
-
-	while (1) {
-		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
-					     (void **)gang, 0,
-					     ARRAY_SIZE(gang));
-		if (!ret)
-			break;
-		for (i = 0; i < ret; i++)
-			btrfs_free_fs_root(fs_info, gang[i]);
-	}
+	free_fs_root(root);
 }
 
 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -3280,8 +3471,8 @@ int btrfs_commit_super(struct btrfs_root *root)
 
 	mutex_lock(&root->fs_info->cleaner_mutex);
 	btrfs_run_delayed_iputs(root);
-	btrfs_clean_old_snapshots(root);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
+	wake_up_process(root->fs_info->cleaner_kthread);
 
 	/* wait until ongoing cleanup work done */
 	down_write(&root->fs_info->cleanup_work_sem);
@@ -3339,7 +3530,7 @@ int close_ctree(struct btrfs_root *root)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
-	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		btrfs_error_commit_super(root);
 
 	btrfs_put_block_group_cache(fs_info);
@@ -3352,46 +3543,20 @@ int close_ctree(struct btrfs_root *root)
 
 	btrfs_free_qgroup_config(root->fs_info);
 
-	if (fs_info->delalloc_bytes) {
-		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
-		       (unsigned long long)fs_info->delalloc_bytes);
-	}
-
-	free_extent_buffer(fs_info->extent_root->node);
-	free_extent_buffer(fs_info->extent_root->commit_root);
-	free_extent_buffer(fs_info->tree_root->node);
-	free_extent_buffer(fs_info->tree_root->commit_root);
-	free_extent_buffer(fs_info->chunk_root->node);
-	free_extent_buffer(fs_info->chunk_root->commit_root);
-	free_extent_buffer(fs_info->dev_root->node);
-	free_extent_buffer(fs_info->dev_root->commit_root);
-	free_extent_buffer(fs_info->csum_root->node);
-	free_extent_buffer(fs_info->csum_root->commit_root);
-	if (fs_info->quota_root) {
-		free_extent_buffer(fs_info->quota_root->node);
-		free_extent_buffer(fs_info->quota_root->commit_root);
+	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
+		printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n",
+		       percpu_counter_sum(&fs_info->delalloc_bytes));
 	}
 
 	btrfs_free_block_groups(fs_info);
 
+	btrfs_stop_all_workers(fs_info);
+
 	del_fs_roots(fs_info);
 
-	iput(fs_info->btree_inode);
+	free_root_pointers(fs_info, 1);
 
-	btrfs_stop_workers(&fs_info->generic_worker);
-	btrfs_stop_workers(&fs_info->fixup_workers);
-	btrfs_stop_workers(&fs_info->delalloc_workers);
-	btrfs_stop_workers(&fs_info->workers);
-	btrfs_stop_workers(&fs_info->endio_workers);
-	btrfs_stop_workers(&fs_info->endio_meta_workers);
-	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
-	btrfs_stop_workers(&fs_info->endio_write_workers);
-	btrfs_stop_workers(&fs_info->endio_freespace_worker);
-	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->delayed_workers);
-	btrfs_stop_workers(&fs_info->caching_workers);
-	btrfs_stop_workers(&fs_info->readahead_workers);
-	btrfs_stop_workers(&fs_info->flush_workers);
+	iput(fs_info->btree_inode);
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3401,9 +3566,13 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
+	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+	percpu_counter_destroy(&fs_info->delalloc_bytes);
 	bdi_destroy(&fs_info->bdi);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
+	btrfs_free_stripe_hash_table(fs_info);
+
 	return 0;
 }
 
@@ -3443,11 +3612,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 			(unsigned long long)transid,
 			(unsigned long long)root->fs_info->generation);
 	was_dirty = set_extent_buffer_dirty(buf);
-	if (!was_dirty) {
-		spin_lock(&root->fs_info->delalloc_lock);
-		root->fs_info->dirty_metadata_bytes += buf->len;
-		spin_unlock(&root->fs_info->delalloc_lock);
-	}
+	if (!was_dirty)
+		__percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
+				     buf->len,
+				     root->fs_info->dirty_metadata_batch);
 }
 
 static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
@@ -3457,8 +3625,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
 	 * looks as though older kernels can get into trouble with
 	 * this code, they end up stuck in balance_dirty_pages forever
 	 */
-	u64 num_dirty;
-	unsigned long thresh = 32 * 1024 * 1024;
+	int ret;
 
 	if (current->flags & PF_MEMALLOC)
 		return;
@@ -3466,9 +3633,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
 	if (flush_delayed)
 		btrfs_balance_delayed_items(root);
 
-	num_dirty = root->fs_info->dirty_metadata_bytes;
-
-	if (num_dirty > thresh) {
+	ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
+				     BTRFS_DIRTY_METADATA_THRESH);
+	if (ret > 0) {
 		balance_dirty_pages_ratelimited(
 				   root->fs_info->btree_inode->i_mapping);
 	}
@@ -3494,18 +3661,13 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 			      int read_only)
 {
-	if (btrfs_super_csum_type(fs_info->super_copy) >= ARRAY_SIZE(btrfs_csum_sizes)) {
-		printk(KERN_ERR "btrfs: unsupported checksum algorithm\n");
-		return -EINVAL;
-	}
-
-	if (read_only)
-		return 0;
-
+	/*
+	 * Placeholder for checks
+	 */
 	return 0;
 }
 
-void btrfs_error_commit_super(struct btrfs_root *root)
+static void btrfs_error_commit_super(struct btrfs_root *root)
 {
 	mutex_lock(&root->fs_info->cleaner_mutex);
 	btrfs_run_delayed_iputs(root);
@@ -3518,7 +3680,8 @@ void btrfs_error_commit_super(struct btrfs_root *root)
 	btrfs_cleanup_transaction(root);
 }
 
-static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+					     struct btrfs_root *root)
 {
 	struct btrfs_inode *btrfs_inode;
 	struct list_head splice;
@@ -3526,54 +3689,59 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
 	INIT_LIST_HEAD(&splice);
 
 	mutex_lock(&root->fs_info->ordered_operations_mutex);
-	spin_lock(&root->fs_info->ordered_extent_lock);
+	spin_lock(&root->fs_info->ordered_root_lock);
 
-	list_splice_init(&root->fs_info->ordered_operations, &splice);
+	list_splice_init(&t->ordered_operations, &splice);
 	while (!list_empty(&splice)) {
 		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
 					 ordered_operations);
 
 		list_del_init(&btrfs_inode->ordered_operations);
+		spin_unlock(&root->fs_info->ordered_root_lock);
 
 		btrfs_invalidate_inodes(btrfs_inode->root);
+
+		spin_lock(&root->fs_info->ordered_root_lock);
 	}
 
-	spin_unlock(&root->fs_info->ordered_extent_lock);
+	spin_unlock(&root->fs_info->ordered_root_lock);
 	mutex_unlock(&root->fs_info->ordered_operations_mutex);
 }
 
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
 {
-	struct list_head splice;
 	struct btrfs_ordered_extent *ordered;
-	struct inode *inode;
 
-	INIT_LIST_HEAD(&splice);
-
-	spin_lock(&root->fs_info->ordered_extent_lock);
-
-	list_splice_init(&root->fs_info->ordered_extents, &splice);
-	while (!list_empty(&splice)) {
-		ordered = list_entry(splice.next, struct btrfs_ordered_extent,
-				     root_extent_list);
+	spin_lock(&root->ordered_extent_lock);
+	/*
+	 * This will just short circuit the ordered completion stuff which will
+	 * make sure the ordered extent gets properly cleaned up.
+	 */
+	list_for_each_entry(ordered, &root->ordered_extents,
+			    root_extent_list)
+		set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+	spin_unlock(&root->ordered_extent_lock);
+}
 
-		list_del_init(&ordered->root_extent_list);
-		atomic_inc(&ordered->refs);
+static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *root;
+	struct list_head splice;
 
-		/* the inode may be getting freed (in sys_unlink path). */
-		inode = igrab(ordered->inode);
+	INIT_LIST_HEAD(&splice);
 
-		spin_unlock(&root->fs_info->ordered_extent_lock);
-		if (inode)
-			iput(inode);
+	spin_lock(&fs_info->ordered_root_lock);
+	list_splice_init(&fs_info->ordered_roots, &splice);
+	while (!list_empty(&splice)) {
+		root = list_first_entry(&splice, struct btrfs_root,
+					ordered_root);
+		list_del_init(&root->ordered_root);
 
-		atomic_set(&ordered->refs, 1);
-		btrfs_put_ordered_extent(ordered);
+		btrfs_destroy_ordered_extents(root);
 
-		spin_lock(&root->fs_info->ordered_extent_lock);
+		cond_resched_lock(&fs_info->ordered_root_lock);
 	}
-
-	spin_unlock(&root->fs_info->ordered_extent_lock);
+	spin_unlock(&fs_info->ordered_root_lock);
 }
 
 int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -3594,11 +3762,12 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 	}
 
 	while ((node = rb_first(&delayed_refs->root)) != NULL) {
-		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		struct btrfs_delayed_ref_head *head = NULL;
+		bool pin_bytes = false;
 
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 		atomic_set(&ref->refs, 1);
 		if (btrfs_delayed_ref_is_head(ref)) {
-			struct btrfs_delayed_ref_head *head;
 
 			head = btrfs_delayed_node_to_head(ref);
 			if (!mutex_trylock(&head->mutex)) {
@@ -3614,17 +3783,25 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 				continue;
 			}
 
-			kfree(head->extent_op);
+			if (head->must_insert_reserved)
+				pin_bytes = true;
+			btrfs_free_delayed_extent_op(head->extent_op);
 			delayed_refs->num_heads--;
 			if (list_empty(&head->cluster))
 				delayed_refs->num_heads_ready--;
 			list_del_init(&head->cluster);
 		}
+
 		ref->in_tree = 0;
 		rb_erase(&ref->rb_node, &delayed_refs->root);
 		delayed_refs->num_entries--;
-
 		spin_unlock(&delayed_refs->lock);
+		if (head) {
+			if (pin_bytes)
+				btrfs_pin_extent(root, ref->bytenr,
+						 ref->num_bytes, 1);
+			mutex_unlock(&head->mutex);
+		}
 		btrfs_put_delayed_ref(ref);
 
 		cond_resched();
@@ -3636,7 +3813,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 	return ret;
 }
 
-static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t)
 {
 	struct btrfs_pending_snapshot *snapshot;
 	struct list_head splice;
@@ -3649,10 +3826,8 @@ static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
 		snapshot = list_entry(splice.next,
 				      struct btrfs_pending_snapshot,
 				      list);
-
+		snapshot->error = -ECANCELED;
 		list_del_init(&snapshot->list);
-
-		kfree(snapshot);
 	}
 }
 
@@ -3663,19 +3838,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
 
 	INIT_LIST_HEAD(&splice);
 
-	spin_lock(&root->fs_info->delalloc_lock);
-	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+	spin_lock(&root->delalloc_lock);
+	list_splice_init(&root->delalloc_inodes, &splice);
 
 	while (!list_empty(&splice)) {
-		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
-				    delalloc_inodes);
+		btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
+					       delalloc_inodes);
 
 		list_del_init(&btrfs_inode->delalloc_inodes);
+		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			  &btrfs_inode->runtime_flags);
+		spin_unlock(&root->delalloc_lock);
 
 		btrfs_invalidate_inodes(btrfs_inode->root);
+
+		spin_lock(&root->delalloc_lock);
 	}
 
-	spin_unlock(&root->fs_info->delalloc_lock);
+	spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *root;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&fs_info->delalloc_root_lock);
+	list_splice_init(&fs_info->delalloc_roots, &splice);
+	while (!list_empty(&splice)) {
+		root = list_first_entry(&splice, struct btrfs_root,
+					 delalloc_root);
+		list_del_init(&root->delalloc_root);
+		root = btrfs_grab_fs_root(root);
+		BUG_ON(!root);
+		spin_unlock(&fs_info->delalloc_root_lock);
+
+		btrfs_destroy_delalloc_inodes(root);
+		btrfs_put_fs_root(root);
+
+		spin_lock(&fs_info->delalloc_root_lock);
+	}
+	spin_unlock(&fs_info->delalloc_root_lock);
 }
 
 static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3683,13 +3888,9 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 					int mark)
 {
 	int ret;
-	struct page *page;
-	struct inode *btree_inode = root->fs_info->btree_inode;
 	struct extent_buffer *eb;
 	u64 start = 0;
 	u64 end;
-	u64 offset;
-	unsigned long index;
 
 	while (1) {
 		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
@@ -3699,36 +3900,17 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 
 		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
 		while (start <= end) {
-			index = start >> PAGE_CACHE_SHIFT;
-			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
-			page = find_get_page(btree_inode->i_mapping, index);
-			if (!page)
+			eb = btrfs_find_tree_block(root, start,
+						   root->leafsize);
+			start += root->leafsize;
+			if (!eb)
 				continue;
-			offset = page_offset(page);
-
-			spin_lock(&dirty_pages->buffer_lock);
-			eb = radix_tree_lookup(
-			     &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
-					       offset >> PAGE_CACHE_SHIFT);
-			spin_unlock(&dirty_pages->buffer_lock);
-			if (eb)
-				ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
-							 &eb->bflags);
-			if (PageWriteback(page))
-				end_page_writeback(page);
-
-			lock_page(page);
-			if (PageDirty(page)) {
-				clear_page_dirty_for_io(page);
-				spin_lock_irq(&page->mapping->tree_lock);
-				radix_tree_tag_clear(&page->mapping->page_tree,
-							page_index(page),
-							PAGECACHE_TAG_DIRTY);
-				spin_unlock_irq(&page->mapping->tree_lock);
-			}
+			wait_on_extent_buffer_writeback(eb);
 
-			unlock_page(page);
-			page_cache_release(page);
+			if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+					       &eb->bflags))
+				clear_extent_buffer_dirty(eb);
+			free_extent_buffer_stale(eb);
 		}
 	}
 
@@ -3782,34 +3964,32 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 	btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
 				cur_trans->dirty_pages.dirty_bytes);
 
-	/* FIXME: cleanup wait for commit */
-	cur_trans->in_commit = 1;
-	cur_trans->blocked = 1;
+	cur_trans->state = TRANS_STATE_COMMIT_START;
 	wake_up(&root->fs_info->transaction_blocked_wait);
 
-	cur_trans->blocked = 0;
-	wake_up(&root->fs_info->transaction_wait);
+	btrfs_evict_pending_snapshots(cur_trans);
 
-	cur_trans->commit_done = 1;
-	wake_up(&cur_trans->commit_wait);
+	cur_trans->state = TRANS_STATE_UNBLOCKED;
+	wake_up(&root->fs_info->transaction_wait);
 
 	btrfs_destroy_delayed_inodes(root);
 	btrfs_assert_delayed_root_empty(root);
 
-	btrfs_destroy_pending_snapshots(cur_trans);
-
 	btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
 				     EXTENT_DIRTY);
 	btrfs_destroy_pinned_extent(root,
 				    root->fs_info->pinned_extents);
 
+	cur_trans->state =TRANS_STATE_COMPLETED;
+	wake_up(&cur_trans->commit_wait);
+
 	/*
 	memset(cur_trans, 0, sizeof(*cur_trans));
 	kmem_cache_free(btrfs_transaction_cachep, cur_trans);
 	*/
 }
 
-int btrfs_cleanup_transaction(struct btrfs_root *root)
+static int btrfs_cleanup_transaction(struct btrfs_root *root)
 {
 	struct btrfs_transaction *t;
 	LIST_HEAD(list);
@@ -3818,51 +3998,39 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
 
 	spin_lock(&root->fs_info->trans_lock);
 	list_splice_init(&root->fs_info->trans_list, &list);
-	root->fs_info->trans_no_join = 1;
+	root->fs_info->running_transaction = NULL;
 	spin_unlock(&root->fs_info->trans_lock);
 
 	while (!list_empty(&list)) {
 		t = list_entry(list.next, struct btrfs_transaction, list);
-		if (!t)
-			break;
 
-		btrfs_destroy_ordered_operations(root);
+		btrfs_destroy_ordered_operations(t, root);
 
-		btrfs_destroy_ordered_extents(root);
+		btrfs_destroy_all_ordered_extents(root->fs_info);
 
 		btrfs_destroy_delayed_refs(t, root);
 
-		btrfs_block_rsv_release(root,
-					&root->fs_info->trans_block_rsv,
-					t->dirty_pages.dirty_bytes);
-
-		/* FIXME: cleanup wait for commit */
-		t->in_commit = 1;
-		t->blocked = 1;
+		/*
+		 *  FIXME: cleanup wait for commit
+		 *  We needn't acquire the lock here, because we are during
+		 *  the umount, there is no other task which will change it.
+		 */
+		t->state = TRANS_STATE_COMMIT_START;
 		smp_mb();
 		if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
 			wake_up(&root->fs_info->transaction_blocked_wait);
 
-		t->blocked = 0;
+		btrfs_evict_pending_snapshots(t);
+
+		t->state = TRANS_STATE_UNBLOCKED;
 		smp_mb();
 		if (waitqueue_active(&root->fs_info->transaction_wait))
 			wake_up(&root->fs_info->transaction_wait);
 
-		t->commit_done = 1;
-		smp_mb();
-		if (waitqueue_active(&t->commit_wait))
-			wake_up(&t->commit_wait);
-
 		btrfs_destroy_delayed_inodes(root);
 		btrfs_assert_delayed_root_empty(root);
 
-		btrfs_destroy_pending_snapshots(t);
-
-		btrfs_destroy_delalloc_inodes(root);
-
-		spin_lock(&root->fs_info->trans_lock);
-		root->fs_info->running_transaction = NULL;
-		spin_unlock(&root->fs_info->trans_lock);
+		btrfs_destroy_all_delalloc_inodes(root->fs_info);
 
 		btrfs_destroy_marked_extents(root, &t->dirty_pages,
 					     EXTENT_DIRTY);
@@ -3870,15 +4038,17 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
 		btrfs_destroy_pinned_extent(root,
 					    root->fs_info->pinned_extents);
 
+		t->state = TRANS_STATE_COMPLETED;
+		smp_mb();
+		if (waitqueue_active(&t->commit_wait))
+			wake_up(&t->commit_wait);
+
 		atomic_set(&t->use_count, 0);
 		list_del_init(&t->list);
 		memset(t, 0, sizeof(*t));
 		kmem_cache_free(btrfs_transaction_cachep, t);
 	}
 
-	spin_lock(&root->fs_info->trans_lock);
-	root->fs_info->trans_no_join = 0;
-	spin_unlock(&root->fs_info->trans_lock);
 	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 
 	return 0;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 305c33efb0e..b71acd6e1e5 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,6 +25,13 @@
 #define BTRFS_SUPER_MIRROR_MAX	 3
 #define BTRFS_SUPER_MIRROR_SHIFT 12
 
+enum {
+	BTRFS_WQ_ENDIO_DATA = 0,
+	BTRFS_WQ_ENDIO_METADATA = 1,
+	BTRFS_WQ_ENDIO_FREE_SPACE = 2,
+	BTRFS_WQ_ENDIO_RAID56 = 3,
+};
+
 static inline u64 btrfs_sb_offset(int mirror)
 {
 	u64 start = 16 * 1024;
@@ -54,23 +61,48 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
-void btrfs_error_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
-					       struct btrfs_key *location);
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+				      struct btrfs_key *location);
+int btrfs_init_fs_root(struct btrfs_root *root);
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+			 struct btrfs_root *root);
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 					      struct btrfs_key *location);
 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
 void btrfs_btree_balance_dirty(struct btrfs_root *root);
 void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
-void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+				 struct btrfs_root *root);
+void btrfs_free_fs_root(struct btrfs_root *root);
+
+/*
+ * This function is used to grab the root, and avoid it is freed when we
+ * access it. But it doesn't ensure that the tree is not dropped.
+ *
+ * If you want to ensure the whole tree is safe, you should use
+ * 	fs_info->subvol_srcu
+ */
+static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
+{
+	if (atomic_inc_not_zero(&root->refs))
+		return root;
+	return NULL;
+}
+
+static inline void btrfs_put_fs_root(struct btrfs_root *root)
+{
+	if (atomic_dec_and_test(&root->refs))
+		kfree(root);
+}
+
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
 			  int atomic);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
-u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
+u32 btrfs_csum_data(char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata);
@@ -86,10 +118,8 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
-int btrfs_cleanup_transaction(struct btrfs_root *root);
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
 				  struct btrfs_root *root);
-void btrfs_abort_devices(struct btrfs_root *root);
 struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 				     struct btrfs_fs_info *fs_info,
 				     u64 objectid);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 614f34a899c..4b869160737 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -22,10 +22,10 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
 
 	if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
 		*max_len = BTRFS_FID_SIZE_CONNECTABLE;
-		return 255;
+		return FILEID_INVALID;
 	} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
 		*max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
-		return 255;
+		return FILEID_INVALID;
 	}
 
 	len  = BTRFS_FID_SIZE_NON_CONNECTABLE;
@@ -82,11 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 		goto fail;
 	}
 
-	if (btrfs_root_refs(&root->root_item) == 0) {
-		err = -ENOENT;
-		goto fail;
-	}
-
 	key.objectid = objectid;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5a3327b8f90..0236de71198 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -24,6 +24,7 @@
 #include <linux/kthread.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
+#include <linux/percpu_counter.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -31,6 +32,7 @@
 #include "print-tree.h"
 #include "transaction.h"
 #include "volumes.h"
+#include "raid56.h"
 #include "locking.h"
 #include "free-space-cache.h"
 #include "math.h"
@@ -72,8 +74,7 @@ enum {
 	RESERVE_ALLOC_NO_ACCOUNT = 2,
 };
 
-static int update_block_group(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
@@ -103,6 +104,10 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 			    int dump_block_groups);
 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 				       u64 num_bytes, int reserve);
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+			       u64 num_bytes);
+int btrfs_pin_extent(struct btrfs_root *root,
+		     u64 bytenr, u64 num_bytes, int reserved);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -162,6 +167,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 	rb_link_node(&block_group->cache_node, parent, p);
 	rb_insert_color(&block_group->cache_node,
 			&info->block_group_cache_tree);
+
+	if (info->first_logical_byte > block_group->key.objectid)
+		info->first_logical_byte = block_group->key.objectid;
+
 	spin_unlock(&info->block_group_cache_lock);
 
 	return 0;
@@ -203,8 +212,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 			break;
 		}
 	}
-	if (ret)
+	if (ret) {
 		btrfs_get_block_group(ret);
+		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
+			info->first_logical_byte = ret->key.objectid;
+	}
 	spin_unlock(&info->block_group_cache_lock);
 
 	return ret;
@@ -248,7 +260,8 @@ static int exclude_super_stripes(struct btrfs_root *root,
 		cache->bytes_super += stripe_len;
 		ret = add_excluded_extent(root, cache->key.objectid,
 					  stripe_len);
-		BUG_ON(ret); /* -ENOMEM */
+		if (ret)
+			return ret;
 	}
 
 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -256,13 +269,35 @@ static int exclude_super_stripes(struct btrfs_root *root,
 		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
 				       cache->key.objectid, bytenr,
 				       0, &logical, &nr, &stripe_len);
-		BUG_ON(ret); /* -ENOMEM */
+		if (ret)
+			return ret;
 
 		while (nr--) {
-			cache->bytes_super += stripe_len;
-			ret = add_excluded_extent(root, logical[nr],
-						  stripe_len);
-			BUG_ON(ret); /* -ENOMEM */
+			u64 start, len;
+
+			if (logical[nr] > cache->key.objectid +
+			    cache->key.offset)
+				continue;
+
+			if (logical[nr] + stripe_len <= cache->key.objectid)
+				continue;
+
+			start = logical[nr];
+			if (start < cache->key.objectid) {
+				start = cache->key.objectid;
+				len = (logical[nr] + stripe_len) - start;
+			} else {
+				len = min_t(u64, stripe_len,
+					    cache->key.objectid +
+					    cache->key.offset - start);
+			}
+
+			cache->bytes_super += len;
+			ret = add_excluded_extent(root, start, len);
+			if (ret) {
+				kfree(logical);
+				return ret;
+			}
 		}
 
 		kfree(logical);
@@ -405,8 +440,7 @@ again:
 			if (ret)
 				break;
 
-			if (need_resched() ||
-			    btrfs_next_leaf(extent_root, path)) {
+			if (need_resched()) {
 				caching_ctl->progress = last;
 				btrfs_release_path(path);
 				up_read(&fs_info->extent_commit_sem);
@@ -414,6 +448,12 @@ again:
 				cond_resched();
 				goto again;
 			}
+
+			ret = btrfs_next_leaf(extent_root, path);
+			if (ret < 0)
+				goto err;
+			if (ret)
+				break;
 			leaf = path->nodes[0];
 			nritems = btrfs_header_nritems(leaf);
 			continue;
@@ -428,11 +468,16 @@ again:
 		    block_group->key.offset)
 			break;
 
-		if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+		    key.type == BTRFS_METADATA_ITEM_KEY) {
 			total_found += add_new_free_space(block_group,
 							  fs_info, last,
 							  key.objectid);
-			last = key.objectid + key.offset;
+			if (key.type == BTRFS_METADATA_ITEM_KEY)
+				last = key.objectid +
+					fs_info->tree_root->leafsize;
+			else
+				last = key.objectid + key.offset;
 
 			if (total_found > (1024 * 1024 * 2)) {
 				total_found = 0;
@@ -468,8 +513,6 @@ out:
 }
 
 static int cache_block_group(struct btrfs_block_group_cache *cache,
-			     struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root,
 			     int load_cache_only)
 {
 	DEFINE_WAIT(wait);
@@ -527,12 +570,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	cache->cached = BTRFS_CACHE_FAST;
 	spin_unlock(&cache->lock);
 
-	/*
-	 * We can't do the read from on-disk cache during a commit since we need
-	 * to have the normal tree locking.  Also if we are currently trying to
-	 * allocate blocks for the tree root we can't do the fast caching since
-	 * we likely hold important locks.
-	 */
 	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
 		ret = load_free_space_cache(fs_info, cache);
 
@@ -650,55 +687,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 	rcu_read_unlock();
 }
 
-u64 btrfs_find_block_group(struct btrfs_root *root,
-			   u64 search_start, u64 search_hint, int owner)
-{
-	struct btrfs_block_group_cache *cache;
-	u64 used;
-	u64 last = max(search_hint, search_start);
-	u64 group_start = 0;
-	int full_search = 0;
-	int factor = 9;
-	int wrapped = 0;
-again:
-	while (1) {
-		cache = btrfs_lookup_first_block_group(root->fs_info, last);
-		if (!cache)
-			break;
-
-		spin_lock(&cache->lock);
-		last = cache->key.objectid + cache->key.offset;
-		used = btrfs_block_group_used(&cache->item);
-
-		if ((full_search || !cache->ro) &&
-		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
-			if (used + cache->pinned + cache->reserved <
-			    div_factor(cache->key.offset, factor)) {
-				group_start = cache->key.objectid;
-				spin_unlock(&cache->lock);
-				btrfs_put_block_group(cache);
-				goto found;
-			}
-		}
-		spin_unlock(&cache->lock);
-		btrfs_put_block_group(cache);
-		cond_resched();
-	}
-	if (!wrapped) {
-		last = search_start;
-		wrapped = 1;
-		goto again;
-	}
-	if (!full_search && factor < 10) {
-		last = search_start;
-		full_search = 1;
-		factor = 10;
-		goto again;
-	}
-found:
-	return group_start;
-}
-
 /* simple helper to search for an existing extent at a given offset */
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 {
@@ -712,15 +700,21 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 
 	key.objectid = start;
 	key.offset = len;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+	key.type = BTRFS_EXTENT_ITEM_KEY;
 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 				0, 0);
+	if (ret > 0) {
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (key.objectid == start &&
+		    key.type == BTRFS_METADATA_ITEM_KEY)
+			ret = 0;
+	}
 	btrfs_free_path(path);
 	return ret;
 }
 
 /*
- * helper function to lookup reference count and flags of extent.
+ * helper function to lookup reference count and flags of a tree block.
  *
  * the head node for delayed ref is used to store the sum of all the
  * reference count modifications queued up in the rbtree. the head
@@ -730,7 +724,7 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
  */
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u64 *refs, u64 *flags)
+			     u64 offset, int metadata, u64 *refs, u64 *flags)
 {
 	struct btrfs_delayed_ref_head *head;
 	struct btrfs_delayed_ref_root *delayed_refs;
@@ -743,13 +737,29 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 	u64 extent_flags;
 	int ret;
 
+	/*
+	 * If we don't have skinny metadata, don't bother doing anything
+	 * different
+	 */
+	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
+		offset = root->leafsize;
+		metadata = 0;
+	}
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = num_bytes;
+	if (metadata) {
+		key.objectid = bytenr;
+		key.type = BTRFS_METADATA_ITEM_KEY;
+		key.offset = offset;
+	} else {
+		key.objectid = bytenr;
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = offset;
+	}
+
 	if (!trans) {
 		path->skip_locking = 1;
 		path->search_commit_root = 1;
@@ -760,6 +770,13 @@ again:
 	if (ret < 0)
 		goto out_free;
 
+	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = root->leafsize;
+		btrfs_release_path(path);
+		goto again;
+	}
+
 	if (ret == 0) {
 		leaf = path->nodes[0];
 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -995,7 +1012,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
 		return ret;
 	BUG_ON(ret); /* Corruption */
 
-	btrfs_extend_item(trans, root, path, new_size);
+	btrfs_extend_item(root, path, new_size);
 
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1447,6 +1464,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 	int want;
 	int ret;
 	int err = 0;
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
 
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -1458,16 +1477,54 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 		path->keep_locks = 1;
 	} else
 		extra_size = -1;
+
+	/*
+	 * Owner is our parent level, so we can just add one to get the level
+	 * for the block we are interested in.
+	 */
+	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
+		key.type = BTRFS_METADATA_ITEM_KEY;
+		key.offset = owner;
+	}
+
+again:
 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
 	if (ret < 0) {
 		err = ret;
 		goto out;
 	}
+
+	/*
+	 * We may be a newly converted file system which still has the old fat
+	 * extent entries for metadata, so try and see if we have one of those.
+	 */
+	if (ret > 0 && skinny_metadata) {
+		skinny_metadata = false;
+		if (path->slots[0]) {
+			path->slots[0]--;
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0]);
+			if (key.objectid == bytenr &&
+			    key.type == BTRFS_EXTENT_ITEM_KEY &&
+			    key.offset == num_bytes)
+				ret = 0;
+		}
+		if (ret) {
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+			key.offset = num_bytes;
+			btrfs_release_path(path);
+			goto again;
+		}
+	}
+
 	if (ret && !insert) {
 		err = -ENOENT;
 		goto out;
+	} else if (ret) {
+		err = -EIO;
+		WARN_ON(1);
+		goto out;
 	}
-	BUG_ON(ret); /* Corruption */
 
 	leaf = path->nodes[0];
 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -1495,11 +1552,9 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 	ptr = (unsigned long)(ei + 1);
 	end = (unsigned long)ei + item_size;
 
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
 		ptr += sizeof(struct btrfs_tree_block_info);
 		BUG_ON(ptr > end);
-	} else {
-		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
 	}
 
 	err = -ENOENT;
@@ -1581,8 +1636,7 @@ out:
  * helper to add new inline back ref
  */
 static noinline_for_stack
-void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
+void setup_inline_extent_backref(struct btrfs_root *root,
 				 struct btrfs_path *path,
 				 struct btrfs_extent_inline_ref *iref,
 				 u64 parent, u64 root_objectid,
@@ -1605,7 +1659,7 @@ void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
 	type = extent_ref_type(parent, owner);
 	size = btrfs_extent_inline_ref_size(type);
 
-	btrfs_extend_item(trans, root, path, size);
+	btrfs_extend_item(root, path, size);
 
 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(leaf, ei);
@@ -1674,8 +1728,7 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
  * helper to update/remove inline back ref
  */
 static noinline_for_stack
-void update_inline_extent_backref(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root,
+void update_inline_extent_backref(struct btrfs_root *root,
 				  struct btrfs_path *path,
 				  struct btrfs_extent_inline_ref *iref,
 				  int refs_to_mod,
@@ -1731,7 +1784,7 @@ void update_inline_extent_backref(struct btrfs_trans_handle *trans,
 			memmove_extent_buffer(leaf, ptr, ptr + size,
 					      end - ptr - size);
 		item_size -= size;
-		btrfs_truncate_item(trans, root, path, item_size, 1);
+		btrfs_truncate_item(root, path, item_size, 1);
 	}
 	btrfs_mark_buffer_dirty(leaf);
 }
@@ -1753,10 +1806,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
 					   root_objectid, owner, offset, 1);
 	if (ret == 0) {
 		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
-		update_inline_extent_backref(trans, root, path, iref,
+		update_inline_extent_backref(root, path, iref,
 					     refs_to_add, extent_op);
 	} else if (ret == -ENOENT) {
-		setup_inline_extent_backref(trans, root, path, iref, parent,
+		setup_inline_extent_backref(root, path, iref, parent,
 					    root_objectid, owner, offset,
 					    refs_to_add, extent_op);
 		ret = 0;
@@ -1793,7 +1846,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 
 	BUG_ON(!is_data && refs_to_drop != 1);
 	if (iref) {
-		update_inline_extent_backref(trans, root, path, iref,
+		update_inline_extent_backref(root, path, iref,
 					     -refs_to_drop, NULL);
 	} else if (is_data) {
 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
@@ -1852,6 +1905,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 		*actual_bytes = discarded_bytes;
 
 
+	if (ret == -EOPNOTSUPP)
+		ret = 0;
 	return ret;
 }
 
@@ -1962,10 +2017,8 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 		ref_root = ref->root;
 
 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
-		if (extent_op) {
-			BUG_ON(extent_op->update_key);
+		if (extent_op)
 			flags |= extent_op->flags_to_set;
-		}
 		ret = alloc_reserved_file_extent(trans, root,
 						 parent, ref_root, flags,
 						 ref->objectid, ref->offset,
@@ -2018,18 +2071,29 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
 	u32 item_size;
 	int ret;
 	int err = 0;
+	int metadata = !extent_op->is_data;
 
 	if (trans->aborted)
 		return 0;
 
+	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+		metadata = 0;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
 	key.objectid = node->bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = node->num_bytes;
 
+	if (metadata) {
+		key.type = BTRFS_METADATA_ITEM_KEY;
+		key.offset = extent_op->level;
+	} else {
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = node->num_bytes;
+	}
+
+again:
 	path->reada = 1;
 	path->leave_spinning = 1;
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
@@ -2039,6 +2103,14 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 	if (ret > 0) {
+		if (metadata) {
+			btrfs_release_path(path);
+			metadata = 0;
+
+			key.offset = node->num_bytes;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+			goto again;
+		}
 		err = -EIO;
 		goto out;
 	}
@@ -2078,10 +2150,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_key ins;
 	u64 parent = 0;
 	u64 ref_root = 0;
-
-	ins.objectid = node->bytenr;
-	ins.offset = node->num_bytes;
-	ins.type = BTRFS_EXTENT_ITEM_KEY;
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
 
 	ref = btrfs_delayed_node_to_tree_ref(node);
 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
@@ -2089,10 +2159,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	else
 		ref_root = ref->root;
 
+	ins.objectid = node->bytenr;
+	if (skinny_metadata) {
+		ins.offset = ref->level;
+		ins.type = BTRFS_METADATA_ITEM_KEY;
+	} else {
+		ins.offset = node->num_bytes;
+		ins.type = BTRFS_EXTENT_ITEM_KEY;
+	}
+
 	BUG_ON(node->ref_mod != 1);
 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
-		BUG_ON(!extent_op || !extent_op->update_flags ||
-		       !extent_op->update_key);
+		BUG_ON(!extent_op || !extent_op->update_flags);
 		ret = alloc_reserved_tree_block(trans, root,
 						parent, ref_root,
 						extent_op->flags_to_set,
@@ -2143,7 +2221,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 						      node->num_bytes);
 			}
 		}
-		mutex_unlock(&head->mutex);
 		return ret;
 	}
 
@@ -2258,7 +2335,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 			 * process of being added. Don't run this ref yet.
 			 */
 			list_del_init(&locked_ref->cluster);
-			mutex_unlock(&locked_ref->mutex);
+			btrfs_delayed_ref_unlock(locked_ref);
 			locked_ref = NULL;
 			delayed_refs->num_heads_ready++;
 			spin_unlock(&delayed_refs->lock);
@@ -2285,7 +2362,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 			ref = &locked_ref->node;
 
 			if (extent_op && must_insert_reserved) {
-				kfree(extent_op);
+				btrfs_free_delayed_extent_op(extent_op);
 				extent_op = NULL;
 			}
 
@@ -2294,28 +2371,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 
 				ret = run_delayed_extent_op(trans, root,
 							    ref, extent_op);
-				kfree(extent_op);
+				btrfs_free_delayed_extent_op(extent_op);
 
 				if (ret) {
-					list_del_init(&locked_ref->cluster);
-					mutex_unlock(&locked_ref->mutex);
-
-					printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
+					btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
 					spin_lock(&delayed_refs->lock);
+					btrfs_delayed_ref_unlock(locked_ref);
 					return ret;
 				}
 
 				goto next;
 			}
-
-			list_del_init(&locked_ref->cluster);
-			locked_ref = NULL;
 		}
 
 		ref->in_tree = 0;
 		rb_erase(&ref->rb_node, &delayed_refs->root);
 		delayed_refs->num_entries--;
-		if (locked_ref) {
+		if (!btrfs_delayed_ref_is_head(ref)) {
 			/*
 			 * when we play the delayed ref, also correct the
 			 * ref_mod on head
@@ -2337,20 +2409,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
 					  must_insert_reserved);
 
-		btrfs_put_delayed_ref(ref);
-		kfree(extent_op);
-		count++;
-
+		btrfs_free_delayed_extent_op(extent_op);
 		if (ret) {
-			if (locked_ref) {
-				list_del_init(&locked_ref->cluster);
-				mutex_unlock(&locked_ref->mutex);
-			}
-			printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
+			btrfs_delayed_ref_unlock(locked_ref);
+			btrfs_put_delayed_ref(ref);
+			btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
 			spin_lock(&delayed_refs->lock);
 			return ret;
 		}
 
+		/*
+		 * If this node is a head, that means all the refs in this head
+		 * have been dealt with, and we will pick the next head to deal
+		 * with, so we must unlock the head and drop it from the cluster
+		 * list before we release it.
+		 */
+		if (btrfs_delayed_ref_is_head(ref)) {
+			list_del_init(&locked_ref->cluster);
+			btrfs_delayed_ref_unlock(locked_ref);
+			locked_ref = NULL;
+		}
+		btrfs_put_delayed_ref(ref);
+		count++;
 next:
 		cond_resched();
 		spin_lock(&delayed_refs->lock);
@@ -2410,9 +2490,11 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 	if (list_empty(&trans->qgroup_ref_list) !=
 	    !trans->delayed_ref_elem.seq) {
 		/* list without seq or seq without list */
-		printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
+		btrfs_err(fs_info,
+			"qgroup accounting update error, list is%s empty, seq is %#x.%x",
 			list_empty(&trans->qgroup_ref_list) ? "" : " not",
-			trans->delayed_ref_elem.seq);
+			(u32)(trans->delayed_ref_elem.seq >> 32),
+			(u32)trans->delayed_ref_elem.seq);
 		BUG();
 	}
 
@@ -2435,6 +2517,61 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
+		      int count)
+{
+	int val = atomic_read(&delayed_refs->ref_seq);
+
+	if (val < seq || val >= seq + count)
+		return 1;
+	return 0;
+}
+
+static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
+{
+	u64 num_bytes;
+
+	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
+			     sizeof(struct btrfs_extent_inline_ref));
+	if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+		num_bytes += heads * sizeof(struct btrfs_tree_block_info);
+
+	/*
+	 * We don't ever fill up leaves all the way so multiply by 2 just to be
+	 * closer to what we're really going to want to ouse.
+	 */
+	return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+}
+
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root)
+{
+	struct btrfs_block_rsv *global_rsv;
+	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
+	u64 num_bytes;
+	int ret = 0;
+
+	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+	num_heads = heads_to_leaves(root, num_heads);
+	if (num_heads > 1)
+		num_bytes += (num_heads - 1) * root->leafsize;
+	num_bytes <<= 1;
+	global_rsv = &root->fs_info->global_block_rsv;
+
+	/*
+	 * If we can't allocate any more chunks lets make sure we have _lots_ of
+	 * wiggle room since running delayed refs can create more delayed refs.
+	 */
+	if (global_rsv->space_info->full)
+		num_bytes <<= 1;
+
+	spin_lock(&global_rsv->lock);
+	if (global_rsv->reserved <= num_bytes)
+		ret = 1;
+	spin_unlock(&global_rsv->lock);
+	return ret;
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -2469,6 +2606,45 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 
 	delayed_refs = &trans->transaction->delayed_refs;
 	INIT_LIST_HEAD(&cluster);
+	if (count == 0) {
+		count = delayed_refs->num_entries * 2;
+		run_most = 1;
+	}
+
+	if (!run_all && !run_most) {
+		int old;
+		int seq = atomic_read(&delayed_refs->ref_seq);
+
+progress:
+		old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+		if (old) {
+			DEFINE_WAIT(__wait);
+			if (delayed_refs->flushing ||
+			    !btrfs_should_throttle_delayed_refs(trans, root))
+				return 0;
+
+			prepare_to_wait(&delayed_refs->wait, &__wait,
+					TASK_UNINTERRUPTIBLE);
+
+			old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+			if (old) {
+				schedule();
+				finish_wait(&delayed_refs->wait, &__wait);
+
+				if (!refs_newer(delayed_refs, seq, 256))
+					goto progress;
+				else
+					return 0;
+			} else {
+				finish_wait(&delayed_refs->wait, &__wait);
+				goto again;
+			}
+		}
+
+	} else {
+		atomic_inc(&delayed_refs->procs_running_refs);
+	}
+
 again:
 	loops = 0;
 	spin_lock(&delayed_refs->lock);
@@ -2477,13 +2653,9 @@ again:
 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
 
-	if (count == 0) {
-		count = delayed_refs->num_entries * 2;
-		run_most = 1;
-	}
 	while (1) {
 		if (!(run_all || run_most) &&
-		    delayed_refs->num_heads_ready < 64)
+		    !btrfs_should_throttle_delayed_refs(trans, root))
 			break;
 
 		/*
@@ -2500,11 +2672,16 @@ again:
 
 		ret = run_clustered_refs(trans, root, &cluster);
 		if (ret < 0) {
+			btrfs_release_ref_cluster(&cluster);
 			spin_unlock(&delayed_refs->lock);
 			btrfs_abort_transaction(trans, root, ret);
+			atomic_dec(&delayed_refs->procs_running_refs);
+			wake_up(&delayed_refs->wait);
 			return ret;
 		}
 
+		atomic_add(ret, &delayed_refs->ref_seq);
+
 		count -= min_t(unsigned long, ret, count);
 
 		if (count == 0)
@@ -2573,6 +2750,11 @@ again:
 		goto again;
 	}
 out:
+	atomic_dec(&delayed_refs->procs_running_refs);
+	smp_mb();
+	if (waitqueue_active(&delayed_refs->wait))
+		wake_up(&delayed_refs->wait);
+
 	spin_unlock(&delayed_refs->lock);
 	assert_qgroups_uptodate(trans);
 	return 0;
@@ -2581,12 +2763,12 @@ out:
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 flags,
-				int is_data)
+				int level, int is_data)
 {
 	struct btrfs_delayed_extent_op *extent_op;
 	int ret;
 
-	extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+	extent_op = btrfs_alloc_delayed_extent_op();
 	if (!extent_op)
 		return -ENOMEM;
 
@@ -2594,11 +2776,12 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 	extent_op->update_flags = 1;
 	extent_op->update_key = 0;
 	extent_op->is_data = is_data ? 1 : 0;
+	extent_op->level = level;
 
 	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
 					  num_bytes, extent_op);
 	if (ret)
-		kfree(extent_op);
+		btrfs_free_delayed_extent_op(extent_op);
 	return ret;
 }
 
@@ -2971,6 +3154,11 @@ again:
 	WARN_ON(ret);
 
 	if (i_size_read(inode) > 0) {
+		ret = btrfs_check_trunc_cache_free_space(root,
+					&root->fs_info->global_block_rsv);
+		if (ret)
+			goto out_put;
+
 		ret = btrfs_truncate_free_space_cache(root, trans, path,
 						      inode);
 		if (ret)
@@ -3170,6 +3358,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	struct btrfs_space_info *found;
 	int i;
 	int factor;
+	int ret;
 
 	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
 		     BTRFS_BLOCK_GROUP_RAID10))
@@ -3193,6 +3382,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (!found)
 		return -ENOMEM;
 
+	ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+	if (ret) {
+		kfree(found);
+		return ret;
+	}
+
 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
 		INIT_LIST_HEAD(&found->block_groups[i]);
 	init_rwsem(&found->groups_sem);
@@ -3223,12 +3418,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 	u64 extra_flags = chunk_to_extended(flags) &
 				BTRFS_EXTENDED_PROFILE_MASK;
 
+	write_seqlock(&fs_info->profiles_lock);
 	if (flags & BTRFS_BLOCK_GROUP_DATA)
 		fs_info->avail_data_alloc_bits |= extra_flags;
 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
 		fs_info->avail_metadata_alloc_bits |= extra_flags;
 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 		fs_info->avail_system_alloc_bits |= extra_flags;
+	write_sequnlock(&fs_info->profiles_lock);
 }
 
 /*
@@ -3266,7 +3463,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
  * progress (either running or paused) picks the target profile (if it's
  * already available), otherwise falls back to plain reducing.
  */
-u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
+static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
 	/*
 	 * we add in the count of missing devices because we want
@@ -3276,6 +3473,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	u64 num_devices = root->fs_info->fs_devices->rw_devices +
 		root->fs_info->fs_devices->missing_devices;
 	u64 target;
+	u64 tmp;
 
 	/*
 	 * see if restripe for this chunk_type is in progress, if so
@@ -3292,40 +3490,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	}
 	spin_unlock(&root->fs_info->balance_lock);
 
+	/* First, mask out the RAID levels which aren't possible */
 	if (num_devices == 1)
-		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+			   BTRFS_BLOCK_GROUP_RAID5);
+	if (num_devices < 3)
+		flags &= ~BTRFS_BLOCK_GROUP_RAID6;
 	if (num_devices < 4)
 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
 
-	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
-	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-		      BTRFS_BLOCK_GROUP_RAID10))) {
-		flags &= ~BTRFS_BLOCK_GROUP_DUP;
-	}
+	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+	flags &= ~tmp;
 
-	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
-	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
-		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
-	}
-
-	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
-	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
-	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
-	     (flags & BTRFS_BLOCK_GROUP_DUP))) {
-		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
-	}
+	if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+		tmp = BTRFS_BLOCK_GROUP_RAID6;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+		tmp = BTRFS_BLOCK_GROUP_RAID5;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+		tmp = BTRFS_BLOCK_GROUP_RAID10;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+		tmp = BTRFS_BLOCK_GROUP_RAID1;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+		tmp = BTRFS_BLOCK_GROUP_RAID0;
 
-	return extended_to_chunk(flags);
+	return extended_to_chunk(flags | tmp);
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-	if (flags & BTRFS_BLOCK_GROUP_DATA)
-		flags |= root->fs_info->avail_data_alloc_bits;
-	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-		flags |= root->fs_info->avail_system_alloc_bits;
-	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-		flags |= root->fs_info->avail_metadata_alloc_bits;
+	unsigned seq;
+
+	do {
+		seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+		if (flags & BTRFS_BLOCK_GROUP_DATA)
+			flags |= root->fs_info->avail_data_alloc_bits;
+		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+			flags |= root->fs_info->avail_system_alloc_bits;
+		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+			flags |= root->fs_info->avail_metadata_alloc_bits;
+	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
 
 	return btrfs_reduce_alloc_profile(root, flags);
 }
@@ -3333,6 +3539,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
 	u64 flags;
+	u64 ret;
 
 	if (data)
 		flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3341,7 +3548,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 	else
 		flags = BTRFS_BLOCK_GROUP_METADATA;
 
-	return get_alloc_profile(root, flags);
+	ret = get_alloc_profile(root, flags);
+	return ret;
 }
 
 /*
@@ -3357,7 +3565,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 	int ret = 0, committed = 0, alloc_chunk = 1;
 
 	/* make sure bytes are sectorsize aligned */
-	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+	bytes = ALIGN(bytes, root->sectorsize);
 
 	if (root == root->fs_info->tree_root ||
 	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
@@ -3412,10 +3620,11 @@ alloc:
 		}
 
 		/*
-		 * If we have less pinned bytes than we want to allocate then
-		 * don't bother committing the transaction, it won't help us.
+		 * If we don't have enough pinned space to deal with this
+		 * allocation don't bother committing the transaction.
 		 */
-		if (data_sinfo->bytes_pinned < bytes)
+		if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
+					   bytes) < 0)
 			committed = 1;
 		spin_unlock(&data_sinfo->lock);
 
@@ -3424,6 +3633,7 @@ commit_trans:
 		if (!committed &&
 		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
 			committed = 1;
+
 			trans = btrfs_join_transaction(root);
 			if (IS_ERR(trans))
 				return PTR_ERR(trans);
@@ -3452,10 +3662,11 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 	struct btrfs_space_info *data_sinfo;
 
 	/* make sure bytes are sectorsize aligned */
-	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+	bytes = ALIGN(bytes, root->sectorsize);
 
 	data_sinfo = root->fs_info->data_sinfo;
 	spin_lock(&data_sinfo->lock);
+	WARN_ON(data_sinfo->bytes_may_use < bytes);
 	data_sinfo->bytes_may_use -= bytes;
 	trace_btrfs_space_reservation(root->fs_info, "space_info",
 				      data_sinfo->flags, bytes, 0);
@@ -3475,6 +3686,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
 	rcu_read_unlock();
 }
 
+static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
+{
+	return (global->size << 1);
+}
+
 static int should_alloc_chunk(struct btrfs_root *root,
 			      struct btrfs_space_info *sinfo, int force)
 {
@@ -3492,7 +3708,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	 * global_rsv, it doesn't change except when the transaction commits.
 	 */
 	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
-		num_allocated += global_rsv->size;
+		num_allocated += calc_global_rsv_need_space(global_rsv);
 
 	/*
 	 * in limited mode, we want to have some free space up to
@@ -3516,8 +3732,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
 {
 	u64 num_dev;
 
-	if (type & BTRFS_BLOCK_GROUP_RAID10 ||
-	    type & BTRFS_BLOCK_GROUP_RAID0)
+	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
+		    BTRFS_BLOCK_GROUP_RAID0 |
+		    BTRFS_BLOCK_GROUP_RAID5 |
+		    BTRFS_BLOCK_GROUP_RAID6))
 		num_dev = root->fs_info->fs_devices->rw_devices;
 	else if (type & BTRFS_BLOCK_GROUP_RAID1)
 		num_dev = 2;
@@ -3543,8 +3761,8 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
 
 	thresh = get_system_chunk_thresh(root, type);
 	if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
-		printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
-		       left, thresh, type);
+		btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
+			left, thresh, type);
 		dump_space_info(info, 0, 0);
 	}
 
@@ -3564,6 +3782,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	int wait_for_alloc = 0;
 	int ret = 0;
 
+	/* Don't re-enter if we're already allocating a chunk */
+	if (trans->allocating_chunk)
+		return -ENOSPC;
+
 	space_info = __find_space_info(extent_root->fs_info, flags);
 	if (!space_info) {
 		ret = update_space_info(extent_root->fs_info, flags,
@@ -3606,6 +3828,8 @@ again:
 		goto again;
 	}
 
+	trans->allocating_chunk = true;
+
 	/*
 	 * If we have mixed data/metadata chunks we want to make sure we keep
 	 * allocating mixed chunks instead of individual chunks.
@@ -3632,19 +3856,20 @@ again:
 	check_system_chunk(trans, extent_root, flags);
 
 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
-	if (ret < 0 && ret != -ENOSPC)
-		goto out;
+	trans->allocating_chunk = false;
 
 	spin_lock(&space_info->lock);
+	if (ret < 0 && ret != -ENOSPC)
+		goto out;
 	if (ret)
 		space_info->full = 1;
 	else
 		ret = 1;
 
 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+out:
 	space_info->chunk_alloc = 0;
 	spin_unlock(&space_info->lock);
-out:
 	mutex_unlock(&fs_info->chunk_mutex);
 	return ret;
 }
@@ -3653,13 +3878,29 @@ static int can_overcommit(struct btrfs_root *root,
 			  struct btrfs_space_info *space_info, u64 bytes,
 			  enum btrfs_reserve_flush_enum flush)
 {
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 	u64 profile = btrfs_get_alloc_profile(root, 0);
+	u64 space_size;
 	u64 avail;
 	u64 used;
+	u64 to_add;
 
 	used = space_info->bytes_used + space_info->bytes_reserved +
-		space_info->bytes_pinned + space_info->bytes_readonly +
-		space_info->bytes_may_use;
+		space_info->bytes_pinned + space_info->bytes_readonly;
+
+	/*
+	 * We only want to allow over committing if we have lots of actual space
+	 * free, but if we don't have enough space to handle the global reserve
+	 * space then we could end up having a real enospc problem when trying
+	 * to allocate a chunk or some other such important allocation.
+	 */
+	spin_lock(&global_rsv->lock);
+	space_size = calc_global_rsv_need_space(global_rsv);
+	spin_unlock(&global_rsv->lock);
+	if (used + space_size >= space_info->total_bytes)
+		return 0;
+
+	used += space_info->bytes_may_use;
 
 	spin_lock(&root->fs_info->free_chunk_lock);
 	avail = root->fs_info->free_chunk_space;
@@ -3667,40 +3908,58 @@ static int can_overcommit(struct btrfs_root *root,
 
 	/*
 	 * If we have dup, raid1 or raid10 then only half of the free
-	 * space is actually useable.
+	 * space is actually useable.  For raid56, the space info used
+	 * doesn't include the parity drive, so we don't have to
+	 * change the math
 	 */
 	if (profile & (BTRFS_BLOCK_GROUP_DUP |
 		       BTRFS_BLOCK_GROUP_RAID1 |
 		       BTRFS_BLOCK_GROUP_RAID10))
 		avail >>= 1;
 
+	to_add = space_info->total_bytes;
+
 	/*
 	 * If we aren't flushing all things, let us overcommit up to
 	 * 1/2th of the space. If we can flush, don't let us overcommit
 	 * too much, let it overcommit up to 1/8 of the space.
 	 */
 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
-		avail >>= 3;
+		to_add >>= 3;
 	else
-		avail >>= 1;
+		to_add >>= 1;
 
-	if (used + bytes < space_info->total_bytes + avail)
+	/*
+	 * Limit the overcommit to the amount of free space we could possibly
+	 * allocate for chunks.
+	 */
+	to_add = min(avail, to_add);
+
+	if (used + bytes < space_info->total_bytes + to_add)
 		return 1;
 	return 0;
 }
 
-static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
-					       unsigned long nr_pages,
-					       enum wb_reason reason)
+static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
+					 unsigned long nr_pages)
 {
-	if (!writeback_in_progress(sb->s_bdi) &&
-	    down_read_trylock(&sb->s_umount)) {
-		writeback_inodes_sb_nr(sb, nr_pages, reason);
+	struct super_block *sb = root->fs_info->sb;
+
+	if (down_read_trylock(&sb->s_umount)) {
+		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
 		up_read(&sb->s_umount);
-		return 1;
+	} else {
+		/*
+		 * We needn't worry the filesystem going from r/w to r/o though
+		 * we don't acquire ->s_umount mutex, because the filesystem
+		 * should guarantee the delalloc inodes list be empty after
+		 * the filesystem is readonly(all dirty pages are written to
+		 * the disk).
+		 */
+		btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+		if (!current->journal_info)
+			btrfs_wait_all_ordered_extents(root->fs_info, 0);
 	}
-
-	return 0;
 }
 
 /*
@@ -3724,21 +3983,19 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	space_info = block_rsv->space_info;
 
 	smp_mb();
-	delalloc_bytes = root->fs_info->delalloc_bytes;
+	delalloc_bytes = percpu_counter_sum_positive(
+						&root->fs_info->delalloc_bytes);
 	if (delalloc_bytes == 0) {
 		if (trans)
 			return;
-		btrfs_wait_ordered_extents(root, 0);
+		btrfs_wait_all_ordered_extents(root->fs_info, 0);
 		return;
 	}
 
 	while (delalloc_bytes && loops < 3) {
 		max_reclaim = min(delalloc_bytes, to_reclaim);
 		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
-		writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
-						    nr_pages,
-						    WB_REASON_FS_FREE_SPACE);
-
+		btrfs_writeback_inodes_sb_nr(root, nr_pages);
 		/*
 		 * We need to wait for the async pages to actually start before
 		 * we do anything.
@@ -3759,14 +4016,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 
 		loops++;
 		if (wait_ordered && !trans) {
-			btrfs_wait_ordered_extents(root, 0);
+			btrfs_wait_all_ordered_extents(root->fs_info, 0);
 		} else {
 			time_left = schedule_timeout_killable(1);
 			if (time_left)
 				break;
 		}
 		smp_mb();
-		delalloc_bytes = root->fs_info->delalloc_bytes;
+		delalloc_bytes = percpu_counter_sum_positive(
+						&root->fs_info->delalloc_bytes);
 	}
 }
 
@@ -3796,7 +4054,8 @@ static int may_commit_transaction(struct btrfs_root *root,
 
 	/* See if there is enough pinned space to make this reservation */
 	spin_lock(&space_info->lock);
-	if (space_info->bytes_pinned >= bytes) {
+	if (percpu_counter_compare(&space_info->total_bytes_pinned,
+				   bytes) >= 0) {
 		spin_unlock(&space_info->lock);
 		goto commit;
 	}
@@ -3811,7 +4070,8 @@ static int may_commit_transaction(struct btrfs_root *root,
 
 	spin_lock(&space_info->lock);
 	spin_lock(&delayed_rsv->lock);
-	if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
+	if (percpu_counter_compare(&space_info->total_bytes_pinned,
+				   bytes - delayed_rsv->size) >= 0) {
 		spin_unlock(&delayed_rsv->lock);
 		spin_unlock(&space_info->lock);
 		return -ENOSPC;
@@ -4030,6 +4290,15 @@ again:
 		goto again;
 
 out:
+	if (ret == -ENOSPC &&
+	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+		struct btrfs_block_rsv *global_rsv =
+			&root->fs_info->global_block_rsv;
+
+		if (block_rsv != global_rsv &&
+		    !block_rsv_use_bytes(global_rsv, orig_bytes))
+			ret = 0;
+	}
 	if (flushing) {
 		spin_lock(&space_info->lock);
 		space_info->flush = 0;
@@ -4087,6 +4356,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
 	spin_unlock(&block_rsv->lock);
 }
 
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_rsv *dest, u64 num_bytes,
+			     int min_factor)
+{
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	u64 min_bytes;
+
+	if (global_rsv->space_info != dest->space_info)
+		return -ENOSPC;
+
+	spin_lock(&global_rsv->lock);
+	min_bytes = div_factor(global_rsv->size, min_factor);
+	if (global_rsv->reserved < min_bytes + num_bytes) {
+		spin_unlock(&global_rsv->lock);
+		return -ENOSPC;
+	}
+	global_rsv->reserved -= num_bytes;
+	if (global_rsv->reserved < global_rsv->size)
+		global_rsv->full = 0;
+	spin_unlock(&global_rsv->lock);
+
+	block_rsv_add_bytes(dest, num_bytes, 1);
+	return 0;
+}
+
 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 				    struct btrfs_block_rsv *block_rsv,
 				    struct btrfs_block_rsv *dest, u64 num_bytes)
@@ -4308,7 +4602,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 	spin_lock(&sinfo->lock);
 	spin_lock(&block_rsv->lock);
 
-	block_rsv->size = num_bytes;
+	block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
 
 	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
 		    sinfo->bytes_reserved + sinfo->bytes_readonly +
@@ -4354,6 +4648,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+	if (fs_info->quota_root)
+		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
 
 	update_global_block_rsv(fs_info);
@@ -4416,19 +4712,60 @@ void btrfs_orphan_release_metadata(struct inode *inode)
 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
 }
 
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending)
+/*
+ * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * root: the root of the parent directory
+ * rsv: block reservation
+ * items: the number of items that we need do reservation
+ * qgroup_reserved: used to return the reserved size in qgroup
+ *
+ * This function is used to reserve the space for snapshot/subvolume
+ * creation and deletion. Those operations are different with the
+ * common file/directory operations, they change two fs/file trees
+ * and root tree, the number of items that the qgroup reserves is
+ * different with the free space reservation. So we can not use
+ * the space reseravtion mechanism in start_transaction().
+ */
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+				     struct btrfs_block_rsv *rsv,
+				     int items,
+				     u64 *qgroup_reserved)
 {
-	struct btrfs_root *root = pending->root;
-	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
-	struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
-	/*
-	 * two for root back/forward refs, two for directory entries,
-	 * one for root of the snapshot and one for parent inode.
-	 */
-	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
-	dst_rsv->space_info = src_rsv->space_info;
-	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+	u64 num_bytes;
+	int ret;
+
+	if (root->fs_info->quota_enabled) {
+		/* One for parent inode, two for dir entries */
+		num_bytes = 3 * root->leafsize;
+		ret = btrfs_qgroup_reserve(root, num_bytes);
+		if (ret)
+			return ret;
+	} else {
+		num_bytes = 0;
+	}
+
+	*qgroup_reserved = num_bytes;
+
+	num_bytes = btrfs_calc_trans_metadata_size(root, items);
+	rsv->space_info = __find_space_info(root->fs_info,
+					    BTRFS_BLOCK_GROUP_METADATA);
+	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+				  BTRFS_RESERVE_FLUSH_ALL);
+	if (ret) {
+		if (*qgroup_reserved)
+			btrfs_qgroup_free(root, *qgroup_reserved);
+	}
+
+	return ret;
+}
+
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+				      struct btrfs_block_rsv *rsv,
+				      u64 qgroup_reserved)
+{
+	btrfs_block_rsv_release(root, rsv, (u64)-1);
+	if (qgroup_reserved)
+		btrfs_qgroup_free(root, qgroup_reserved);
 }
 
 /**
@@ -4536,6 +4873,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
 	int ret = 0;
 	bool delalloc_lock = true;
+	u64 to_free = 0;
+	unsigned dropped;
 
 	/* If we are a free space inode we need to not flush since we will be in
 	 * the middle of a transaction commit.  We also don't need the delalloc
@@ -4579,54 +4918,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	csum_bytes = BTRFS_I(inode)->csum_bytes;
 	spin_unlock(&BTRFS_I(inode)->lock);
 
-	if (root->fs_info->quota_enabled)
+	if (root->fs_info->quota_enabled) {
 		ret = btrfs_qgroup_reserve(root, num_bytes +
 					   nr_extents * root->leafsize);
+		if (ret)
+			goto out_fail;
+	}
 
-	/*
-	 * ret != 0 here means the qgroup reservation failed, we go straight to
-	 * the shared error handling then.
-	 */
-	if (ret == 0)
-		ret = reserve_metadata_bytes(root, block_rsv,
-					     to_reserve, flush);
-
-	if (ret) {
-		u64 to_free = 0;
-		unsigned dropped;
-
-		spin_lock(&BTRFS_I(inode)->lock);
-		dropped = drop_outstanding_extent(inode);
-		/*
-		 * If the inodes csum_bytes is the same as the original
-		 * csum_bytes then we know we haven't raced with any free()ers
-		 * so we can just reduce our inodes csum bytes and carry on.
-		 * Otherwise we have to do the normal free thing to account for
-		 * the case that the free side didn't free up its reserve
-		 * because of this outstanding reservation.
-		 */
-		if (BTRFS_I(inode)->csum_bytes == csum_bytes)
-			calc_csum_metadata_size(inode, num_bytes, 0);
-		else
-			to_free = calc_csum_metadata_size(inode, num_bytes, 0);
-		spin_unlock(&BTRFS_I(inode)->lock);
-		if (dropped)
-			to_free += btrfs_calc_trans_metadata_size(root, dropped);
-
-		if (to_free) {
-			btrfs_block_rsv_release(root, block_rsv, to_free);
-			trace_btrfs_space_reservation(root->fs_info,
-						      "delalloc",
-						      btrfs_ino(inode),
-						      to_free, 0);
-		}
-		if (root->fs_info->quota_enabled) {
+	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+	if (unlikely(ret)) {
+		if (root->fs_info->quota_enabled)
 			btrfs_qgroup_free(root, num_bytes +
 						nr_extents * root->leafsize);
-		}
-		if (delalloc_lock)
-			mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
-		return ret;
+		goto out_fail;
 	}
 
 	spin_lock(&BTRFS_I(inode)->lock);
@@ -4647,6 +4951,69 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
 	return 0;
+
+out_fail:
+	spin_lock(&BTRFS_I(inode)->lock);
+	dropped = drop_outstanding_extent(inode);
+	/*
+	 * If the inodes csum_bytes is the same as the original
+	 * csum_bytes then we know we haven't raced with any free()ers
+	 * so we can just reduce our inodes csum bytes and carry on.
+	 */
+	if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
+		calc_csum_metadata_size(inode, num_bytes, 0);
+	} else {
+		u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
+		u64 bytes;
+
+		/*
+		 * This is tricky, but first we need to figure out how much we
+		 * free'd from any free-ers that occured during this
+		 * reservation, so we reset ->csum_bytes to the csum_bytes
+		 * before we dropped our lock, and then call the free for the
+		 * number of bytes that were freed while we were trying our
+		 * reservation.
+		 */
+		bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
+		BTRFS_I(inode)->csum_bytes = csum_bytes;
+		to_free = calc_csum_metadata_size(inode, bytes, 0);
+
+
+		/*
+		 * Now we need to see how much we would have freed had we not
+		 * been making this reservation and our ->csum_bytes were not
+		 * artificially inflated.
+		 */
+		BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
+		bytes = csum_bytes - orig_csum_bytes;
+		bytes = calc_csum_metadata_size(inode, bytes, 0);
+
+		/*
+		 * Now reset ->csum_bytes to what it should be.  If bytes is
+		 * more than to_free then we would have free'd more space had we
+		 * not had an artificially high ->csum_bytes, so we need to free
+		 * the remainder.  If bytes is the same or less then we don't
+		 * need to do anything, the other free-ers did the correct
+		 * thing.
+		 */
+		BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
+		if (bytes > to_free)
+			to_free = bytes - to_free;
+		else
+			to_free = 0;
+	}
+	spin_unlock(&BTRFS_I(inode)->lock);
+	if (dropped)
+		to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
+	if (to_free) {
+		btrfs_block_rsv_release(root, block_rsv, to_free);
+		trace_btrfs_space_reservation(root->fs_info, "delalloc",
+					      btrfs_ino(inode), to_free, 0);
+	}
+	if (delalloc_lock)
+		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+	return ret;
 }
 
 /**
@@ -4668,7 +5035,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 	spin_lock(&BTRFS_I(inode)->lock);
 	dropped = drop_outstanding_extent(inode);
 
-	to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+	if (num_bytes)
+		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
 	spin_unlock(&BTRFS_I(inode)->lock);
 	if (dropped > 0)
 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
@@ -4735,8 +5103,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
 	btrfs_free_reserved_data_space(inode, num_bytes);
 }
 
-static int update_block_group(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc)
 {
 	struct btrfs_block_group_cache *cache = NULL;
@@ -4747,14 +5114,14 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	int factor;
 
 	/* block accounting for super block */
-	spin_lock(&info->delalloc_lock);
+	spin_lock(&info->delalloc_root_lock);
 	old_val = btrfs_super_bytes_used(info->super_copy);
 	if (alloc)
 		old_val += num_bytes;
 	else
 		old_val -= num_bytes;
 	btrfs_set_super_bytes_used(info->super_copy, old_val);
-	spin_unlock(&info->delalloc_lock);
+	spin_unlock(&info->delalloc_root_lock);
 
 	while (total) {
 		cache = btrfs_lookup_block_group(info, bytenr);
@@ -4773,7 +5140,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		 * space back to the block group, otherwise we will leak space.
 		 */
 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
-			cache_block_group(cache, trans, NULL, 1);
+			cache_block_group(cache, 1);
 
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
@@ -4823,6 +5190,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 	struct btrfs_block_group_cache *cache;
 	u64 bytenr;
 
+	spin_lock(&root->fs_info->block_group_cache_lock);
+	bytenr = root->fs_info->first_logical_byte;
+	spin_unlock(&root->fs_info->block_group_cache_lock);
+
+	if (bytenr < (u64)-1)
+		return bytenr;
+
 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
 	if (!cache)
 		return 0;
@@ -4873,14 +5247,15 @@ int btrfs_pin_extent(struct btrfs_root *root,
 /*
  * this function must be called within transaction
  */
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
 				    u64 bytenr, u64 num_bytes)
 {
 	struct btrfs_block_group_cache *cache;
+	int ret;
 
 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
-	BUG_ON(!cache); /* Logic error */
+	if (!cache)
+		return -EINVAL;
 
 	/*
 	 * pull in the free space cache (if any) so that our pin
@@ -4888,13 +5263,87 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
 	 * to one because the slow code to read in the free extents does check
 	 * the pinned extents.
 	 */
-	cache_block_group(cache, trans, root, 1);
+	cache_block_group(cache, 1);
 
 	pin_down_extent(root, cache, bytenr, num_bytes, 0);
 
 	/* remove us from the free space cache (if we're there at all) */
-	btrfs_remove_free_space(cache, bytenr, num_bytes);
+	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
 	btrfs_put_block_group(cache);
+	return ret;
+}
+
+static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
+{
+	int ret;
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_caching_control *caching_ctl;
+
+	block_group = btrfs_lookup_block_group(root->fs_info, start);
+	if (!block_group)
+		return -EINVAL;
+
+	cache_block_group(block_group, 0);
+	caching_ctl = get_caching_control(block_group);
+
+	if (!caching_ctl) {
+		/* Logic error */
+		BUG_ON(!block_group_cache_done(block_group));
+		ret = btrfs_remove_free_space(block_group, start, num_bytes);
+	} else {
+		mutex_lock(&caching_ctl->mutex);
+
+		if (start >= caching_ctl->progress) {
+			ret = add_excluded_extent(root, start, num_bytes);
+		} else if (start + num_bytes <= caching_ctl->progress) {
+			ret = btrfs_remove_free_space(block_group,
+						      start, num_bytes);
+		} else {
+			num_bytes = caching_ctl->progress - start;
+			ret = btrfs_remove_free_space(block_group,
+						      start, num_bytes);
+			if (ret)
+				goto out_lock;
+
+			num_bytes = (start + num_bytes) -
+				caching_ctl->progress;
+			start = caching_ctl->progress;
+			ret = add_excluded_extent(root, start, num_bytes);
+		}
+out_lock:
+		mutex_unlock(&caching_ctl->mutex);
+		put_caching_control(caching_ctl);
+	}
+	btrfs_put_block_group(block_group);
+	return ret;
+}
+
+int btrfs_exclude_logged_extents(struct btrfs_root *log,
+				 struct extent_buffer *eb)
+{
+	struct btrfs_file_extent_item *item;
+	struct btrfs_key key;
+	int found_type;
+	int i;
+
+	if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
+		return 0;
+
+	for (i = 0; i < btrfs_header_nritems(eb); i++) {
+		btrfs_item_key_to_cpu(eb, &key, i);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+		found_type = btrfs_file_extent_type(eb, item);
+		if (found_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+			continue;
+		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+		__exclude_logged_extent(log, key.objectid, key.offset);
+	}
+
 	return 0;
 }
 
@@ -4960,6 +5409,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 	struct btrfs_caching_control *next;
 	struct btrfs_caching_control *caching_ctl;
 	struct btrfs_block_group_cache *cache;
+	struct btrfs_space_info *space_info;
 
 	down_write(&fs_info->extent_commit_sem);
 
@@ -4982,6 +5432,9 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 
 	up_write(&fs_info->extent_commit_sem);
 
+	list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
+		percpu_counter_set(&space_info->total_bytes_pinned, 0);
+
 	update_global_block_rsv(fs_info);
 }
 
@@ -5079,6 +5532,27 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
+			     u64 owner, u64 root_objectid)
+{
+	struct btrfs_space_info *space_info;
+	u64 flags;
+
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
+			flags = BTRFS_BLOCK_GROUP_SYSTEM;
+		else
+			flags = BTRFS_BLOCK_GROUP_METADATA;
+	} else {
+		flags = BTRFS_BLOCK_GROUP_DATA;
+	}
+
+	space_info = __find_space_info(fs_info, flags);
+	BUG_ON(!space_info); /* Logic bug */
+	percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
+}
+
+
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 parent,
@@ -5100,6 +5574,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	int num_to_del = 1;
 	u32 item_size;
 	u64 refs;
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -5111,6 +5587,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
 	BUG_ON(!is_data && refs_to_drop != 1);
 
+	if (is_data)
+		skinny_metadata = 0;
+
 	ret = lookup_extent_backref(trans, extent_root, path, &iref,
 				    bytenr, num_bytes, parent,
 				    root_objectid, owner_objectid,
@@ -5127,6 +5606,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				found_extent = 1;
 				break;
 			}
+			if (key.type == BTRFS_METADATA_ITEM_KEY &&
+			    key.offset == owner_objectid) {
+				found_extent = 1;
+				break;
+			}
 			if (path->slots[0] - extent_slot > 5)
 				break;
 			extent_slot--;
@@ -5152,12 +5636,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			key.type = BTRFS_EXTENT_ITEM_KEY;
 			key.offset = num_bytes;
 
+			if (!is_data && skinny_metadata) {
+				key.type = BTRFS_METADATA_ITEM_KEY;
+				key.offset = owner_objectid;
+			}
+
 			ret = btrfs_search_slot(trans, extent_root,
 						&key, path, -1, 1);
+			if (ret > 0 && skinny_metadata && path->slots[0]) {
+				/*
+				 * Couldn't find our skinny metadata item,
+				 * see if we have ye olde extent item.
+				 */
+				path->slots[0]--;
+				btrfs_item_key_to_cpu(path->nodes[0], &key,
+						      path->slots[0]);
+				if (key.objectid == bytenr &&
+				    key.type == BTRFS_EXTENT_ITEM_KEY &&
+				    key.offset == num_bytes)
+					ret = 0;
+			}
+
+			if (ret > 0 && skinny_metadata) {
+				skinny_metadata = false;
+				key.type = BTRFS_EXTENT_ITEM_KEY;
+				key.offset = num_bytes;
+				btrfs_release_path(path);
+				ret = btrfs_search_slot(trans, extent_root,
+							&key, path, -1, 1);
+			}
+
 			if (ret) {
-				printk(KERN_ERR "umm, got %d back from search"
-				       ", was looking for %llu\n", ret,
-				       (unsigned long long)bytenr);
+				btrfs_err(info, "umm, got %d back from search, was looking for %llu",
+					ret, (unsigned long long)bytenr);
 				if (ret > 0)
 					btrfs_print_leaf(extent_root,
 							 path->nodes[0]);
@@ -5171,13 +5682,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	} else if (ret == -ENOENT) {
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
-		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
-		       "parent %llu root %llu  owner %llu offset %llu\n",
-		       (unsigned long long)bytenr,
-		       (unsigned long long)parent,
-		       (unsigned long long)root_objectid,
-		       (unsigned long long)owner_objectid,
-		       (unsigned long long)owner_offset);
+		btrfs_err(info,
+			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
+			(unsigned long long)bytenr,
+			(unsigned long long)parent,
+			(unsigned long long)root_objectid,
+			(unsigned long long)owner_objectid,
+			(unsigned long long)owner_offset);
 	} else {
 		btrfs_abort_transaction(trans, extent_root, ret);
 		goto out;
@@ -5205,9 +5716,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		ret = btrfs_search_slot(trans, extent_root, &key, path,
 					-1, 1);
 		if (ret) {
-			printk(KERN_ERR "umm, got %d back from search"
-			       ", was looking for %llu\n", ret,
-			       (unsigned long long)bytenr);
+			btrfs_err(info, "umm, got %d back from search, was looking for %llu",
+				ret, (unsigned long long)bytenr);
 			btrfs_print_leaf(extent_root, path->nodes[0]);
 		}
 		if (ret < 0) {
@@ -5223,7 +5733,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	BUG_ON(item_size < sizeof(*ei));
 	ei = btrfs_item_ptr(leaf, extent_slot,
 			    struct btrfs_extent_item);
-	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
+	    key.type == BTRFS_EXTENT_ITEM_KEY) {
 		struct btrfs_tree_block_info *bi;
 		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
 		bi = (struct btrfs_tree_block_info *)(ei + 1);
@@ -5231,7 +5742,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	}
 
 	refs = btrfs_extent_refs(leaf, ei);
-	BUG_ON(refs < refs_to_drop);
+	if (refs < refs_to_drop) {
+		btrfs_err(info, "trying to drop %d refs but we only have %Lu "
+			  "for bytenr %Lu\n", refs_to_drop, refs, bytenr);
+		ret = -EINVAL;
+		btrfs_abort_transaction(trans, extent_root, ret);
+		goto out;
+	}
 	refs -= refs_to_drop;
 
 	if (refs > 0) {
@@ -5256,6 +5773,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				goto out;
 			}
 		}
+		add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
+				 root_objectid);
 	} else {
 		if (found_extent) {
 			BUG_ON(is_data && refs_to_drop !=
@@ -5285,7 +5804,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			}
 		}
 
-		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
+		ret = update_block_group(root, bytenr, num_bytes, 0);
 		if (ret) {
 			btrfs_abort_transaction(trans, extent_root, ret);
 			goto out;
@@ -5330,7 +5849,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	if (head->extent_op) {
 		if (!head->must_insert_reserved)
 			goto out;
-		kfree(head->extent_op);
+		btrfs_free_delayed_extent_op(head->extent_op);
 		head->extent_op = NULL;
 	}
 
@@ -5379,6 +5898,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 			   u64 parent, int last_ref)
 {
 	struct btrfs_block_group_cache *cache = NULL;
+	int pin = 1;
 	int ret;
 
 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -5411,8 +5931,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 
 		btrfs_add_free_space(cache, buf->start, buf->len);
 		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+		pin = 0;
 	}
 out:
+	if (pin)
+		add_pinned_bytes(root->fs_info, buf->len,
+				 btrfs_header_level(buf),
+				 root->root_key.objectid);
+
 	/*
 	 * Deleting the buffer, clear the corrupt flag since it doesn't matter
 	 * anymore.
@@ -5429,6 +5955,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int ret;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
+	add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
+
 	/*
 	 * tree log blocks never actually go into the extent allocation
 	 * tree, just update pinning info and exit early.
@@ -5453,10 +5981,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return ret;
 }
 
-static u64 stripe_align(struct btrfs_root *root, u64 val)
+static u64 stripe_align(struct btrfs_root *root,
+			struct btrfs_block_group_cache *cache,
+			u64 val, u64 num_bytes)
 {
-	u64 mask = ((u64)root->stripesize - 1);
-	u64 ret = (val + mask) & ~mask;
+	u64 ret = ALIGN(val, root->stripesize);
 	return ret;
 }
 
@@ -5476,7 +6005,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
 				u64 num_bytes)
 {
 	struct btrfs_caching_control *caching_ctl;
-	DEFINE_WAIT(wait);
 
 	caching_ctl = get_caching_control(cache);
 	if (!caching_ctl)
@@ -5493,7 +6021,6 @@ static noinline int
 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 {
 	struct btrfs_caching_control *caching_ctl;
-	DEFINE_WAIT(wait);
 
 	caching_ctl = get_caching_control(cache);
 	if (!caching_ctl)
@@ -5507,20 +6034,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 
 int __get_raid_index(u64 flags)
 {
-	int index;
-
 	if (flags & BTRFS_BLOCK_GROUP_RAID10)
-		index = 0;
+		return BTRFS_RAID_RAID10;
 	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
-		index = 1;
+		return BTRFS_RAID_RAID1;
 	else if (flags & BTRFS_BLOCK_GROUP_DUP)
-		index = 2;
+		return BTRFS_RAID_DUP;
 	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
-		index = 3;
-	else
-		index = 4;
+		return BTRFS_RAID_RAID0;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+		return BTRFS_RAID_RAID5;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+		return BTRFS_RAID_RAID6;
 
-	return index;
+	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
 }
 
 static int get_block_group_index(struct btrfs_block_group_cache *cache)
@@ -5547,7 +6074,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *orig_root,
 				     u64 num_bytes, u64 empty_size,
 				     u64 hint_byte, struct btrfs_key *ins,
-				     u64 data)
+				     u64 flags)
 {
 	int ret = 0;
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -5558,8 +6085,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	int empty_cluster = 2 * 1024 * 1024;
 	struct btrfs_space_info *space_info;
 	int loop = 0;
-	int index = __get_raid_index(data);
-	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
+	int index = __get_raid_index(flags);
+	int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
 		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
 	bool found_uncached_bg = false;
 	bool failed_cluster_refill = false;
@@ -5572,11 +6099,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	ins->objectid = 0;
 	ins->offset = 0;
 
-	trace_find_free_extent(orig_root, num_bytes, empty_size, data);
+	trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
 
-	space_info = __find_space_info(root->fs_info, data);
+	space_info = __find_space_info(root->fs_info, flags);
 	if (!space_info) {
-		printk(KERN_ERR "No space info for %llu\n", data);
+		btrfs_err(root->fs_info, "No space info for %llu", flags);
 		return -ENOSPC;
 	}
 
@@ -5587,13 +6114,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	if (btrfs_mixed_space_info(space_info))
 		use_cluster = false;
 
-	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
+	if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
 		last_ptr = &root->fs_info->meta_alloc_cluster;
 		if (!btrfs_test_opt(root, SSD))
 			empty_cluster = 64 * 1024;
 	}
 
-	if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
+	if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
 	    btrfs_test_opt(root, SSD)) {
 		last_ptr = &root->fs_info->data_alloc_cluster;
 	}
@@ -5622,7 +6149,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 		 * However if we are re-searching with an ideal block group
 		 * picked out then we don't care that the block group is cached.
 		 */
-		if (block_group && block_group_bits(block_group, data) &&
+		if (block_group && block_group_bits(block_group, flags) &&
 		    block_group->cached != BTRFS_CACHE_NO) {
 			down_read(&space_info->groups_sem);
 			if (list_empty(&block_group->list) ||
@@ -5660,9 +6187,11 @@ search:
 		 * raid types, but we want to make sure we only allocate
 		 * for the proper type.
 		 */
-		if (!block_group_bits(block_group, data)) {
+		if (!block_group_bits(block_group, flags)) {
 		    u64 extra = BTRFS_BLOCK_GROUP_DUP |
 				BTRFS_BLOCK_GROUP_RAID1 |
+				BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6 |
 				BTRFS_BLOCK_GROUP_RAID10;
 
 			/*
@@ -5670,7 +6199,7 @@ search:
 			 * doesn't provide them, bail.  This does allow us to
 			 * fill raid0 from raid1.
 			 */
-			if ((data & extra) && !(block_group->flags & extra))
+			if ((flags & extra) && !(block_group->flags & extra))
 				goto loop;
 		}
 
@@ -5678,8 +6207,7 @@ have_block_group:
 		cached = block_group_cache_done(block_group);
 		if (unlikely(!cached)) {
 			found_uncached_bg = true;
-			ret = cache_block_group(block_group, trans,
-						orig_root, 0);
+			ret = cache_block_group(block_group, 0);
 			BUG_ON(ret < 0);
 			ret = 0;
 		}
@@ -5692,6 +6220,7 @@ have_block_group:
 		 * lets look there
 		 */
 		if (last_ptr) {
+			unsigned long aligned_cluster;
 			/*
 			 * the refill lock keeps out other
 			 * people trying to start a new cluster
@@ -5701,7 +6230,7 @@ have_block_group:
 			if (used_block_group != block_group &&
 			    (!used_block_group ||
 			     used_block_group->ro ||
-			     !block_group_bits(used_block_group, data))) {
+			     !block_group_bits(used_block_group, flags))) {
 				used_block_group = block_group;
 				goto refill_cluster;
 			}
@@ -5758,11 +6287,15 @@ refill_cluster:
 				goto unclustered_alloc;
 			}
 
+			aligned_cluster = max_t(unsigned long,
+						empty_cluster + empty_size,
+					      block_group->full_stripe_len);
+
 			/* allocate a cluster in this block group */
 			ret = btrfs_find_space_cluster(trans, root,
 					       block_group, last_ptr,
 					       search_start, num_bytes,
-					       empty_cluster + empty_size);
+					       aligned_cluster);
 			if (ret == 0) {
 				/*
 				 * now pull our allocation out of this
@@ -5833,7 +6366,8 @@ unclustered_alloc:
 			goto loop;
 		}
 checks:
-		search_start = stripe_align(root, offset);
+		search_start = stripe_align(root, used_block_group,
+					    offset, num_bytes);
 
 		/* move on to the next group */
 		if (search_start + num_bytes >
@@ -5892,7 +6426,7 @@ loop:
 		index = 0;
 		loop++;
 		if (loop == LOOP_ALLOC_CHUNK) {
-			ret = do_chunk_alloc(trans, root, data,
+			ret = do_chunk_alloc(trans, root, flags,
 					     CHUNK_ALLOC_FORCE);
 			/*
 			 * Do not bail out on ENOSPC since we
@@ -5970,21 +6504,22 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 num_bytes, u64 min_alloc_size,
 			 u64 empty_size, u64 hint_byte,
-			 struct btrfs_key *ins, u64 data)
+			 struct btrfs_key *ins, int is_data)
 {
 	bool final_tried = false;
+	u64 flags;
 	int ret;
 
-	data = btrfs_get_alloc_profile(root, data);
+	flags = btrfs_get_alloc_profile(root, is_data);
 again:
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(trans, root, num_bytes, empty_size,
-			       hint_byte, ins, data);
+			       hint_byte, ins, flags);
 
 	if (ret == -ENOSPC) {
 		if (!final_tried) {
 			num_bytes = num_bytes >> 1;
-			num_bytes = num_bytes & ~(root->sectorsize - 1);
+			num_bytes = round_down(num_bytes, root->sectorsize);
 			num_bytes = max(num_bytes, min_alloc_size);
 			if (num_bytes == min_alloc_size)
 				final_tried = true;
@@ -5992,10 +6527,10 @@ again:
 		} else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
 			struct btrfs_space_info *sinfo;
 
-			sinfo = __find_space_info(root->fs_info, data);
-			printk(KERN_ERR "btrfs allocation failed flags %llu, "
-			       "wanted %llu\n", (unsigned long long)data,
-			       (unsigned long long)num_bytes);
+			sinfo = __find_space_info(root->fs_info, flags);
+			btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
+				(unsigned long long)flags,
+				(unsigned long long)num_bytes);
 			if (sinfo)
 				dump_space_info(sinfo, num_bytes, 1);
 		}
@@ -6014,8 +6549,8 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
 
 	cache = btrfs_lookup_block_group(root->fs_info, start);
 	if (!cache) {
-		printk(KERN_ERR "Unable to find block group for %llu\n",
-		       (unsigned long long)start);
+		btrfs_err(root->fs_info, "Unable to find block group for %llu",
+			(unsigned long long)start);
 		return -ENOSPC;
 	}
 
@@ -6108,11 +6643,11 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 
-	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+	ret = update_block_group(root, ins->objectid, ins->offset, 1);
 	if (ret) { /* -ENOENT, logic error */
-		printk(KERN_ERR "btrfs update block group failed for %llu "
-		       "%llu\n", (unsigned long long)ins->objectid,
-		       (unsigned long long)ins->offset);
+		btrfs_err(fs_info, "update block group failed for %llu %llu",
+			(unsigned long long)ins->objectid,
+			(unsigned long long)ins->offset);
 		BUG();
 	}
 	return ret;
@@ -6131,7 +6666,12 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_inline_ref *iref;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
-	u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
+	u32 size = sizeof(*extent_item) + sizeof(*iref);
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
+
+	if (!skinny_metadata)
+		size += sizeof(*block_info);
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -6152,12 +6692,16 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
 	btrfs_set_extent_flags(leaf, extent_item,
 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
-	block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
 
-	btrfs_set_tree_block_key(leaf, block_info, key);
-	btrfs_set_tree_block_level(leaf, block_info, level);
+	if (skinny_metadata) {
+		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+	} else {
+		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
+		btrfs_set_tree_block_key(leaf, block_info, key);
+		btrfs_set_tree_block_level(leaf, block_info, level);
+		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+	}
 
-	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
 	if (parent > 0) {
 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
 		btrfs_set_extent_inline_ref_type(leaf, iref,
@@ -6172,11 +6716,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_free_path(path);
 
-	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+	ret = update_block_group(root, ins->objectid, root->leafsize, 1);
 	if (ret) { /* -ENOENT, logic error */
-		printk(KERN_ERR "btrfs update block group failed for %llu "
-		       "%llu\n", (unsigned long long)ins->objectid,
-		       (unsigned long long)ins->offset);
+		btrfs_err(fs_info, "update block group failed for %llu %llu",
+			(unsigned long long)ins->objectid,
+			(unsigned long long)ins->offset);
 		BUG();
 	}
 	return ret;
@@ -6210,58 +6754,33 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	struct btrfs_block_group_cache *block_group;
-	struct btrfs_caching_control *caching_ctl;
-	u64 start = ins->objectid;
-	u64 num_bytes = ins->offset;
-
-	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-	cache_block_group(block_group, trans, NULL, 0);
-	caching_ctl = get_caching_control(block_group);
-
-	if (!caching_ctl) {
-		BUG_ON(!block_group_cache_done(block_group));
-		ret = btrfs_remove_free_space(block_group, start, num_bytes);
-		BUG_ON(ret); /* -ENOMEM */
-	} else {
-		mutex_lock(&caching_ctl->mutex);
-
-		if (start >= caching_ctl->progress) {
-			ret = add_excluded_extent(root, start, num_bytes);
-			BUG_ON(ret); /* -ENOMEM */
-		} else if (start + num_bytes <= caching_ctl->progress) {
-			ret = btrfs_remove_free_space(block_group,
-						      start, num_bytes);
-			BUG_ON(ret); /* -ENOMEM */
-		} else {
-			num_bytes = caching_ctl->progress - start;
-			ret = btrfs_remove_free_space(block_group,
-						      start, num_bytes);
-			BUG_ON(ret); /* -ENOMEM */
 
-			start = caching_ctl->progress;
-			num_bytes = ins->objectid + ins->offset -
-				    caching_ctl->progress;
-			ret = add_excluded_extent(root, start, num_bytes);
-			BUG_ON(ret); /* -ENOMEM */
-		}
-
-		mutex_unlock(&caching_ctl->mutex);
-		put_caching_control(caching_ctl);
+	/*
+	 * Mixed block groups will exclude before processing the log so we only
+	 * need to do the exlude dance if this fs isn't mixed.
+	 */
+	if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
+		ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
+		if (ret)
+			return ret;
 	}
 
+	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+	if (!block_group)
+		return -EINVAL;
+
 	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
 					  RESERVE_ALLOC_NO_ACCOUNT);
 	BUG_ON(ret); /* logic error */
-	btrfs_put_block_group(block_group);
 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
 					 0, owner, offset, ins, 1);
+	btrfs_put_block_group(block_group);
 	return ret;
 }
 
-struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
-					    struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize,
-					    int level)
+static struct extent_buffer *
+btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		      u64 bytenr, u32 blocksize, int level)
 {
 	struct extent_buffer *buf;
 
@@ -6304,49 +6823,51 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 	int ret;
+	bool global_updated = false;
 
 	block_rsv = get_block_rsv(trans, root);
 
-	if (block_rsv->size == 0) {
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize,
-					     BTRFS_RESERVE_NO_FLUSH);
-		/*
-		 * If we couldn't reserve metadata bytes try and use some from
-		 * the global reserve.
-		 */
-		if (ret && block_rsv != global_rsv) {
-			ret = block_rsv_use_bytes(global_rsv, blocksize);
-			if (!ret)
-				return global_rsv;
-			return ERR_PTR(ret);
-		} else if (ret) {
-			return ERR_PTR(ret);
-		}
-		return block_rsv;
-	}
-
+	if (unlikely(block_rsv->size == 0))
+		goto try_reserve;
+again:
 	ret = block_rsv_use_bytes(block_rsv, blocksize);
 	if (!ret)
 		return block_rsv;
-	if (ret && !block_rsv->failfast) {
+
+	if (block_rsv->failfast)
+		return ERR_PTR(ret);
+
+	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
+		global_updated = true;
+		update_global_block_rsv(root->fs_info);
+		goto again;
+	}
+
+	if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
 		static DEFINE_RATELIMIT_STATE(_rs,
-				DEFAULT_RATELIMIT_INTERVAL,
-				/*DEFAULT_RATELIMIT_BURST*/ 2);
+				DEFAULT_RATELIMIT_INTERVAL * 10,
+				/*DEFAULT_RATELIMIT_BURST*/ 1);
 		if (__ratelimit(&_rs))
-			WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
-			     ret);
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize,
-					     BTRFS_RESERVE_NO_FLUSH);
-		if (!ret) {
-			return block_rsv;
-		} else if (ret && block_rsv != global_rsv) {
-			ret = block_rsv_use_bytes(global_rsv, blocksize);
-			if (!ret)
-				return global_rsv;
-		}
+			WARN(1, KERN_DEBUG
+				"btrfs: block rsv returned %d\n", ret);
 	}
-
-	return ERR_PTR(-ENOSPC);
+try_reserve:
+	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+				     BTRFS_RESERVE_NO_FLUSH);
+	if (!ret)
+		return block_rsv;
+	/*
+	 * If we couldn't reserve metadata bytes try and use some from
+	 * the global reserve if its space type is the same as the global
+	 * reservation.
+	 */
+	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
+	    block_rsv->space_info == global_rsv->space_info) {
+		ret = block_rsv_use_bytes(global_rsv, blocksize);
+		if (!ret)
+			return global_rsv;
+	}
+	return ERR_PTR(ret);
 }
 
 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
@@ -6374,7 +6895,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	struct extent_buffer *buf;
 	u64 flags = 0;
 	int ret;
-
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
 
 	block_rsv = use_block_rsv(trans, root, blocksize);
 	if (IS_ERR(block_rsv))
@@ -6400,16 +6922,20 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 
 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
 		struct btrfs_delayed_extent_op *extent_op;
-		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+		extent_op = btrfs_alloc_delayed_extent_op();
 		BUG_ON(!extent_op); /* -ENOMEM */
 		if (key)
 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
 		else
 			memset(&extent_op->key, 0, sizeof(extent_op->key));
 		extent_op->flags_to_set = flags;
-		extent_op->update_key = 1;
+		if (skinny_metadata)
+			extent_op->update_key = 0;
+		else
+			extent_op->update_key = 1;
 		extent_op->update_flags = 1;
 		extent_op->is_data = 0;
+		extent_op->level = level;
 
 		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
 					ins.objectid,
@@ -6484,8 +7010,9 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
 			continue;
 
 		/* We don't lock the tree block, it's OK to be racy here */
-		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
-					       &refs, &flags);
+		ret = btrfs_lookup_extent_info(trans, root, bytenr,
+					       wc->level - 1, 1, &refs,
+					       &flags);
 		/* We don't care about errors in readahead. */
 		if (ret < 0)
 			continue;
@@ -6522,7 +7049,7 @@ reada:
 }
 
 /*
- * hepler to process tree block while walking down the tree.
+ * helper to process tree block while walking down the tree.
  *
  * when wc->stage == UPDATE_BACKREF, this function updates
  * back refs for pointers in the block.
@@ -6552,7 +7079,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
 		BUG_ON(!path->locks[level]);
 		ret = btrfs_lookup_extent_info(trans, root,
-					       eb->start, eb->len,
+					       eb->start, level, 1,
 					       &wc->refs[level],
 					       &wc->flags[level]);
 		BUG_ON(ret == -ENOMEM);
@@ -6580,7 +7107,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
 		BUG_ON(ret); /* -ENOMEM */
 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
-						  eb->len, flag, 0);
+						  eb->len, flag,
+						  btrfs_header_level(eb), 0);
 		BUG_ON(ret); /* -ENOMEM */
 		wc->flags[level] |= flag;
 	}
@@ -6597,7 +7125,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 }
 
 /*
- * hepler to process tree block pointer.
+ * helper to process tree block pointer.
  *
  * when wc->stage == DROP_REFERENCE, this function checks
  * reference count of the block pointed to. if the block
@@ -6650,7 +7178,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	btrfs_tree_lock(next);
 	btrfs_set_lock_blocking(next);
 
-	ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+	ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
 				       &wc->refs[level - 1],
 				       &wc->flags[level - 1]);
 	if (ret < 0) {
@@ -6658,7 +7186,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 		return ret;
 	}
 
-	BUG_ON(wc->refs[level - 1] == 0);
+	if (unlikely(wc->refs[level - 1] == 0)) {
+		btrfs_err(root->fs_info, "Missing references.");
+		BUG();
+	}
 	*lookup_info = 0;
 
 	if (wc->stage == DROP_REFERENCE) {
@@ -6697,8 +7228,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 		if (reada && level == 1)
 			reada_walk_down(trans, root, wc, path);
 		next = read_tree_block(root, bytenr, blocksize, generation);
-		if (!next)
+		if (!next || !extent_buffer_uptodate(next)) {
+			free_extent_buffer(next);
 			return -EIO;
+		}
 		btrfs_tree_lock(next);
 		btrfs_set_lock_blocking(next);
 	}
@@ -6735,7 +7268,7 @@ skip:
 }
 
 /*
- * hepler to process tree block while walking up the tree.
+ * helper to process tree block while walking up the tree.
  *
  * when wc->stage == DROP_REFERENCE, this function drops
  * reference count on the block.
@@ -6781,7 +7314,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
 			ret = btrfs_lookup_extent_info(trans, root,
-						       eb->start, eb->len,
+						       eb->start, level, 1,
 						       &wc->refs[level],
 						       &wc->flags[level]);
 			if (ret < 0) {
@@ -6917,6 +7450,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
  * reference count by one. if update_ref is true, this function
  * also make sure backrefs for the shared block and all lower level
  * blocks are properly updated.
+ *
+ * If called with for_reloc == 0, may exit early with -EAGAIN
  */
 int btrfs_drop_snapshot(struct btrfs_root *root,
 			 struct btrfs_block_rsv *block_rsv, int update_ref,
@@ -6991,8 +7526,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 
 			ret = btrfs_lookup_extent_info(trans, root,
 						path->nodes[level]->start,
-						path->nodes[level]->len,
-						&wc->refs[level],
+						level, 1, &wc->refs[level],
 						&wc->flags[level]);
 			if (ret < 0) {
 				err = ret;
@@ -7018,6 +7552,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
 	while (1) {
+		if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
+			pr_debug("btrfs: drop snapshot early exit\n");
+			err = -EAGAIN;
+			goto out_end_trans;
+		}
+
 		ret = walk_down_tree(trans, root, path, wc);
 		if (ret < 0) {
 			err = ret;
@@ -7075,8 +7615,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	}
 
 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
-		ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
-					   NULL, NULL);
+		ret = btrfs_find_root(tree_root, &root->root_key, path,
+				      NULL, NULL);
 		if (ret < 0) {
 			btrfs_abort_transaction(trans, tree_root, ret);
 			err = ret;
@@ -7093,11 +7633,11 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	}
 
 	if (root->in_radix) {
-		btrfs_free_fs_root(tree_root->fs_info, root);
+		btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
 	} else {
 		free_extent_buffer(root->node);
 		free_extent_buffer(root->commit_root);
-		kfree(root);
+		btrfs_put_fs_root(root);
 	}
 out_end_trans:
 	btrfs_end_transaction_throttle(trans, tree_root);
@@ -7203,6 +7743,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 		root->fs_info->fs_devices->missing_devices;
 
 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
+		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
 	if (num_devices == 1) {
@@ -7409,6 +7950,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 	struct btrfs_space_info *space_info;
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 	struct btrfs_device *device;
+	struct btrfs_trans_handle *trans;
 	u64 min_free;
 	u64 dev_min = 1;
 	u64 dev_nr = 0;
@@ -7481,20 +8023,27 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 		index = get_block_group_index(block_group);
 	}
 
-	if (index == 0) {
+	if (index == BTRFS_RAID_RAID10) {
 		dev_min = 4;
 		/* Divide by 2 */
 		min_free >>= 1;
-	} else if (index == 1) {
+	} else if (index == BTRFS_RAID_RAID1) {
 		dev_min = 2;
-	} else if (index == 2) {
+	} else if (index == BTRFS_RAID_DUP) {
 		/* Multiply by 2 */
 		min_free <<= 1;
-	} else if (index == 3) {
+	} else if (index == BTRFS_RAID_RAID0) {
 		dev_min = fs_devices->rw_devices;
 		do_div(min_free, dev_min);
 	}
 
+	/* We need to do this so that we can look at pending chunks */
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
 	mutex_lock(&root->fs_info->chunk_mutex);
 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
 		u64 dev_offset;
@@ -7505,7 +8054,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 		 */
 		if (device->total_bytes > device->bytes_used + min_free &&
 		    !device->is_tgtdev_for_dev_replace) {
-			ret = find_free_dev_extent(device, min_free,
+			ret = find_free_dev_extent(trans, device, min_free,
 						   &dev_offset, NULL);
 			if (!ret)
 				dev_nr++;
@@ -7517,6 +8066,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 		}
 	}
 	mutex_unlock(&root->fs_info->chunk_mutex);
+	btrfs_end_transaction(trans, root);
 out:
 	btrfs_put_block_group(block_group);
 	return ret;
@@ -7651,12 +8201,15 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		space_info = list_entry(info->space_info.next,
 					struct btrfs_space_info,
 					list);
-		if (space_info->bytes_pinned > 0 ||
-		    space_info->bytes_reserved > 0 ||
-		    space_info->bytes_may_use > 0) {
-			WARN_ON(1);
-			dump_space_info(space_info, 0, 0);
+		if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
+			if (space_info->bytes_pinned > 0 ||
+			    space_info->bytes_reserved > 0 ||
+			    space_info->bytes_may_use > 0) {
+				WARN_ON(1);
+				dump_space_info(space_info, 0, 0);
+			}
 		}
+		percpu_counter_destroy(&space_info->total_bytes_pinned);
 		list_del(&space_info->list);
 		kfree(space_info);
 	}
@@ -7754,7 +8307,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		btrfs_release_path(path);
 		cache->flags = btrfs_block_group_flags(&cache->item);
 		cache->sectorsize = root->sectorsize;
-
+		cache->full_stripe_len = btrfs_full_stripe_len(root,
+					       &root->fs_info->mapping_tree,
+					       found_key.objectid);
 		btrfs_init_free_space_ctl(cache);
 
 		/*
@@ -7762,7 +8317,17 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		 * info has super bytes accounted for, otherwise we'll think
 		 * we have more space than we actually do.
 		 */
-		exclude_super_stripes(root, cache);
+		ret = exclude_super_stripes(root, cache);
+		if (ret) {
+			/*
+			 * We may have excluded something, so call this just in
+			 * case.
+			 */
+			free_excluded_extents(root, cache);
+			kfree(cache->free_space_ctl);
+			kfree(cache);
+			goto error;
+		}
 
 		/*
 		 * check for two cases, either we are full, and therefore
@@ -7785,10 +8350,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 			free_excluded_extents(root, cache);
 		}
 
+		ret = btrfs_add_block_group_cache(root->fs_info, cache);
+		if (ret) {
+			btrfs_remove_free_space_cache(cache);
+			btrfs_put_block_group(cache);
+			goto error;
+		}
+
 		ret = update_space_info(info, cache->flags, found_key.offset,
 					btrfs_block_group_used(&cache->item),
 					&space_info);
-		BUG_ON(ret); /* -ENOMEM */
+		if (ret) {
+			btrfs_remove_free_space_cache(cache);
+			spin_lock(&info->block_group_cache_lock);
+			rb_erase(&cache->cache_node,
+				 &info->block_group_cache_tree);
+			spin_unlock(&info->block_group_cache_lock);
+			btrfs_put_block_group(cache);
+			goto error;
+		}
+
 		cache->space_info = space_info;
 		spin_lock(&cache->space_info->lock);
 		cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7796,9 +8377,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
 		__link_block_group(space_info, cache);
 
-		ret = btrfs_add_block_group_cache(root->fs_info, cache);
-		BUG_ON(ret); /* Logic error */
-
 		set_avail_alloc_bits(root->fs_info, cache->flags);
 		if (btrfs_chunk_readonly(root, cache->key.objectid))
 			set_block_group_ro(cache, 1);
@@ -7808,6 +8386,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		if (!(get_alloc_profile(root, space_info->flags) &
 		      (BTRFS_BLOCK_GROUP_RAID10 |
 		       BTRFS_BLOCK_GROUP_RAID1 |
+		       BTRFS_BLOCK_GROUP_RAID5 |
+		       BTRFS_BLOCK_GROUP_RAID6 |
 		       BTRFS_BLOCK_GROUP_DUP)))
 			continue;
 		/*
@@ -7852,6 +8432,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 					sizeof(item));
 		if (ret)
 			btrfs_abort_transaction(trans, extent_root, ret);
+		ret = btrfs_finish_chunk_alloc(trans, extent_root,
+					       key.objectid, key.offset);
+		if (ret)
+			btrfs_abort_transaction(trans, extent_root, ret);
 	}
 }
 
@@ -7883,6 +8467,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	cache->sectorsize = root->sectorsize;
 	cache->fs_info = root->fs_info;
+	cache->full_stripe_len = btrfs_full_stripe_len(root,
+					       &root->fs_info->mapping_tree,
+					       chunk_offset);
 
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
@@ -7899,16 +8486,41 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	cache->last_byte_to_unpin = (u64)-1;
 	cache->cached = BTRFS_CACHE_FINISHED;
-	exclude_super_stripes(root, cache);
+	ret = exclude_super_stripes(root, cache);
+	if (ret) {
+		/*
+		 * We may have excluded something, so call this just in
+		 * case.
+		 */
+		free_excluded_extents(root, cache);
+		kfree(cache->free_space_ctl);
+		kfree(cache);
+		return ret;
+	}
 
 	add_new_free_space(cache, root->fs_info, chunk_offset,
 			   chunk_offset + size);
 
 	free_excluded_extents(root, cache);
 
+	ret = btrfs_add_block_group_cache(root->fs_info, cache);
+	if (ret) {
+		btrfs_remove_free_space_cache(cache);
+		btrfs_put_block_group(cache);
+		return ret;
+	}
+
 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
 				&cache->space_info);
-	BUG_ON(ret); /* -ENOMEM */
+	if (ret) {
+		btrfs_remove_free_space_cache(cache);
+		spin_lock(&root->fs_info->block_group_cache_lock);
+		rb_erase(&cache->cache_node,
+			 &root->fs_info->block_group_cache_tree);
+		spin_unlock(&root->fs_info->block_group_cache_lock);
+		btrfs_put_block_group(cache);
+		return ret;
+	}
 	update_global_block_rsv(root->fs_info);
 
 	spin_lock(&cache->space_info->lock);
@@ -7917,9 +8529,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	__link_block_group(cache->space_info, cache);
 
-	ret = btrfs_add_block_group_cache(root->fs_info, cache);
-	BUG_ON(ret); /* Logic error */
-
 	list_add_tail(&cache->new_bg_list, &trans->new_bgs);
 
 	set_avail_alloc_bits(extent_root->fs_info, type);
@@ -7932,12 +8541,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 	u64 extra_flags = chunk_to_extended(flags) &
 				BTRFS_EXTENDED_PROFILE_MASK;
 
+	write_seqlock(&fs_info->profiles_lock);
 	if (flags & BTRFS_BLOCK_GROUP_DATA)
 		fs_info->avail_data_alloc_bits &= ~extra_flags;
 	if (flags & BTRFS_BLOCK_GROUP_METADATA)
 		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 		fs_info->avail_system_alloc_bits &= ~extra_flags;
+	write_sequnlock(&fs_info->profiles_lock);
 }
 
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
@@ -8036,6 +8647,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	spin_lock(&root->fs_info->block_group_cache_lock);
 	rb_erase(&block_group->cache_node,
 		 &root->fs_info->block_group_cache_tree);
+
+	if (root->fs_info->first_logical_byte == block_group->key.objectid)
+		root->fs_info->first_logical_byte = (u64)-1;
 	spin_unlock(&root->fs_info->block_group_cache_lock);
 
 	down_write(&block_group->space_info->groups_sem);
@@ -8158,9 +8772,16 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 
 		if (end - start >= range->minlen) {
 			if (!block_group_cache_done(cache)) {
-				ret = cache_block_group(cache, NULL, root, 0);
-				if (!ret)
-					wait_block_group_cache_done(cache);
+				ret = cache_block_group(cache, 0);
+				if (ret) {
+					btrfs_put_block_group(cache);
+					break;
+				}
+				ret = wait_block_group_cache_done(cache);
+				if (ret) {
+					btrfs_put_block_group(cache);
+					break;
+				}
 			}
 			ret = btrfs_trim_block_group(cache,
 						     &group_trimmed,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1b319df29ee..583d98bd065 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4,7 +4,6 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/page-flags.h>
-#include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>
 #include <linux/swap.h>
@@ -24,13 +23,83 @@
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
+static struct bio_set *btrfs_bioset;
 
+#ifdef CONFIG_BTRFS_DEBUG
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
-#define LEAK_DEBUG 0
-#if LEAK_DEBUG
 static DEFINE_SPINLOCK(leak_lock);
+
+static inline
+void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&leak_lock, flags);
+	list_add(new, head);
+	spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline
+void btrfs_leak_debug_del(struct list_head *entry)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&leak_lock, flags);
+	list_del(entry);
+	spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline
+void btrfs_leak_debug_check(void)
+{
+	struct extent_state *state;
+	struct extent_buffer *eb;
+
+	while (!list_empty(&states)) {
+		state = list_entry(states.next, struct extent_state, leak_list);
+		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
+		       "state %lu in tree %p refs %d\n",
+		       (unsigned long long)state->start,
+		       (unsigned long long)state->end,
+		       state->state, state->tree, atomic_read(&state->refs));
+		list_del(&state->leak_list);
+		kmem_cache_free(extent_state_cache, state);
+	}
+
+	while (!list_empty(&buffers)) {
+		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
+		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
+		       "refs %d\n", (unsigned long long)eb->start,
+		       eb->len, atomic_read(&eb->refs));
+		list_del(&eb->leak_list);
+		kmem_cache_free(extent_buffer_cache, eb);
+	}
+}
+
+#define btrfs_debug_check_extent_io_range(inode, start, end)		\
+	__btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+		struct inode *inode, u64 start, u64 end)
+{
+	u64 isize = i_size_read(inode);
+
+	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+		printk_ratelimited(KERN_DEBUG
+		    "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+				caller,
+				(unsigned long long)btrfs_ino(inode),
+				(unsigned long long)isize,
+				(unsigned long long)start,
+				(unsigned long long)end);
+	}
+}
+#else
+#define btrfs_leak_debug_add(new, head)	do {} while (0)
+#define btrfs_leak_debug_del(entry)	do {} while (0)
+#define btrfs_leak_debug_check()	do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
 #endif
 
 #define BUFFER_LRU_MAX 64
@@ -76,38 +145,26 @@ int __init extent_io_init(void)
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_buffer_cache)
 		goto free_state_cache;
+
+	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
+				     offsetof(struct btrfs_io_bio, bio));
+	if (!btrfs_bioset)
+		goto free_buffer_cache;
 	return 0;
 
+free_buffer_cache:
+	kmem_cache_destroy(extent_buffer_cache);
+	extent_buffer_cache = NULL;
+
 free_state_cache:
 	kmem_cache_destroy(extent_state_cache);
+	extent_state_cache = NULL;
 	return -ENOMEM;
 }
 
 void extent_io_exit(void)
 {
-	struct extent_state *state;
-	struct extent_buffer *eb;
-
-	while (!list_empty(&states)) {
-		state = list_entry(states.next, struct extent_state, leak_list);
-		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
-		       "state %lu in tree %p refs %d\n",
-		       (unsigned long long)state->start,
-		       (unsigned long long)state->end,
-		       state->state, state->tree, atomic_read(&state->refs));
-		list_del(&state->leak_list);
-		kmem_cache_free(extent_state_cache, state);
-
-	}
-
-	while (!list_empty(&buffers)) {
-		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
-		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
-		       "refs %d\n", (unsigned long long)eb->start,
-		       eb->len, atomic_read(&eb->refs));
-		list_del(&eb->leak_list);
-		kmem_cache_free(extent_buffer_cache, eb);
-	}
+	btrfs_leak_debug_check();
 
 	/*
 	 * Make sure all delayed rcu free are flushed before we
@@ -118,6 +175,8 @@ void extent_io_exit(void)
 		kmem_cache_destroy(extent_state_cache);
 	if (extent_buffer_cache)
 		kmem_cache_destroy(extent_buffer_cache);
+	if (btrfs_bioset)
+		bioset_free(btrfs_bioset);
 }
 
 void extent_io_tree_init(struct extent_io_tree *tree,
@@ -135,9 +194,6 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 static struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
-#if LEAK_DEBUG
-	unsigned long flags;
-#endif
 
 	state = kmem_cache_alloc(extent_state_cache, mask);
 	if (!state)
@@ -145,11 +201,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
 	state->state = 0;
 	state->private = 0;
 	state->tree = NULL;
-#if LEAK_DEBUG
-	spin_lock_irqsave(&leak_lock, flags);
-	list_add(&state->leak_list, &states);
-	spin_unlock_irqrestore(&leak_lock, flags);
-#endif
+	btrfs_leak_debug_add(&state->leak_list, &states);
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
 	trace_alloc_extent_state(state, mask, _RET_IP_);
@@ -161,15 +213,8 @@ void free_extent_state(struct extent_state *state)
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
-#if LEAK_DEBUG
-		unsigned long flags;
-#endif
 		WARN_ON(state->tree);
-#if LEAK_DEBUG
-		spin_lock_irqsave(&leak_lock, flags);
-		list_del(&state->leak_list);
-		spin_unlock_irqrestore(&leak_lock, flags);
-#endif
+		btrfs_leak_debug_del(&state->leak_list);
 		trace_free_extent_state(state, _RET_IP_);
 		kmem_cache_free(extent_state_cache, state);
 	}
@@ -309,21 +354,21 @@ static void merge_state(struct extent_io_tree *tree,
 }
 
 static void set_state_cb(struct extent_io_tree *tree,
-			 struct extent_state *state, int *bits)
+			 struct extent_state *state, unsigned long *bits)
 {
 	if (tree->ops && tree->ops->set_bit_hook)
 		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
 }
 
 static void clear_state_cb(struct extent_io_tree *tree,
-			   struct extent_state *state, int *bits)
+			   struct extent_state *state, unsigned long *bits)
 {
 	if (tree->ops && tree->ops->clear_bit_hook)
 		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
-			   struct extent_state *state, int *bits);
+			   struct extent_state *state, unsigned long *bits);
 
 /*
  * insert an extent_state struct into the tree.  'bits' are set on the
@@ -337,7 +382,7 @@ static void set_state_bits(struct extent_io_tree *tree,
  */
 static int insert_state(struct extent_io_tree *tree,
 			struct extent_state *state, u64 start, u64 end,
-			int *bits)
+			unsigned long *bits)
 {
 	struct rb_node *node;
 
@@ -425,10 +470,10 @@ static struct extent_state *next_state(struct extent_state *state)
  */
 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 					    struct extent_state *state,
-					    int *bits, int wake)
+					    unsigned long *bits, int wake)
 {
 	struct extent_state *next;
-	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
+	unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS;
 
 	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 		u64 range = state->end - state->start + 1;
@@ -464,7 +509,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
 	return prealloc;
 }
 
-void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
 {
 	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
 		    "Extent tree was modified by another "
@@ -484,7 +529,7 @@ void extent_io_tree_panic(struct extent_io_tree *tree, int err)
  * This takes the tree lock, and returns 0 on success and < 0 on error.
  */
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		     int bits, int wake, int delete,
+		     unsigned long bits, int wake, int delete,
 		     struct extent_state **cached_state,
 		     gfp_t mask)
 {
@@ -496,6 +541,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	int err;
 	int clear = 0;
 
+	btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
+	if (bits & EXTENT_DELALLOC)
+		bits |= EXTENT_NORESERVE;
+
 	if (delete)
 		bits |= ~EXTENT_CTLBITS;
 	bits |= EXTENT_FIRST_DELALLOC;
@@ -645,11 +695,14 @@ static void wait_on_state(struct extent_io_tree *tree,
  * The range [start, end] is inclusive.
  * The tree lock is taken by this function
  */
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+			    unsigned long bits)
 {
 	struct extent_state *state;
 	struct rb_node *node;
 
+	btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
 	spin_lock(&tree->lock);
 again:
 	while (1) {
@@ -686,9 +739,9 @@ out:
 
 static void set_state_bits(struct extent_io_tree *tree,
 			   struct extent_state *state,
-			   int *bits)
+			   unsigned long *bits)
 {
-	int bits_to_set = *bits & ~EXTENT_CTLBITS;
+	unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS;
 
 	set_state_cb(tree, state, bits);
 	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
@@ -731,8 +784,9 @@ static void uncache_state(struct extent_state **cached_ptr)
 
 static int __must_check
 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		 int bits, int exclusive_bits, u64 *failed_start,
-		 struct extent_state **cached_state, gfp_t mask)
+		 unsigned long bits, unsigned long exclusive_bits,
+		 u64 *failed_start, struct extent_state **cached_state,
+		 gfp_t mask)
 {
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
@@ -741,6 +795,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	u64 last_start;
 	u64 last_end;
 
+	btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
 	bits |= EXTENT_FIRST_DELALLOC;
 again:
 	if (!prealloc && (mask & __GFP_WAIT)) {
@@ -924,9 +980,9 @@ search_again:
 	goto again;
 }
 
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
-		   u64 *failed_start, struct extent_state **cached_state,
-		   gfp_t mask)
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		   unsigned long bits, u64 * failed_start,
+		   struct extent_state **cached_state, gfp_t mask)
 {
 	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
 				cached_state, mask);
@@ -951,7 +1007,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
  * boundary bits like LOCK.
  */
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		       int bits, int clear_bits,
+		       unsigned long bits, unsigned long clear_bits,
 		       struct extent_state **cached_state, gfp_t mask)
 {
 	struct extent_state *state;
@@ -961,6 +1017,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	u64 last_start;
 	u64 last_end;
 
+	btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
 again:
 	if (!prealloc && (mask & __GFP_WAIT)) {
 		prealloc = alloc_extent_state(mask);
@@ -1144,14 +1202,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		    int bits, gfp_t mask)
+		    unsigned long bits, gfp_t mask)
 {
 	return set_extent_bit(tree, start, end, bits, NULL,
 			      NULL, mask);
 }
 
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		      int bits, gfp_t mask)
+		      unsigned long bits, gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
 }
@@ -1190,7 +1248,7 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask)
 {
-	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
+	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
 			      cached_state, mask);
 }
 
@@ -1206,7 +1264,7 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
  * us if waiting is desired.
  */
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		     int bits, struct extent_state **cached_state)
+		     unsigned long bits, struct extent_state **cached_state)
 {
 	int err;
 	u64 failed_start;
@@ -1258,6 +1316,39 @@ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 				GFP_NOFS);
 }
 
+int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(inode->i_mapping, index);
+		BUG_ON(!page); /* Pages should be in the extent_io_tree */
+		clear_page_dirty_for_io(page);
+		page_cache_release(page);
+		index++;
+	}
+	return 0;
+}
+
+int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(inode->i_mapping, index);
+		BUG_ON(!page); /* Pages should be in the extent_io_tree */
+		account_page_redirty(page);
+		__set_page_dirty_nobuffers(page);
+		page_cache_release(page);
+		index++;
+	}
+	return 0;
+}
+
 /*
  * helper function to set both pages and extents in the tree writeback
  */
@@ -1281,8 +1372,9 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
  * return it.  tree->lock must be held.  NULL will returned if
  * nothing was found after 'start'
  */
-struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
-						 u64 start, int bits)
+static struct extent_state *
+find_first_extent_bit_state(struct extent_io_tree *tree,
+			    u64 start, unsigned long bits)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1316,7 +1408,7 @@ out:
  * If nothing was found, 1 is returned. If found something, return 0.
  */
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits,
+			  u64 *start_ret, u64 *end_ret, unsigned long bits,
 			  struct extent_state **cached_state)
 {
 	struct extent_state *state;
@@ -1606,7 +1698,7 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
 	unsigned long nr_pages = end_index - index + 1;
 	int i;
-	int clear_bits = 0;
+	unsigned long clear_bits = 0;
 
 	if (op & EXTENT_CLEAR_UNLOCK)
 		clear_bits |= EXTENT_LOCKED;
@@ -1745,6 +1837,64 @@ out:
 	return ret;
 }
 
+void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[],
+			    int count)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+
+	spin_lock(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(tree, start);
+	BUG_ON(!node);
+
+	state = rb_entry(node, struct extent_state, rb_node);
+	BUG_ON(state->start != start);
+
+	while (count) {
+		state->private = *csums++;
+		count--;
+		state = next_state(state);
+	}
+	spin_unlock(&tree->lock);
+}
+
+static inline u64 __btrfs_get_bio_offset(struct bio *bio, int bio_index)
+{
+	struct bio_vec *bvec = bio->bi_io_vec + bio_index;
+
+	return page_offset(bvec->bv_page) + bvec->bv_offset;
+}
+
+void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, int bio_index,
+			u32 csums[], int count)
+{
+	struct rb_node *node;
+	struct extent_state *state = NULL;
+	u64 start;
+
+	spin_lock(&tree->lock);
+	do {
+		start = __btrfs_get_bio_offset(bio, bio_index);
+		if (state == NULL || state->start != start) {
+			node = tree_search(tree, start);
+			BUG_ON(!node);
+
+			state = rb_entry(node, struct extent_state, rb_node);
+			BUG_ON(state->start != start);
+		}
+		state->private = *csums++;
+		count--;
+		bio_index++;
+
+		state = next_state(state);
+	} while (count);
+	spin_unlock(&tree->lock);
+}
+
 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
 {
 	struct rb_node *node;
@@ -1779,7 +1929,7 @@ out:
  * range is found set.
  */
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		   int bits, int filled, struct extent_state *cached)
+		   unsigned long bits, int filled, struct extent_state *cached)
 {
 	struct extent_state *state = NULL;
 	struct rb_node *node;
@@ -1834,35 +1984,13 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
  */
 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
 {
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 start = page_offset(page);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
 		SetPageUptodate(page);
 }
 
 /*
- * helper function to unlock a page if all the extents in the tree
- * for that page are unlocked
- */
-static void check_page_locked(struct extent_io_tree *tree, struct page *page)
-{
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
-		unlock_page(page);
-}
-
-/*
- * helper function to end page writeback if all the extents
- * in the tree for that page are done with writeback
- */
-static void check_page_writeback(struct extent_io_tree *tree,
-				 struct page *page)
-{
-	end_page_writeback(page);
-}
-
-/*
  * When IO fails, either with EIO or csum verification fails, we
  * try other mirrors that might have a good copy of the data.  This
  * io_failure_record is used to record state as we go through all the
@@ -1895,13 +2023,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
 	if (ret)
 		err = ret;
 
-	if (did_repair) {
-		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
-					rec->start + rec->len - 1,
-					EXTENT_DAMAGED, GFP_NOFS);
-		if (ret && !err)
-			err = ret;
-	}
+	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+				rec->start + rec->len - 1,
+				EXTENT_DAMAGED, GFP_NOFS);
+	if (ret && !err)
+		err = ret;
 
 	kfree(rec);
 	return err;
@@ -1932,11 +2058,16 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 	u64 map_length = 0;
 	u64 sector;
 	struct btrfs_bio *bbio = NULL;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	int ret;
 
 	BUG_ON(!mirror_num);
 
-	bio = bio_alloc(GFP_NOFS, 1);
+	/* we can't repair anything in raid56 yet */
+	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
+		return 0;
+
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 	if (!bio)
 		return -EIO;
 	bio->bi_private = &compl;
@@ -1960,7 +2091,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 		return -EIO;
 	}
 	bio->bi_bdev = dev->bdev;
-	bio_add_page(bio, page, length, start-page_offset(page));
+	bio_add_page(bio, page, length, start - page_offset(page));
 	btrfsic_submit_bio(WRITE_SYNC, bio);
 	wait_for_completion(&compl);
 
@@ -2052,6 +2183,7 @@ static int clean_io_failure(u64 start, struct page *page)
 						failrec->failed_mirror);
 			did_repair = !ret;
 		}
+		ret = 0;
 	}
 
 out:
@@ -2225,7 +2357,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
 		return -EIO;
 	}
 
-	bio = bio_alloc(GFP_NOFS, 1);
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 	if (!bio) {
 		free_io_failure(inode, failrec, 0);
 		return -EIO;
@@ -2287,20 +2419,24 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
-	int whole_page;
 
 	do {
 		struct page *page = bvec->bv_page;
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
 
-		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-			 bvec->bv_offset;
-		end = start + bvec->bv_len - 1;
+		/* We always issue full-page reads, but if some block
+		 * in a page fails to read, blk_update_request() will
+		 * advance bv_offset and adjust bv_len to compensate.
+		 * Print a warning for nonzero offsets, and an error
+		 * if they don't add up to a full page.  */
+		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE)
+			printk("%s page write in btrfs with offset %u and length %u\n",
+			       bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE
+			       ? KERN_ERR "partial" : KERN_INFO "incomplete",
+			       bvec->bv_offset, bvec->bv_len);
 
-		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
-			whole_page = 1;
-		else
-			whole_page = 0;
+		start = page_offset(page);
+		end = start + bvec->bv_offset + bvec->bv_len - 1;
 
 		if (--bvec >= bio->bi_io_vec)
 			prefetchw(&bvec->bv_page->flags);
@@ -2308,10 +2444,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 		if (end_extent_writepage(page, err, start, end))
 			continue;
 
-		if (whole_page)
-			end_page_writeback(page);
-		else
-			check_page_writeback(tree, page);
+		end_page_writeback(page);
 	} while (bvec >= bio->bi_io_vec);
 
 	bio_put(bio);
@@ -2336,7 +2469,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
-	int whole_page;
 	int mirror;
 	int ret;
 
@@ -2347,20 +2479,27 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		struct page *page = bvec->bv_page;
 		struct extent_state *cached = NULL;
 		struct extent_state *state;
+		struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+		struct inode *inode = page->mapping->host;
 
 		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
-			 "mirror=%ld\n", (u64)bio->bi_sector, err,
-			 (long int)bio->bi_bdev);
-		tree = &BTRFS_I(page->mapping->host)->io_tree;
-
-		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-			bvec->bv_offset;
-		end = start + bvec->bv_len - 1;
-
-		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
-			whole_page = 1;
-		else
-			whole_page = 0;
+			 "mirror=%lu\n", (u64)bio->bi_sector, err,
+			 io_bio->mirror_num);
+		tree = &BTRFS_I(inode)->io_tree;
+
+		/* We always issue full-page reads, but if some block
+		 * in a page fails to read, blk_update_request() will
+		 * advance bv_offset and adjust bv_len to compensate.
+		 * Print a warning for nonzero offsets, and an error
+		 * if they don't add up to a full page.  */
+		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE)
+			printk("%s page read in btrfs with offset %u and length %u\n",
+			       bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE
+			       ? KERN_ERR "partial" : KERN_INFO "incomplete",
+			       bvec->bv_offset, bvec->bv_len);
+
+		start = page_offset(page);
+		end = start + bvec->bv_offset + bvec->bv_len - 1;
 
 		if (++bvec <= bvec_end)
 			prefetchw(&bvec->bv_page->flags);
@@ -2376,7 +2515,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		}
 		spin_unlock(&tree->lock);
 
-		mirror = (int)(unsigned long)bio->bi_bdev;
+		mirror = io_bio->mirror_num;
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
 			ret = tree->ops->readpage_end_io_hook(page, start, end,
 							      state, mirror);
@@ -2419,39 +2558,43 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		}
 		unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
 
-		if (whole_page) {
-			if (uptodate) {
-				SetPageUptodate(page);
-			} else {
-				ClearPageUptodate(page);
-				SetPageError(page);
-			}
-			unlock_page(page);
+		if (uptodate) {
+			loff_t i_size = i_size_read(inode);
+			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+			unsigned offset;
+
+			/* Zero out the end if this page straddles i_size */
+			offset = i_size & (PAGE_CACHE_SIZE-1);
+			if (page->index == end_index && offset)
+				zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+			SetPageUptodate(page);
 		} else {
-			if (uptodate) {
-				check_page_uptodate(tree, page);
-			} else {
-				ClearPageUptodate(page);
-				SetPageError(page);
-			}
-			check_page_locked(tree, page);
+			ClearPageUptodate(page);
+			SetPageError(page);
 		}
+		unlock_page(page);
 	} while (bvec <= bvec_end);
 
 	bio_put(bio);
 }
 
+/*
+ * this allocates from the btrfs_bioset.  We're returning a bio right now
+ * but you can call btrfs_io_bio for the appropriate container_of magic
+ */
 struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 		gfp_t gfp_flags)
 {
 	struct bio *bio;
 
-	bio = bio_alloc(gfp_flags, nr_vecs);
+	bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
 
 	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
-		while (!bio && (nr_vecs /= 2))
-			bio = bio_alloc(gfp_flags, nr_vecs);
+		while (!bio && (nr_vecs /= 2)) {
+			bio = bio_alloc_bioset(gfp_flags,
+					       nr_vecs, btrfs_bioset);
+		}
 	}
 
 	if (bio) {
@@ -2462,6 +2605,19 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 	return bio;
 }
 
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
+{
+	return bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
+}
+
+
+/* this also allocates from the btrfs_bioset */
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+{
+	return bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
+}
+
+
 static int __must_check submit_one_bio(int rw, struct bio *bio,
 				       int mirror_num, unsigned long bio_flags)
 {
@@ -2471,7 +2627,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
 	struct extent_io_tree *tree = bio->bi_private;
 	u64 start;
 
-	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+	start = page_offset(page) + bvec->bv_offset;
 
 	bio->bi_private = NULL;
 
@@ -2489,13 +2645,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
 	return ret;
 }
 
-static int merge_bio(struct extent_io_tree *tree, struct page *page,
+static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
 		     unsigned long offset, size_t size, struct bio *bio,
 		     unsigned long bio_flags)
 {
 	int ret = 0;
 	if (tree->ops && tree->ops->merge_bio_hook)
-		ret = tree->ops->merge_bio_hook(page, offset, size, bio,
+		ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
 						bio_flags);
 	BUG_ON(ret < 0);
 	return ret;
@@ -2526,11 +2682,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 		if (old_compressed)
 			contig = bio->bi_sector == sector;
 		else
-			contig = bio->bi_sector + (bio->bi_size >> 9) ==
-				sector;
+			contig = bio_end_sector(bio) == sector;
 
 		if (prev_bio_flags != bio_flags || !contig ||
-		    merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
+		    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
 		    bio_add_page(bio, page, page_size, offset) < page_size) {
 			ret = submit_one_bio(rw, bio, mirror_num,
 					     prev_bio_flags);
@@ -2562,7 +2717,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 	return ret;
 }
 
-void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
+static void attach_extent_buffer_page(struct extent_buffer *eb,
+				      struct page *page)
 {
 	if (!PagePrivate(page)) {
 		SetPagePrivate(page);
@@ -2592,10 +2748,10 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 				   struct page *page,
 				   get_extent_t *get_extent,
 				   struct bio **bio, int mirror_num,
-				   unsigned long *bio_flags)
+				   unsigned long *bio_flags, int rw)
 {
 	struct inode *inode = page->mapping->host;
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 start = page_offset(page);
 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
 	u64 end;
 	u64 cur = start;
@@ -2648,6 +2804,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 		}
 	}
 	while (cur <= end) {
+		unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+
 		if (cur >= last_byte) {
 			char *userpage;
 			struct extent_state *cached = NULL;
@@ -2682,7 +2840,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
 		cur_end = min(extent_map_end(em) - 1, end);
-		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		iosize = ALIGN(iosize, blocksize);
 		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
 			disk_io_size = em->block_len;
 			sector = em->block_start >> 9;
@@ -2735,26 +2893,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 			continue;
 		}
 
-		ret = 0;
-		if (tree->ops && tree->ops->readpage_io_hook) {
-			ret = tree->ops->readpage_io_hook(page, cur,
-							  cur + iosize - 1);
-		}
-		if (!ret) {
-			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
-			pnr -= page->index;
-			ret = submit_extent_page(READ, tree, page,
+		pnr -= page->index;
+		ret = submit_extent_page(rw, tree, page,
 					 sector, disk_io_size, pg_offset,
 					 bdev, bio, pnr,
 					 end_bio_extent_readpage, mirror_num,
 					 *bio_flags,
 					 this_bio_flag);
-			if (!ret) {
-				nr++;
-				*bio_flags = this_bio_flag;
-			}
-		}
-		if (ret) {
+		if (!ret) {
+			nr++;
+			*bio_flags = this_bio_flag;
+		} else {
 			SetPageError(page);
 			unlock_extent(tree, cur, cur + iosize - 1);
 		}
@@ -2778,7 +2927,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 	int ret;
 
 	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
-				      &bio_flags);
+				      &bio_flags, READ);
 	if (bio)
 		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
 	return ret;
@@ -2806,7 +2955,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	struct inode *inode = page->mapping->host;
 	struct extent_page_data *epd = data;
 	struct extent_io_tree *tree = epd->tree;
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 start = page_offset(page);
 	u64 delalloc_start;
 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
 	u64 end;
@@ -2847,7 +2996,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
 	   (page->index == end_index && !pg_offset)) {
-		page->mapping->a_ops->invalidatepage(page, 0);
+		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		unlock_page(page);
 		return 0;
 	}
@@ -2982,7 +3131,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		BUG_ON(extent_map_end(em) <= cur);
 		BUG_ON(end < cur);
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
-		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		iosize = ALIGN(iosize, blocksize);
 		sector = (em->block_start + extent_offset) >> 9;
 		bdev = em->bdev;
 		block_start = em->block_start;
@@ -3077,7 +3226,7 @@ static int eb_wait(void *word)
 	return 0;
 }
 
-static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
 {
 	wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
 		    TASK_UNINTERRUPTIBLE);
@@ -3124,12 +3273,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
 		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
 		spin_unlock(&eb->refs_lock);
 		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-		spin_lock(&fs_info->delalloc_lock);
-		if (fs_info->dirty_metadata_bytes >= eb->len)
-			fs_info->dirty_metadata_bytes -= eb->len;
-		else
-			WARN_ON(1);
-		spin_unlock(&fs_info->delalloc_lock);
+		__percpu_counter_add(&fs_info->dirty_metadata_bytes,
+				     -eb->len,
+				     fs_info->dirty_metadata_batch);
 		ret = 1;
 	} else {
 		spin_unlock(&eb->refs_lock);
@@ -3205,7 +3351,7 @@ static int write_one_eb(struct extent_buffer *eb,
 	u64 offset = eb->start;
 	unsigned long i, num_pages;
 	unsigned long bio_flags = 0;
-	int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
+	int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
 	int ret = 0;
 
 	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
@@ -3446,15 +3592,9 @@ retry:
 			 * swizzled back from swapper_space to tmpfs file
 			 * mapping
 			 */
-			if (tree->ops &&
-			    tree->ops->write_cache_pages_lock_hook) {
-				tree->ops->write_cache_pages_lock_hook(page,
-							       data, flush_fn);
-			} else {
-				if (!trylock_page(page)) {
-					flush_fn(data);
-					lock_page(page);
-				}
+			if (!trylock_page(page)) {
+				flush_fn(data);
+				lock_page(page);
 			}
 
 			if (unlikely(page->mapping != mapping)) {
@@ -3648,14 +3788,14 @@ int extent_readpages(struct extent_io_tree *tree,
 			continue;
 		for (i = 0; i < nr; i++) {
 			__extent_read_full_page(tree, pagepool[i], get_extent,
-					&bio, 0, &bio_flags);
+					&bio, 0, &bio_flags, READ);
 			page_cache_release(pagepool[i]);
 		}
 		nr = 0;
 	}
 	for (i = 0; i < nr; i++) {
 		__extent_read_full_page(tree, pagepool[i], get_extent,
-					&bio, 0, &bio_flags);
+					&bio, 0, &bio_flags, READ);
 		page_cache_release(pagepool[i]);
 	}
 
@@ -3674,11 +3814,11 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 			  struct page *page, unsigned long offset)
 {
 	struct extent_state *cached_state = NULL;
-	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
+	u64 start = page_offset(page);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
 
-	start += (offset + blocksize - 1) & ~(blocksize - 1);
+	start += ALIGN(offset, blocksize);
 	if (start > end)
 		return 0;
 
@@ -3696,11 +3836,11 @@ int extent_invalidatepage(struct extent_io_tree *tree,
  * are locked or under IO and drops the related state bits if it is safe
  * to drop the page.
  */
-int try_release_extent_state(struct extent_map_tree *map,
-			     struct extent_io_tree *tree, struct page *page,
-			     gfp_t mask)
+static int try_release_extent_state(struct extent_map_tree *map,
+				    struct extent_io_tree *tree,
+				    struct page *page, gfp_t mask)
 {
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 start = page_offset(page);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	int ret = 1;
 
@@ -3739,7 +3879,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 			       gfp_t mask)
 {
 	struct extent_map *em;
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 start = page_offset(page);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 
 	if ((mask & __GFP_WAIT) &&
@@ -3797,7 +3937,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
 		len = last - offset;
 		if (len == 0)
 			break;
-		len = (len + sectorsize - 1) & ~(sectorsize - 1);
+		len = ALIGN(len, sectorsize);
 		em = get_extent(inode, NULL, 0, offset, len, 0);
 		if (IS_ERR_OR_NULL(em))
 			return em;
@@ -3895,7 +4035,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		last_for_get_extent = isize;
 	}
 
-	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
 			 &cached_state);
 
 	em = get_extent_skip_holes(inode, start, last_for_get_extent,
@@ -3982,21 +4122,14 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 out_free:
 	free_extent_map(em);
 out:
-	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
 			     &cached_state, GFP_NOFS);
 	return ret;
 }
 
 static void __free_extent_buffer(struct extent_buffer *eb)
 {
-#if LEAK_DEBUG
-	unsigned long flags;
-	spin_lock_irqsave(&leak_lock, flags);
-	list_del(&eb->leak_list);
-	spin_unlock_irqrestore(&leak_lock, flags);
-#endif
-	if (eb->pages && eb->pages != eb->inline_pages)
-		kfree(eb->pages);
+	btrfs_leak_debug_del(&eb->leak_list);
 	kmem_cache_free(extent_buffer_cache, eb);
 }
 
@@ -4006,9 +4139,6 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 						   gfp_t mask)
 {
 	struct extent_buffer *eb = NULL;
-#if LEAK_DEBUG
-	unsigned long flags;
-#endif
 
 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
 	if (eb == NULL)
@@ -4028,28 +4158,18 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 	init_waitqueue_head(&eb->write_lock_wq);
 	init_waitqueue_head(&eb->read_lock_wq);
 
-#if LEAK_DEBUG
-	spin_lock_irqsave(&leak_lock, flags);
-	list_add(&eb->leak_list, &buffers);
-	spin_unlock_irqrestore(&leak_lock, flags);
-#endif
+	btrfs_leak_debug_add(&eb->leak_list, &buffers);
+
 	spin_lock_init(&eb->refs_lock);
 	atomic_set(&eb->refs, 1);
 	atomic_set(&eb->io_pages, 0);
 
-	if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
-		struct page **pages;
-		int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
-			PAGE_CACHE_SHIFT;
-		pages = kzalloc(num_pages, mask);
-		if (!pages) {
-			__free_extent_buffer(eb);
-			return NULL;
-		}
-		eb->pages = pages;
-	} else {
-		eb->pages = eb->inline_pages;
-	}
+	/*
+	 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
+	 */
+	BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
+		> MAX_INLINE_EXTENT_BUFFER_SIZE);
+	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
 
 	return eb;
 }
@@ -4180,6 +4300,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
 
 static void check_buffer_tree_ref(struct extent_buffer *eb)
 {
+	int refs;
 	/* the ref bit is tricky.  We have to make sure it is set
 	 * if we have the buffer dirty.   Otherwise the
 	 * code to free a buffer can end up dropping a dirty
@@ -4200,6 +4321,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
 	 * So bump the ref count first, then set the bit.  If someone
 	 * beat us to it, drop the ref we added.
 	 */
+	refs = atomic_read(&eb->refs);
+	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+		return;
+
 	spin_lock(&eb->refs_lock);
 	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
 		atomic_inc(&eb->refs);
@@ -4372,7 +4497,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
 }
 
 /* Expects to have eb->eb_lock already held */
-static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
+static int release_extent_buffer(struct extent_buffer *eb)
 {
 	WARN_ON(atomic_read(&eb->refs) == 0);
 	if (atomic_dec_and_test(&eb->refs)) {
@@ -4401,9 +4526,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
 
 void free_extent_buffer(struct extent_buffer *eb)
 {
+	int refs;
+	int old;
 	if (!eb)
 		return;
 
+	while (1) {
+		refs = atomic_read(&eb->refs);
+		if (refs <= 3)
+			break;
+		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
+		if (old == refs)
+			return;
+	}
+
 	spin_lock(&eb->refs_lock);
 	if (atomic_read(&eb->refs) == 2 &&
 	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
@@ -4419,7 +4555,7 @@ void free_extent_buffer(struct extent_buffer *eb)
 	 * I know this is terrible, but it's temporary until we stop tracking
 	 * the uptodate bits and such for the extent buffers.
 	 */
-	release_extent_buffer(eb, GFP_ATOMIC);
+	release_extent_buffer(eb);
 }
 
 void free_extent_buffer_stale(struct extent_buffer *eb)
@@ -4433,7 +4569,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
 	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
 		atomic_dec(&eb->refs);
-	release_extent_buffer(eb, GFP_NOFS);
+	release_extent_buffer(eb);
 }
 
 void clear_extent_buffer_dirty(struct extent_buffer *eb)
@@ -4485,17 +4621,6 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
 	return was_dirty;
 }
 
-static int range_straddles_pages(u64 start, u64 len)
-{
-	if (len < PAGE_CACHE_SIZE)
-		return 1;
-	if (start & (PAGE_CACHE_SIZE - 1))
-		return 1;
-	if ((start + len) & (PAGE_CACHE_SIZE - 1))
-		return 1;
-	return 0;
-}
-
 int clear_extent_buffer_uptodate(struct extent_buffer *eb)
 {
 	unsigned long i;
@@ -4527,37 +4652,6 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb)
 	return 0;
 }
 
-int extent_range_uptodate(struct extent_io_tree *tree,
-			  u64 start, u64 end)
-{
-	struct page *page;
-	int ret;
-	int pg_uptodate = 1;
-	int uptodate;
-	unsigned long index;
-
-	if (range_straddles_pages(start, end - start + 1)) {
-		ret = test_range_bit(tree, start, end,
-				     EXTENT_UPTODATE, 1, NULL);
-		if (ret)
-			return 1;
-	}
-	while (start <= end) {
-		index = start >> PAGE_CACHE_SHIFT;
-		page = find_get_page(tree->mapping, index);
-		if (!page)
-			return 1;
-		uptodate = PageUptodate(page);
-		page_cache_release(page);
-		if (!uptodate) {
-			pg_uptodate = 0;
-			break;
-		}
-		start += PAGE_CACHE_SIZE;
-	}
-	return pg_uptodate;
-}
-
 int extent_buffer_uptodate(struct extent_buffer *eb)
 {
 	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -4620,7 +4714,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			ClearPageError(page);
 			err = __extent_read_full_page(tree, page,
 						      get_extent, &bio,
-						      mirror_num, &bio_flags);
+						      mirror_num, &bio_flags,
+						      READ | REQ_META);
 			if (err)
 				ret = err;
 		} else {
@@ -4629,7 +4724,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	if (bio) {
-		err = submit_one_bio(READ, bio, mirror_num, bio_flags);
+		err = submit_one_bio(READ | REQ_META, bio, mirror_num,
+				     bio_flags);
 		if (err)
 			return err;
 	}
@@ -4993,7 +5089,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	}
 }
 
-int try_release_extent_buffer(struct page *page, gfp_t mask)
+int try_release_extent_buffer(struct page *page)
 {
 	struct extent_buffer *eb;
 
@@ -5023,9 +5119,6 @@ int try_release_extent_buffer(struct page *page, gfp_t mask)
 	}
 	spin_unlock(&page->mapping->private_lock);
 
-	if ((mask & GFP_NOFS) == GFP_NOFS)
-		mask = GFP_NOFS;
-
 	/*
 	 * If tree ref isn't set then we know the ref on this eb is a real ref,
 	 * so just return, this page will likely be freed soon anyway.
@@ -5035,5 +5128,5 @@ int try_release_extent_buffer(struct page *page, gfp_t mask)
 		return 0;
 	}
 
-	return release_extent_buffer(eb, mask);
+	return release_extent_buffer(eb);
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2eacfabd326..3b8c4e26e1d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -19,6 +19,7 @@
 #define EXTENT_FIRST_DELALLOC (1 << 12)
 #define EXTENT_NEED_WAIT (1 << 13)
 #define EXTENT_DAMAGED (1 << 14)
+#define EXTENT_NORESERVE (1 << 15)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
@@ -72,26 +73,23 @@ struct extent_io_ops {
 	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
 	extent_submit_bio_hook_t *submit_bio_hook;
-	int (*merge_bio_hook)(struct page *page, unsigned long offset,
+	int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
 			      size_t size, struct bio *bio,
 			      unsigned long bio_flags);
-	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
 				    struct extent_state *state, int mirror);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state, int uptodate);
 	void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
-			     int *bits);
+			     unsigned long *bits);
 	void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
-			       int *bits);
+			       unsigned long *bits);
 	void (*merge_extent_hook)(struct inode *inode,
 				  struct extent_state *new,
 				  struct extent_state *other);
 	void (*split_extent_hook)(struct inode *inode,
 				  struct extent_state *orig, u64 split);
-	int (*write_cache_pages_lock_hook)(struct page *page, void *data,
-					   void (*flush_fn)(void *));
 };
 
 struct extent_io_tree {
@@ -119,7 +117,9 @@ struct extent_state {
 	/* for use by the FS */
 	u64 private;
 
+#ifdef CONFIG_BTRFS_DEBUG
 	struct list_head leak_list;
+#endif
 };
 
 #define INLINE_EXTENT_BUFFER_PAGES 16
@@ -135,7 +135,6 @@ struct extent_buffer {
 	atomic_t refs;
 	atomic_t io_pages;
 	int read_mirror;
-	struct list_head leak_list;
 	struct rcu_head rcu_head;
 	pid_t lock_owner;
 
@@ -161,8 +160,10 @@ struct extent_buffer {
 	 */
 	wait_queue_head_t read_lock_wq;
 	wait_queue_head_t lock_wq;
-	struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES];
-	struct page **pages;
+	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+#ifdef CONFIG_BTRFS_DEBUG
+	struct list_head leak_list;
+#endif
 };
 
 static inline void extent_set_compress_type(unsigned long *bio_flags,
@@ -189,13 +190,10 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 int try_release_extent_mapping(struct extent_map_tree *map,
 			       struct extent_io_tree *tree, struct page *page,
 			       gfp_t mask);
-int try_release_extent_buffer(struct page *page, gfp_t mask);
-int try_release_extent_state(struct extent_map_tree *map,
-			     struct extent_io_tree *tree, struct page *page,
-			     gfp_t mask);
+int try_release_extent_buffer(struct page *page);
 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		     int bits, struct extent_state **cached);
+		     unsigned long bits, struct extent_state **cached);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
 			 struct extent_state **cached, gfp_t mask);
@@ -211,16 +209,17 @@ u64 count_range_bits(struct extent_io_tree *tree,
 
 void free_extent_state(struct extent_state *state);
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		   int bits, int filled, struct extent_state *cached_state);
+		   unsigned long bits, int filled,
+		   struct extent_state *cached_state);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		      int bits, gfp_t mask);
+		      unsigned long bits, gfp_t mask);
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		     int bits, int wake, int delete, struct extent_state **cached,
-		     gfp_t mask);
+		     unsigned long bits, int wake, int delete,
+		     struct extent_state **cached, gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		    int bits, gfp_t mask);
+		    unsigned long bits, gfp_t mask);
 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		   int bits, u64 *failed_start,
+		   unsigned long bits, u64 *failed_start,
 		   struct extent_state **cached_state, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask);
@@ -233,17 +232,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		       int bits, int clear_bits,
+		       unsigned long bits, unsigned long clear_bits,
 		       struct extent_state **cached_state, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask);
 int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
 		      struct extent_state **cached_state, gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits,
+			  u64 *start_ret, u64 *end_ret, unsigned long bits,
 			  struct extent_state **cached_state);
-struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
-						 u64 start, int bits);
 int extent_invalidatepage(struct extent_io_tree *tree,
 			  struct page *page, unsigned long offset);
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
@@ -265,6 +262,10 @@ int extent_readpages(struct extent_io_tree *tree,
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len, get_extent_t *get_extent);
 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
+void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[],
+			    int count);
+void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio,
+			int bvec_index, u32 csums[], int count);
 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
 void set_page_extent_mapped(struct page *page);
 
@@ -282,6 +283,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb, u64 start, int wait,
 			     get_extent_t *get_extent, int mirror_num);
+void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
 
 static inline unsigned long num_extent_pages(u64 start, u64 len)
 {
@@ -317,7 +319,6 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 			   unsigned long src_offset, unsigned long len);
 void memset_extent_buffer(struct extent_buffer *eb, char c,
 			  unsigned long start, unsigned long len);
-void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
 void clear_extent_buffer_dirty(struct extent_buffer *eb);
 int set_extent_buffer_dirty(struct extent_buffer *eb);
 int set_extent_buffer_uptodate(struct extent_buffer *eb);
@@ -327,8 +328,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 		      unsigned long min_len, char **map,
 		      unsigned long *map_start,
 		      unsigned long *map_len);
-int extent_range_uptodate(struct extent_io_tree *tree,
-			  u64 start, u64 end);
+int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
+int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
 int extent_clear_unlock_delalloc(struct inode *inode,
 				struct extent_io_tree *tree,
 				u64 start, u64 end, struct page *locked_page,
@@ -336,6 +337,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 		gfp_t gfp_flags);
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs);
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
 
 struct btrfs_fs_info;
 
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index fdb7a8db3b5..a4a7a1a8da9 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,6 +1,5 @@
 #include <linux/err.h>
 #include <linux/slab.h>
-#include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
 #include "ctree.h"
@@ -175,6 +174,14 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 	    test_bit(EXTENT_FLAG_LOGGING, &next->flags))
 		return 0;
 
+	/*
+	 * We don't want to merge stuff that hasn't been written to the log yet
+	 * since it may not reflect exactly what is on disk, and that would be
+	 * bad.
+	 */
+	if (!list_empty(&prev->list) || !list_empty(&next->list))
+		return 0;
+
 	if (extent_map_end(prev) == next->start &&
 	    prev->flags == next->flags &&
 	    prev->bdev == next->bdev &&
@@ -210,9 +217,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
 			em->mod_start = merge->mod_start;
 			em->generation = max(em->generation, merge->generation);
-			list_move(&em->list, &tree->modified_extents);
 
-			list_del_init(&merge->list);
 			rb_erase(&merge->rb_node, &tree->map);
 			free_extent_map(merge);
 		}
@@ -228,7 +233,6 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 		merge->in_tree = 0;
 		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
 		em->generation = max(em->generation, merge->generation);
-		list_del_init(&merge->list);
 		free_extent_map(merge);
 	}
 }
@@ -303,7 +307,7 @@ void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
  * reference dropped if the merge attempt was successful.
  */
 int add_extent_mapping(struct extent_map_tree *tree,
-		       struct extent_map *em)
+		       struct extent_map *em, int modified)
 {
 	int ret = 0;
 	struct rb_node *rb;
@@ -325,7 +329,10 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	em->mod_start = em->start;
 	em->mod_len = em->len;
 
-	try_merge_map(tree, em);
+	if (modified)
+		list_move(&em->list, &tree->modified_extents);
+	else
+		try_merge_map(tree, em);
 out:
 	return ret;
 }
@@ -338,8 +345,9 @@ static u64 range_end(u64 start, u64 len)
 	return start + len;
 }
 
-struct extent_map *__lookup_extent_mapping(struct extent_map_tree *tree,
-					   u64 start, u64 len, int strict)
+static struct extent_map *
+__lookup_extent_mapping(struct extent_map_tree *tree,
+			u64 start, u64 len, int strict)
 {
 	struct extent_map *em;
 	struct rb_node *rb_node;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index c6598c89cff..61adc44b780 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,6 +26,7 @@ struct extent_map {
 	u64 mod_len;
 	u64 orig_start;
 	u64 orig_block_len;
+	u64 ram_bytes;
 	u64 block_start;
 	u64 block_len;
 	u64 generation;
@@ -61,7 +62,7 @@ void extent_map_tree_init(struct extent_map_tree *tree);
 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len);
 int add_extent_mapping(struct extent_map_tree *tree,
-		       struct extent_map *em);
+		       struct extent_map *em, int modified);
 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
 
 struct extent_map *alloc_extent_map(void);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 94aa53b3872..a7bfc954180 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -34,8 +34,7 @@
 
 #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
 				   sizeof(struct btrfs_ordered_sum)) / \
-				   sizeof(struct btrfs_sector_sum) * \
-				   (r)->sectorsize - (r)->sectorsize)
+				   sizeof(u32) * (r)->sectorsize)
 
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
@@ -83,10 +82,11 @@ out:
 	return ret;
 }
 
-struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
-					  struct btrfs_root *root,
-					  struct btrfs_path *path,
-					  u64 bytenr, int cow)
+static struct btrfs_csum_item *
+btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+		  struct btrfs_root *root,
+		  struct btrfs_path *path,
+		  u64 bytenr, int cow)
 {
 	int ret;
 	struct btrfs_key file_key;
@@ -118,9 +118,11 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 		csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
 		csums_in_item /= csum_size;
 
-		if (csum_offset >= csums_in_item) {
+		if (csum_offset == csums_in_item) {
 			ret = -EFBIG;
 			goto fail;
+		} else if (csum_offset > csums_in_item) {
+			goto fail;
 		}
 	}
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
@@ -150,32 +152,12 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-u64 btrfs_file_extent_length(struct btrfs_path *path)
-{
-	int extent_type;
-	struct btrfs_file_extent_item *fi;
-	u64 len;
-
-	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
-			    struct btrfs_file_extent_item);
-	extent_type = btrfs_file_extent_type(path->nodes[0], fi);
-
-	if (extent_type == BTRFS_FILE_EXTENT_REG ||
-	    extent_type == BTRFS_FILE_EXTENT_PREALLOC)
-		len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
-	else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
-		len = btrfs_file_extent_inline_len(path->nodes[0], fi);
-	else
-		BUG();
-
-	return len;
-}
-
 static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 				   struct inode *inode, struct bio *bio,
 				   u64 logical_offset, u32 *dst, int dio)
 {
-	u32 sum;
+	u32 sum[16];
+	int len;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int bio_index = 0;
 	u64 offset = 0;
@@ -184,7 +166,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 	u64 disk_bytenr;
 	u32 diff;
 	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
-	int ret;
+	int count;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -212,10 +194,12 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 	if (dio)
 		offset = logical_offset;
 	while (bio_index < bio->bi_vcnt) {
+		len = min_t(int, ARRAY_SIZE(sum), bio->bi_vcnt - bio_index);
 		if (!dio)
 			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
-		ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
-		if (ret == 0)
+		count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum,
+					       len);
+		if (count)
 			goto found;
 
 		if (!item || disk_bytenr < item_start_offset ||
@@ -228,10 +212,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 			item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
 						 path, disk_bytenr, 0);
 			if (IS_ERR(item)) {
-				ret = PTR_ERR(item);
-				if (ret == -ENOENT || ret == -EFBIG)
-					ret = 0;
-				sum = 0;
+				count = 1;
+				sum[0] = 0;
 				if (BTRFS_I(inode)->root->root_key.objectid ==
 				    BTRFS_DATA_RELOC_TREE_OBJECTID) {
 					set_extent_bits(io_tree, offset,
@@ -267,19 +249,29 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 		diff = disk_bytenr - item_start_offset;
 		diff = diff / root->sectorsize;
 		diff = diff * csum_size;
-
-		read_extent_buffer(path->nodes[0], &sum,
+		count = min_t(int, len, (item_last_offset - disk_bytenr) >>
+					inode->i_sb->s_blocksize_bits);
+		read_extent_buffer(path->nodes[0], sum,
 				   ((unsigned long)item) + diff,
-				   csum_size);
+				   csum_size * count);
 found:
-		if (dst)
-			*dst++ = sum;
-		else
-			set_state_private(io_tree, offset, sum);
-		disk_bytenr += bvec->bv_len;
-		offset += bvec->bv_len;
-		bio_index++;
-		bvec++;
+		if (dst) {
+			memcpy(dst, sum, count * csum_size);
+			dst += count;
+		} else {
+			if (dio)
+				extent_cache_csums_dio(io_tree, offset, sum,
+						       count);
+			else
+				extent_cache_csums(io_tree, bio, bio_index, sum,
+					    count);
+		}
+		while (count--) {
+			disk_bytenr += bvec->bv_len;
+			offset += bvec->bv_len;
+			bio_index++;
+			bvec++;
+		}
 	}
 	btrfs_free_path(path);
 	return 0;
@@ -304,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_ordered_sum *sums;
-	struct btrfs_sector_sum *sector_sum;
 	struct btrfs_csum_item *item;
 	LIST_HEAD(tmplist);
 	unsigned long offset;
@@ -356,11 +347,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 
 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 		if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
-		    key.type != BTRFS_EXTENT_CSUM_KEY)
-			break;
-
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		if (key.offset > end)
+		    key.type != BTRFS_EXTENT_CSUM_KEY ||
+		    key.offset > end)
 			break;
 
 		if (key.offset > start)
@@ -378,34 +366,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 				      struct btrfs_csum_item);
 		while (start < csum_end) {
 			size = min_t(size_t, csum_end - start,
-					MAX_ORDERED_SUM_BYTES(root));
+				     MAX_ORDERED_SUM_BYTES(root));
 			sums = kzalloc(btrfs_ordered_sum_size(root, size),
-					GFP_NOFS);
+				       GFP_NOFS);
 			if (!sums) {
 				ret = -ENOMEM;
 				goto fail;
 			}
 
-			sector_sum = sums->sums;
 			sums->bytenr = start;
-			sums->len = size;
+			sums->len = (int)size;
 
 			offset = (start - key.offset) >>
 				root->fs_info->sb->s_blocksize_bits;
 			offset *= csum_size;
+			size >>= root->fs_info->sb->s_blocksize_bits;
 
-			while (size > 0) {
-				read_extent_buffer(path->nodes[0],
-						&sector_sum->sum,
-						((unsigned long)item) +
-						offset, csum_size);
-				sector_sum->bytenr = start;
-
-				size -= root->sectorsize;
-				start += root->sectorsize;
-				offset += csum_size;
-				sector_sum++;
-			}
+			read_extent_buffer(path->nodes[0],
+					   sums->sums,
+					   ((unsigned long)item) + offset,
+					   csum_size * size);
+
+			start += root->sectorsize * size;
 			list_add_tail(&sums->list, &tmplist);
 		}
 		path->slots[0]++;
@@ -427,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio, u64 file_start, int contig)
 {
 	struct btrfs_ordered_sum *sums;
-	struct btrfs_sector_sum *sector_sum;
 	struct btrfs_ordered_extent *ordered;
 	char *data;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int bio_index = 0;
+	int index;
 	unsigned long total_bytes = 0;
 	unsigned long this_sum_bytes = 0;
 	u64 offset;
-	u64 disk_bytenr;
 
 	WARN_ON(bio->bi_vcnt <= 0);
 	sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
 	if (!sums)
 		return -ENOMEM;
 
-	sector_sum = sums->sums;
-	disk_bytenr = (u64)bio->bi_sector << 9;
 	sums->len = bio->bi_size;
 	INIT_LIST_HEAD(&sums->list);
 
@@ -454,7 +433,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 
 	ordered = btrfs_lookup_ordered_extent(inode, offset);
 	BUG_ON(!ordered); /* Logic error */
-	sums->bytenr = ordered->start;
+	sums->bytenr = (u64)bio->bi_sector << 9;
+	index = 0;
 
 	while (bio_index < bio->bi_vcnt) {
 		if (!contig)
@@ -473,29 +453,27 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 			sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
 				       GFP_NOFS);
 			BUG_ON(!sums); /* -ENOMEM */
-			sector_sum = sums->sums;
 			sums->len = bytes_left;
 			ordered = btrfs_lookup_ordered_extent(inode, offset);
 			BUG_ON(!ordered); /* Logic error */
-			sums->bytenr = ordered->start;
+			sums->bytenr = ((u64)bio->bi_sector << 9) +
+				       total_bytes;
+			index = 0;
 		}
 
 		data = kmap_atomic(bvec->bv_page);
-		sector_sum->sum = ~(u32)0;
-		sector_sum->sum = btrfs_csum_data(root,
-						  data + bvec->bv_offset,
-						  sector_sum->sum,
-						  bvec->bv_len);
+		sums->sums[index] = ~(u32)0;
+		sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
+						    sums->sums[index],
+						    bvec->bv_len);
 		kunmap_atomic(data);
-		btrfs_csum_final(sector_sum->sum,
-				 (char *)&sector_sum->sum);
-		sector_sum->bytenr = disk_bytenr;
+		btrfs_csum_final(sums->sums[index],
+				 (char *)(sums->sums + index));
 
-		sector_sum++;
 		bio_index++;
+		index++;
 		total_bytes += bvec->bv_len;
 		this_sum_bytes += bvec->bv_len;
-		disk_bytenr += bvec->bv_len;
 		offset += bvec->bv_len;
 		bvec++;
 	}
@@ -516,8 +494,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
  * This calls btrfs_truncate_item with the correct args based on the
  * overlap, and fixes up the key as required.
  */
-static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
-				       struct btrfs_root *root,
+static noinline void truncate_one_csum(struct btrfs_root *root,
 				       struct btrfs_path *path,
 				       struct btrfs_key *key,
 				       u64 bytenr, u64 len)
@@ -542,7 +519,7 @@ static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
 		 */
 		u32 new_size = (bytenr - key->offset) >> blocksize_bits;
 		new_size *= csum_size;
-		btrfs_truncate_item(trans, root, path, new_size, 1);
+		btrfs_truncate_item(root, path, new_size, 1);
 	} else if (key->offset >= bytenr && csum_end > end_byte &&
 		   end_byte > key->offset) {
 		/*
@@ -554,10 +531,10 @@ static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
 		u32 new_size = (csum_end - end_byte) >> blocksize_bits;
 		new_size *= csum_size;
 
-		btrfs_truncate_item(trans, root, path, new_size, 0);
+		btrfs_truncate_item(root, path, new_size, 0);
 
 		key->offset = end_byte;
-		btrfs_set_item_key_safe(trans, root, path, key);
+		btrfs_set_item_key_safe(root, path, key);
 	} else {
 		BUG();
 	}
@@ -672,7 +649,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 
 			key.offset = end_byte - 1;
 		} else {
-			truncate_one_csum(trans, root, path, &key, bytenr, len);
+			truncate_one_csum(root, path, &key, bytenr, len);
 			if (key.offset < bytenr)
 				break;
 		}
@@ -688,41 +665,42 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_ordered_sum *sums)
 {
-	u64 bytenr;
-	int ret;
 	struct btrfs_key file_key;
 	struct btrfs_key found_key;
-	u64 next_offset;
-	u64 total_bytes = 0;
-	int found_next;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item;
 	struct btrfs_csum_item *item_end;
 	struct extent_buffer *leaf = NULL;
+	u64 next_offset;
+	u64 total_bytes = 0;
 	u64 csum_offset;
-	struct btrfs_sector_sum *sector_sum;
+	u64 bytenr;
 	u32 nritems;
 	u32 ins_size;
+	int index = 0;
+	int found_next;
+	int ret;
 	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-
-	sector_sum = sums->sums;
-	trans->adding_csums = 1;
 again:
 	next_offset = (u64)-1;
 	found_next = 0;
+	bytenr = sums->bytenr + total_bytes;
 	file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
-	file_key.offset = sector_sum->bytenr;
-	bytenr = sector_sum->bytenr;
+	file_key.offset = bytenr;
 	btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
 
-	item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
+	item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
 	if (!IS_ERR(item)) {
-		leaf = path->nodes[0];
 		ret = 0;
+		leaf = path->nodes[0];
+		item_end = btrfs_item_ptr(leaf, path->slots[0],
+					  struct btrfs_csum_item);
+		item_end = (struct btrfs_csum_item *)((char *)item_end +
+			   btrfs_item_size_nr(leaf, path->slots[0]));
 		goto found;
 	}
 	ret = PTR_ERR(item);
@@ -789,22 +767,34 @@ again:
 		goto insert;
 	}
 
-	if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
+	if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
 	    csum_size) {
-		u32 diff = (csum_offset + 1) * csum_size;
+		int extend_nr;
+		u64 tmp;
+		u32 diff;
+		u32 free_space;
 
-		/*
-		 * is the item big enough already?  we dropped our lock
-		 * before and need to recheck
-		 */
-		if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
-			goto csum;
+		if (btrfs_leaf_free_space(root, leaf) <
+				 sizeof(struct btrfs_item) + csum_size * 2)
+			goto insert;
+
+		free_space = btrfs_leaf_free_space(root, leaf) -
+					 sizeof(struct btrfs_item) - csum_size;
+		tmp = sums->len - total_bytes;
+		tmp >>= root->fs_info->sb->s_blocksize_bits;
+		WARN_ON(tmp < 1);
+
+		extend_nr = max_t(int, 1, (int)tmp);
+		diff = (csum_offset + extend_nr) * csum_size;
+		diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size);
 
 		diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
-		if (diff != csum_size)
-			goto insert;
+		diff = min(free_space, diff);
+		diff /= csum_size;
+		diff *= csum_size;
 
-		btrfs_extend_item(trans, root, path, diff);
+		btrfs_extend_item(root, path, diff);
+		ret = 0;
 		goto csum;
 	}
 
@@ -812,19 +802,13 @@ insert:
 	btrfs_release_path(path);
 	csum_offset = 0;
 	if (found_next) {
-		u64 tmp = total_bytes + root->sectorsize;
-		u64 next_sector = sector_sum->bytenr;
-		struct btrfs_sector_sum *next = sector_sum + 1;
+		u64 tmp;
 
-		while (tmp < sums->len) {
-			if (next_sector + root->sectorsize != next->bytenr)
-				break;
-			tmp += root->sectorsize;
-			next_sector = next->bytenr;
-			next++;
-		}
-		tmp = min(tmp, next_offset - file_key.offset);
+		tmp = sums->len - total_bytes;
 		tmp >>= root->fs_info->sb->s_blocksize_bits;
+		tmp = min(tmp, (next_offset - file_key.offset) >>
+					 root->fs_info->sb->s_blocksize_bits);
+
 		tmp = max((u64)1, tmp);
 		tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
 		ins_size = csum_size * tmp;
@@ -841,31 +825,25 @@ insert:
 		WARN_ON(1);
 		goto fail_unlock;
 	}
-csum:
 	leaf = path->nodes[0];
+csum:
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
-	ret = 0;
+	item_end = (struct btrfs_csum_item *)((unsigned char *)item +
+				      btrfs_item_size_nr(leaf, path->slots[0]));
 	item = (struct btrfs_csum_item *)((unsigned char *)item +
 					  csum_offset * csum_size);
 found:
-	item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
-	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
-				      btrfs_item_size_nr(leaf, path->slots[0]));
-next_sector:
-
-	write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size);
-
-	total_bytes += root->sectorsize;
-	sector_sum++;
-	if (total_bytes < sums->len) {
-		item = (struct btrfs_csum_item *)((char *)item +
-						  csum_size);
-		if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
-		    sector_sum->bytenr) {
-			bytenr = sector_sum->bytenr;
-			goto next_sector;
-		}
-	}
+	ins_size = (u32)(sums->len - total_bytes) >>
+		   root->fs_info->sb->s_blocksize_bits;
+	ins_size *= csum_size;
+	ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
+			      ins_size);
+	write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
+			    ins_size);
+
+	ins_size /= csum_size;
+	total_bytes += ins_size * root->sectorsize;
+	index += ins_size;
 
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	if (total_bytes < sums->len) {
@@ -874,7 +852,6 @@ next_sector:
 		goto again;
 	}
 out:
-	trans->adding_csums = 0;
 	btrfs_free_path(path);
 	return ret;
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index aeb84469d2c..a005fe2c072 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,17 +24,18 @@
 #include <linux/string.h>
 #include <linux/backing-dev.h>
 #include <linux/mpage.h>
+#include <linux/aio.h>
 #include <linux/falloc.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
 #include <linux/slab.h>
+#include <linux/btrfs.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ioctl.h"
 #include "print-tree.h"
 #include "tree-log.h"
 #include "locking.h"
@@ -192,8 +193,8 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
  * the same inode in the tree, we will merge them together (by
  * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
  */
-void btrfs_requeue_inode_defrag(struct inode *inode,
-				struct inode_defrag *defrag)
+static void btrfs_requeue_inode_defrag(struct inode *inode,
+				       struct inode_defrag *defrag)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
@@ -308,10 +309,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
 		ret = PTR_ERR(inode_root);
 		goto cleanup;
 	}
-	if (btrfs_root_refs(&inode_root->root_item) == 0) {
-		ret = -ENOENT;
-		goto cleanup;
-	}
 
 	key.objectid = defrag->ino;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -374,6 +371,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 
 	atomic_inc(&fs_info->defrag_running);
 	while(1) {
+		/* Pause the auto defragger. */
+		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
+			     &fs_info->fs_state))
+			break;
+
 		if (!__need_auto_defrag(fs_info->tree_root))
 			break;
 
@@ -468,7 +470,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 /*
  * unlocks pages after btrfs_file_write is done with them
  */
-void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 {
 	size_t i;
 	for (i = 0; i < num_pages; i++) {
@@ -492,9 +494,9 @@ void btrfs_drop_pages(struct page **pages, size_t num_pages)
  * doing real data extents, marking pages dirty and delalloc as required.
  */
 int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
-		      struct page **pages, size_t num_pages,
-		      loff_t pos, size_t write_bytes,
-		      struct extent_state **cached)
+			     struct page **pages, size_t num_pages,
+			     loff_t pos, size_t write_bytes,
+			     struct extent_state **cached)
 {
 	int err = 0;
 	int i;
@@ -505,8 +507,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 	loff_t isize = i_size_read(inode);
 
 	start_pos = pos & ~((u64)root->sectorsize - 1);
-	num_bytes = (write_bytes + pos - start_pos +
-		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+	num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
 
 	end_of_last_block = start_pos + num_bytes - 1;
 	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
@@ -548,6 +549,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 	int testend = 1;
 	unsigned long flags;
 	int compressed = 0;
+	bool modified;
 
 	WARN_ON(end < start);
 	if (end == (u64)-1) {
@@ -557,6 +559,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 	while (1) {
 		int no_splits = 0;
 
+		modified = false;
 		if (!split)
 			split = alloc_extent_map();
 		if (!split2)
@@ -587,6 +590,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		}
 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+		clear_bit(EXTENT_FLAG_LOGGING, &flags);
+		modified = !list_empty(&em->list);
 		remove_extent_mapping(em_tree, em);
 		if (no_splits)
 			goto next;
@@ -602,15 +607,15 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 				split->block_len = em->block_len;
 			else
 				split->block_len = split->len;
+			split->ram_bytes = em->ram_bytes;
 			split->orig_block_len = max(split->block_len,
 						    em->orig_block_len);
 			split->generation = gen;
 			split->bdev = em->bdev;
 			split->flags = flags;
 			split->compress_type = em->compress_type;
-			ret = add_extent_mapping(em_tree, split);
+			ret = add_extent_mapping(em_tree, split, modified);
 			BUG_ON(ret); /* Logic error */
-			list_move(&split->list, &em_tree->modified_extents);
 			free_extent_map(split);
 			split = split2;
 			split2 = NULL;
@@ -627,6 +632,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->generation = gen;
 			split->orig_block_len = max(em->block_len,
 						    em->orig_block_len);
+			split->ram_bytes = em->ram_bytes;
 
 			if (compressed) {
 				split->block_len = em->block_len;
@@ -638,9 +644,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 				split->orig_start = em->orig_start;
 			}
 
-			ret = add_extent_mapping(em_tree, split);
+			ret = add_extent_mapping(em_tree, split, modified);
 			BUG_ON(ret); /* Logic error */
-			list_move(&split->list, &em_tree->modified_extents);
 			free_extent_map(split);
 			split = NULL;
 		}
@@ -816,7 +821,7 @@ next_slot:
 
 			memcpy(&new_key, &key, sizeof(new_key));
 			new_key.offset = end;
-			btrfs_set_item_key_safe(trans, root, path, &new_key);
+			btrfs_set_item_key_safe(root, path, &new_key);
 
 			extent_offset += end - key.offset;
 			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
@@ -1032,7 +1037,7 @@ again:
 				     ino, bytenr, orig_offset,
 				     &other_start, &other_end)) {
 			new_key.offset = end;
-			btrfs_set_item_key_safe(trans, root, path, &new_key);
+			btrfs_set_item_key_safe(root, path, &new_key);
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
 			btrfs_set_file_extent_generation(leaf, fi,
@@ -1066,7 +1071,7 @@ again:
 							 trans->transid);
 			path->slots[0]++;
 			new_key.offset = start;
-			btrfs_set_item_key_safe(trans, root, path, &new_key);
+			btrfs_set_item_key_safe(root, path, &new_key);
 
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
@@ -1225,7 +1230,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 	struct extent_state *cached_state = NULL;
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
-	struct inode *inode = fdentry(file)->d_inode;
+	struct inode *inode = file_inode(file);
 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 	int err = 0;
 	int faili = 0;
@@ -1308,17 +1313,69 @@ fail:
 
 }
 
+static noinline int check_can_nocow(struct inode *inode, loff_t pos,
+				    size_t *write_bytes)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_ordered_extent *ordered;
+	u64 lockstart, lockend;
+	u64 num_bytes;
+	int ret;
+
+	lockstart = round_down(pos, root->sectorsize);
+	lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+
+	while (1) {
+		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+		if (!ordered) {
+			break;
+		}
+		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+		return PTR_ERR(trans);
+	}
+
+	num_bytes = lockend - lockstart + 1;
+	ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL,
+			       NULL);
+	btrfs_end_transaction(trans, root);
+	if (ret <= 0) {
+		ret = 0;
+	} else {
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 EXTENT_DIRTY | EXTENT_DELALLOC |
+				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+				 NULL, GFP_NOFS);
+		*write_bytes = min_t(size_t, *write_bytes, num_bytes);
+	}
+
+	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+
+	return ret;
+}
+
 static noinline ssize_t __btrfs_buffered_write(struct file *file,
 					       struct iov_iter *i,
 					       loff_t pos)
 {
-	struct inode *inode = fdentry(file)->d_inode;
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page **pages = NULL;
+	u64 release_bytes = 0;
 	unsigned long first_index;
 	size_t num_written = 0;
 	int nrptrs;
 	int ret = 0;
+	bool only_release_metadata = false;
 	bool force_page_uptodate = false;
 
 	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
@@ -1339,6 +1396,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 					 offset);
 		size_t num_pages = (write_bytes + offset +
 				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		size_t reserve_bytes;
 		size_t dirty_pages;
 		size_t copied;
 
@@ -1353,11 +1411,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 			break;
 		}
 
-		ret = btrfs_delalloc_reserve_space(inode,
-					num_pages << PAGE_CACHE_SHIFT);
+		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+		ret = btrfs_check_data_free_space(inode, reserve_bytes);
+		if (ret == -ENOSPC &&
+		    (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+					      BTRFS_INODE_PREALLOC))) {
+			ret = check_can_nocow(inode, pos, &write_bytes);
+			if (ret > 0) {
+				only_release_metadata = true;
+				/*
+				 * our prealloc extent may be smaller than
+				 * write_bytes, so scale down.
+				 */
+				num_pages = (write_bytes + offset +
+					     PAGE_CACHE_SIZE - 1) >>
+					PAGE_CACHE_SHIFT;
+				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+				ret = 0;
+			} else {
+				ret = -ENOSPC;
+			}
+		}
+
 		if (ret)
 			break;
 
+		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
+		if (ret) {
+			if (!only_release_metadata)
+				btrfs_free_reserved_data_space(inode,
+							       reserve_bytes);
+			break;
+		}
+
+		release_bytes = reserve_bytes;
+
 		/*
 		 * This is going to setup the pages array with the number of
 		 * pages we want, so we don't really need to worry about the
@@ -1366,11 +1454,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		ret = prepare_pages(root, file, pages, num_pages,
 				    pos, first_index, write_bytes,
 				    force_page_uptodate);
-		if (ret) {
-			btrfs_delalloc_release_space(inode,
-					num_pages << PAGE_CACHE_SHIFT);
+		if (ret)
 			break;
-		}
 
 		copied = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, i);
@@ -1400,30 +1485,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		 * managed to copy.
 		 */
 		if (num_pages > dirty_pages) {
+			release_bytes = (num_pages - dirty_pages) <<
+				PAGE_CACHE_SHIFT;
 			if (copied > 0) {
 				spin_lock(&BTRFS_I(inode)->lock);
 				BTRFS_I(inode)->outstanding_extents++;
 				spin_unlock(&BTRFS_I(inode)->lock);
 			}
-			btrfs_delalloc_release_space(inode,
-					(num_pages - dirty_pages) <<
-					PAGE_CACHE_SHIFT);
+			if (only_release_metadata)
+				btrfs_delalloc_release_metadata(inode,
+								release_bytes);
+			else
+				btrfs_delalloc_release_space(inode,
+							     release_bytes);
 		}
 
+		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
 		if (copied > 0) {
 			ret = btrfs_dirty_pages(root, inode, pages,
 						dirty_pages, pos, copied,
 						NULL);
 			if (ret) {
-				btrfs_delalloc_release_space(inode,
-					dirty_pages << PAGE_CACHE_SHIFT);
 				btrfs_drop_pages(pages, num_pages);
 				break;
 			}
 		}
 
+		release_bytes = 0;
 		btrfs_drop_pages(pages, num_pages);
 
+		if (only_release_metadata && copied > 0) {
+			u64 lockstart = round_down(pos, root->sectorsize);
+			u64 lockend = lockstart +
+				(dirty_pages << PAGE_CACHE_SHIFT) - 1;
+
+			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				       lockend, EXTENT_NORESERVE, NULL,
+				       NULL, GFP_NOFS);
+			only_release_metadata = false;
+		}
+
 		cond_resched();
 
 		balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1436,6 +1537,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
 	kfree(pages);
 
+	if (release_bytes) {
+		if (only_release_metadata)
+			btrfs_delalloc_release_metadata(inode, release_bytes);
+		else
+			btrfs_delalloc_release_space(inode, release_bytes);
+	}
+
 	return num_written ? num_written : ret;
 }
 
@@ -1500,7 +1608,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 				    unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = fdentry(file)->d_inode;
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	loff_t *ppos = &iocb->ki_pos;
 	u64 start_pos;
@@ -1509,8 +1617,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	size_t count, ocount;
 	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
 
-	sb_start_write(inode->i_sb);
-
 	mutex_lock(&inode->i_mutex);
 
 	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
@@ -1544,7 +1650,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	 * although we have opened a file as writable, we have
 	 * to stop this write operation to ensure FS consistency.
 	 */
-	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
 		mutex_unlock(&inode->i_mutex);
 		err = -EROFS;
 		goto out;
@@ -1612,7 +1718,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	if (sync)
 		atomic_dec(&BTRFS_I(inode)->sync_writers);
 out:
-	sb_end_write(inode->i_sb);
 	current->backing_dev_info = NULL;
 	return num_written ? num_written : err;
 }
@@ -1627,7 +1732,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
 	 */
 	if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
 			       &BTRFS_I(inode)->runtime_flags)) {
-		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+		struct btrfs_trans_handle *trans;
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+
+		/*
+		 * We need to block on a committing transaction to keep us from
+		 * throwing a ordered operation on to the list and causing
+		 * something like sync to deadlock trying to flush out this
+		 * inode.
+		 */
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+		btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
+		btrfs_end_transaction(trans, root);
 		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
 			filemap_flush(inode->i_mapping);
 	}
@@ -1654,16 +1772,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 	struct btrfs_trans_handle *trans;
+	bool full_sync = 0;
 
 	trace_btrfs_sync_file(file, datasync);
 
 	/*
 	 * We write the dirty pages in the range and wait until they complete
 	 * out of the ->i_mutex. If so, we can flush the dirty pages by
-	 * multi-task, and make the performance up.
+	 * multi-task, and make the performance up.  See
+	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
 	 */
 	atomic_inc(&BTRFS_I(inode)->sync_writers);
-	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+			     &BTRFS_I(inode)->runtime_flags))
+		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
 	atomic_dec(&BTRFS_I(inode)->sync_writers);
 	if (ret)
 		return ret;
@@ -1675,7 +1798,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * range being left.
 	 */
 	atomic_inc(&root->log_batch);
-	btrfs_wait_ordered_range(inode, start, end - start + 1);
+	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			     &BTRFS_I(inode)->runtime_flags);
+	if (full_sync)
+		btrfs_wait_ordered_range(inode, start, end - start + 1);
 	atomic_inc(&root->log_batch);
 
 	/*
@@ -1742,13 +1868,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	if (ret != BTRFS_NO_LOG_SYNC) {
 		if (ret > 0) {
+			/*
+			 * If we didn't already wait for ordered extents we need
+			 * to do that now.
+			 */
+			if (!full_sync)
+				btrfs_wait_ordered_range(inode, start,
+							 end - start + 1);
 			ret = btrfs_commit_transaction(trans, root);
 		} else {
 			ret = btrfs_sync_log(trans, root);
-			if (ret == 0)
+			if (ret == 0) {
 				ret = btrfs_end_transaction(trans, root);
-			else
+			} else {
+				if (!full_sync)
+					btrfs_wait_ordered_range(inode, start,
+								 end -
+								 start + 1);
 				ret = btrfs_commit_transaction(trans, root);
+			}
 		}
 	} else {
 		ret = btrfs_end_transaction(trans, root);
@@ -1847,7 +1985,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
 
 		path->slots[0]++;
 		key.offset = offset;
-		btrfs_set_item_key_safe(trans, root, path, &key);
+		btrfs_set_item_key_safe(root, path, &key);
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
@@ -1877,6 +2015,7 @@ out:
 	} else {
 		hole_em->start = offset;
 		hole_em->len = end - offset;
+		hole_em->ram_bytes = hole_em->len;
 		hole_em->orig_start = offset;
 
 		hole_em->block_start = EXTENT_MAP_HOLE;
@@ -1889,10 +2028,7 @@ out:
 		do {
 			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
 			write_lock(&em_tree->lock);
-			ret = add_extent_mapping(em_tree, hole_em);
-			if (!ret)
-				list_move(&hole_em->list,
-					  &em_tree->modified_extents);
+			ret = add_extent_mapping(em_tree, hole_em, 1);
 			write_unlock(&em_tree->lock);
 		} while (ret == -EEXIST);
 		free_extent_map(hole_em);
@@ -2102,8 +2238,9 @@ out:
 static long btrfs_fallocate(struct file *file, int mode,
 			    loff_t offset, loff_t len)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct extent_state *cached_state = NULL;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 cur_offset;
 	u64 last_byte;
 	u64 alloc_start;
@@ -2131,12 +2268,11 @@ static long btrfs_fallocate(struct file *file, int mode,
 	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
 	if (ret)
 		return ret;
-
-	/*
-	 * wait for ordered IO before we have any locks.  We'll loop again
-	 * below with the locks held.
-	 */
-	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+	if (root->fs_info->quota_enabled) {
+		ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start);
+		if (ret)
+			goto out_reserve_fail;
+	}
 
 	mutex_lock(&inode->i_mutex);
 	ret = inode_newsize_ok(inode, alloc_end);
@@ -2148,8 +2284,23 @@ static long btrfs_fallocate(struct file *file, int mode,
 					alloc_start);
 		if (ret)
 			goto out;
+	} else {
+		/*
+		 * If we are fallocating from the end of the file onward we
+		 * need to zero out the end of the page if i_size lands in the
+		 * middle of a page.
+		 */
+		ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
+		if (ret)
+			goto out;
 	}
 
+	/*
+	 * wait for ordered IO before we have any locks.  We'll loop again
+	 * below with the locks held.
+	 */
+	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
 	locked_end = alloc_end - 1;
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
@@ -2234,6 +2385,9 @@ static long btrfs_fallocate(struct file *file, int mode,
 			     &cached_state, GFP_NOFS);
 out:
 	mutex_unlock(&inode->i_mutex);
+	if (root->fs_info->quota_enabled)
+		btrfs_qgroup_free(root, alloc_end - alloc_start);
+out_reserve_fail:
 	/* Let go of our reservation. */
 	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 	return ret;
@@ -2379,20 +2533,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
 		}
 	}
 
-	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
-		offset = -EINVAL;
-		goto out;
-	}
-	if (offset > inode->i_sb->s_maxbytes) {
-		offset = -EINVAL;
-		goto out;
-	}
-
-	/* Special lock needed here? */
-	if (offset != file->f_pos) {
-		file->f_pos = offset;
-		file->f_version = 0;
-	}
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return offset;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0be7a8742a4..b21a3cd667d 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -104,7 +104,8 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
 
 	spin_lock(&block_group->lock);
 	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
-		printk(KERN_INFO "Old style space inode found, converting.\n");
+		btrfs_info(root->fs_info,
+			"Old style space inode found, converting.");
 		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
 			BTRFS_INODE_NODATACOW;
 		block_group->disk_cache_state = BTRFS_DC_CLEAR;
@@ -119,9 +120,10 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
 	return inode;
 }
 
-int __create_free_space_inode(struct btrfs_root *root,
-			      struct btrfs_trans_handle *trans,
-			      struct btrfs_path *path, u64 ino, u64 offset)
+static int __create_free_space_inode(struct btrfs_root *root,
+				     struct btrfs_trans_handle *trans,
+				     struct btrfs_path *path,
+				     u64 ino, u64 offset)
 {
 	struct btrfs_key key;
 	struct btrfs_disk_key disk_key;
@@ -195,30 +197,32 @@ int create_free_space_inode(struct btrfs_root *root,
 					 block_group->key.objectid);
 }
 
-int btrfs_truncate_free_space_cache(struct btrfs_root *root,
-				    struct btrfs_trans_handle *trans,
-				    struct btrfs_path *path,
-				    struct inode *inode)
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+				       struct btrfs_block_rsv *rsv)
 {
-	struct btrfs_block_rsv *rsv;
 	u64 needed_bytes;
-	loff_t oldsize;
-	int ret = 0;
-
-	rsv = trans->block_rsv;
-	trans->block_rsv = &root->fs_info->global_block_rsv;
+	int ret;
 
 	/* 1 for slack space, 1 for updating the inode */
 	needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
 		btrfs_calc_trans_metadata_size(root, 1);
 
-	spin_lock(&trans->block_rsv->lock);
-	if (trans->block_rsv->reserved < needed_bytes) {
-		spin_unlock(&trans->block_rsv->lock);
-		trans->block_rsv = rsv;
-		return -ENOSPC;
-	}
-	spin_unlock(&trans->block_rsv->lock);
+	spin_lock(&rsv->lock);
+	if (rsv->reserved < needed_bytes)
+		ret = -ENOSPC;
+	else
+		ret = 0;
+	spin_unlock(&rsv->lock);
+	return ret;
+}
+
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+				    struct btrfs_trans_handle *trans,
+				    struct btrfs_path *path,
+				    struct inode *inode)
+{
+	loff_t oldsize;
+	int ret = 0;
 
 	oldsize = i_size_read(inode);
 	btrfs_i_size_write(inode, 0);
@@ -230,9 +234,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	 */
 	ret = btrfs_truncate_inode_items(trans, root, inode,
 					 0, BTRFS_EXTENT_DATA_KEY);
-
 	if (ret) {
-		trans->block_rsv = rsv;
 		btrfs_abort_transaction(trans, root, ret);
 		return ret;
 	}
@@ -240,7 +242,6 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	ret = btrfs_update_inode(trans, root, inode);
 	if (ret)
 		btrfs_abort_transaction(trans, root, ret);
-	trans->block_rsv = rsv;
 
 	return ret;
 }
@@ -431,7 +432,7 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
 	if (index == 0)
 		offset = sizeof(u32) * io_ctl->num_pages;
 
-	crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
+	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
 			      PAGE_CACHE_SIZE - offset);
 	btrfs_csum_final(crc, (char *)&crc);
 	io_ctl_unmap_page(io_ctl);
@@ -461,7 +462,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
 	kunmap(io_ctl->pages[0]);
 
 	io_ctl_map_page(io_ctl, 0);
-	crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
+	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
 			      PAGE_CACHE_SIZE - offset);
 	btrfs_csum_final(crc, (char *)&crc);
 	if (val != crc) {
@@ -624,9 +625,9 @@ next:
 	spin_unlock(&ctl->tree_lock);
 }
 
-int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
-			    struct btrfs_free_space_ctl *ctl,
-			    struct btrfs_path *path, u64 offset)
+static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
+				   struct btrfs_free_space_ctl *ctl,
+				   struct btrfs_path *path, u64 offset)
 {
 	struct btrfs_free_space_header *header;
 	struct extent_buffer *leaf;
@@ -669,10 +670,11 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 	btrfs_release_path(path);
 
 	if (BTRFS_I(inode)->generation != generation) {
-		printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
-		       " not match free space cache generation (%llu)\n",
-		       (unsigned long long)BTRFS_I(inode)->generation,
-		       (unsigned long long)generation);
+		btrfs_err(root->fs_info,
+			"free space inode generation (%llu) "
+			"did not match free space cache generation (%llu)",
+			(unsigned long long)BTRFS_I(inode)->generation,
+			(unsigned long long)generation);
 		return 0;
 	}
 
@@ -721,8 +723,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 			ret = link_free_space(ctl, e);
 			spin_unlock(&ctl->tree_lock);
 			if (ret) {
-				printk(KERN_ERR "Duplicate entries in "
-				       "free space cache, dumping\n");
+				btrfs_err(root->fs_info,
+					"Duplicate entries in free space cache, dumping");
 				kmem_cache_free(btrfs_free_space_cachep, e);
 				goto free_cache;
 			}
@@ -741,8 +743,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 			ctl->op->recalc_thresholds(ctl);
 			spin_unlock(&ctl->tree_lock);
 			if (ret) {
-				printk(KERN_ERR "Duplicate entries in "
-				       "free space cache, dumping\n");
+				btrfs_err(root->fs_info,
+					"Duplicate entries in free space cache, dumping");
 				kmem_cache_free(btrfs_free_space_cachep, e);
 				goto free_cache;
 			}
@@ -833,8 +835,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
 
 	if (!matched) {
 		__btrfs_remove_free_space_cache(ctl);
-		printk(KERN_ERR "block group %llu has an wrong amount of free "
-		       "space\n", block_group->key.objectid);
+		btrfs_err(fs_info, "block group %llu has wrong amount of free space",
+			block_group->key.objectid);
 		ret = -1;
 	}
 out:
@@ -845,8 +847,8 @@ out:
 		spin_unlock(&block_group->lock);
 		ret = 0;
 
-		printk(KERN_ERR "btrfs: failed to load free space cache "
-		       "for block group %llu\n", block_group->key.objectid);
+		btrfs_err(fs_info, "failed to load free space cache for block group %llu",
+			block_group->key.objectid);
 	}
 
 	iput(inode);
@@ -866,11 +868,11 @@ out:
  * on mount.  This will return 0 if it was successfull in writing the cache out,
  * and -1 if it was not.
  */
-int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
-			    struct btrfs_free_space_ctl *ctl,
-			    struct btrfs_block_group_cache *block_group,
-			    struct btrfs_trans_handle *trans,
-			    struct btrfs_path *path, u64 offset)
+static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+				   struct btrfs_free_space_ctl *ctl,
+				   struct btrfs_block_group_cache *block_group,
+				   struct btrfs_trans_handle *trans,
+				   struct btrfs_path *path, u64 offset)
 {
 	struct btrfs_free_space_header *header;
 	struct extent_buffer *leaf;
@@ -917,10 +919,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
 	/* Make sure we can fit our crcs into the first page */
 	if (io_ctl.check_crcs &&
-	    (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
-		WARN_ON(1);
+	    (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
 		goto out_nospc;
-	}
 
 	io_ctl_set_generation(&io_ctl, trans->transid);
 
@@ -1104,8 +1104,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 		spin_unlock(&block_group->lock);
 		ret = 0;
 #ifdef DEBUG
-		printk(KERN_ERR "btrfs: failed to write free space cache "
-		       "for block group %llu\n", block_group->key.objectid);
+		btrfs_err(root->fs_info,
+			"failed to write free space cache for block group %llu",
+			block_group->key.objectid);
 #endif
 	}
 
@@ -1356,6 +1357,8 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
 	int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
 
+	max_bitmaps = max(max_bitmaps, 1);
+
 	BUG_ON(ctl->total_bitmaps > max_bitmaps);
 
 	/*
@@ -1463,10 +1466,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 }
 
 static struct btrfs_free_space *
-find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
+find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+		unsigned long align)
 {
 	struct btrfs_free_space *entry;
 	struct rb_node *node;
+	u64 ctl_off;
+	u64 tmp;
+	u64 align_off;
 	int ret;
 
 	if (!ctl->free_space_offset.rb_node)
@@ -1481,15 +1488,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
 		if (entry->bytes < *bytes)
 			continue;
 
+		/* make sure the space returned is big enough
+		 * to match our requested alignment
+		 */
+		if (*bytes >= align) {
+			ctl_off = entry->offset - ctl->start;
+			tmp = ctl_off + align - 1;;
+			do_div(tmp, align);
+			tmp = tmp * align + ctl->start;
+			align_off = tmp - entry->offset;
+		} else {
+			align_off = 0;
+			tmp = entry->offset;
+		}
+
+		if (entry->bytes < *bytes + align_off)
+			continue;
+
 		if (entry->bitmap) {
-			ret = search_bitmap(ctl, entry, offset, bytes);
-			if (!ret)
+			ret = search_bitmap(ctl, entry, &tmp, bytes);
+			if (!ret) {
+				*offset = tmp;
 				return entry;
+			}
 			continue;
 		}
 
-		*offset = entry->offset;
-		*bytes = entry->bytes;
+		*offset = tmp;
+		*bytes = entry->bytes - align_off;
 		return entry;
 	}
 
@@ -1539,7 +1565,8 @@ again:
 	search_bytes = ctl->unit;
 	search_bytes = min(search_bytes, end - search_start + 1);
 	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
-	BUG_ON(ret < 0 || search_start != *offset);
+	if (ret < 0 || search_start != *offset)
+		return -EINVAL;
 
 	/* We may have found more bits than what we need */
 	search_bytes = min(search_bytes, *bytes);
@@ -1636,10 +1663,14 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
 	}
 
 	/*
-	 * some block groups are so tiny they can't be enveloped by a bitmap, so
-	 * don't even bother to create a bitmap for this
+	 * The original block groups from mkfs can be really small, like 8
+	 * megabytes, so don't bother with a bitmap for those entries.  However
+	 * some block groups can be smaller than what a bitmap would cover but
+	 * are still large enough that they could overflow the 32k memory limit,
+	 * so allow those block groups to still be allowed to have a bitmap
+	 * entry.
 	 */
-	if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
+	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
 		return false;
 
 	return true;
@@ -1941,7 +1972,6 @@ again:
 		re_search = true;
 		goto again;
 	}
-	BUG_ON(ret); /* logic error */
 out_lock:
 	spin_unlock(&ctl->tree_lock);
 out:
@@ -2035,7 +2065,8 @@ out:
 	return 0;
 }
 
-void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl)
+static void __btrfs_remove_free_space_cache_locked(
+				struct btrfs_free_space_ctl *ctl)
 {
 	struct btrfs_free_space *info;
 	struct rb_node *node;
@@ -2095,9 +2126,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 	struct btrfs_free_space *entry = NULL;
 	u64 bytes_search = bytes + empty_size;
 	u64 ret = 0;
+	u64 align_gap = 0;
+	u64 align_gap_len = 0;
 
 	spin_lock(&ctl->tree_lock);
-	entry = find_free_space(ctl, &offset, &bytes_search);
+	entry = find_free_space(ctl, &offset, &bytes_search,
+				block_group->full_stripe_len);
 	if (!entry)
 		goto out;
 
@@ -2107,9 +2141,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 		if (!entry->bytes)
 			free_bitmap(ctl, entry);
 	} else {
+
 		unlink_free_space(ctl, entry);
-		entry->offset += bytes;
-		entry->bytes -= bytes;
+		align_gap_len = offset - entry->offset;
+		align_gap = entry->offset;
+
+		entry->offset = offset + bytes;
+		WARN_ON(entry->bytes < bytes + align_gap_len);
+
+		entry->bytes -= bytes + align_gap_len;
 		if (!entry->bytes)
 			kmem_cache_free(btrfs_free_space_cachep, entry);
 		else
@@ -2119,6 +2159,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 out:
 	spin_unlock(&ctl->tree_lock);
 
+	if (align_gap_len)
+		__btrfs_add_free_space(ctl, align_gap, align_gap_len);
 	return ret;
 }
 
@@ -2891,8 +2933,9 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 	ret = __load_free_space_cache(root, inode, ctl, path, 0);
 
 	if (ret < 0)
-		printk(KERN_ERR "btrfs: failed to load free ino cache for "
-		       "root %llu\n", root->root_key.objectid);
+		btrfs_err(fs_info,
+			"failed to load free ino cache for root %llu",
+			root->root_key.objectid);
 out_put:
 	iput(inode);
 out:
@@ -2919,11 +2962,534 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
 	if (ret) {
 		btrfs_delalloc_release_metadata(inode, inode->i_size);
 #ifdef DEBUG
-		printk(KERN_ERR "btrfs: failed to write free ino cache "
-		       "for root %llu\n", root->root_key.objectid);
+		btrfs_err(root->fs_info,
+			"failed to write free ino cache for root %llu",
+			root->root_key.objectid);
 #endif
 	}
 
 	iput(inode);
 	return ret;
 }
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+static struct btrfs_block_group_cache *init_test_block_group(void)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = kzalloc(sizeof(*cache), GFP_NOFS);
+	if (!cache)
+		return NULL;
+	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+					GFP_NOFS);
+	if (!cache->free_space_ctl) {
+		kfree(cache);
+		return NULL;
+	}
+
+	cache->key.objectid = 0;
+	cache->key.offset = 1024 * 1024 * 1024;
+	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+	cache->sectorsize = 4096;
+
+	spin_lock_init(&cache->lock);
+	INIT_LIST_HEAD(&cache->list);
+	INIT_LIST_HEAD(&cache->cluster_list);
+	INIT_LIST_HEAD(&cache->new_bg_list);
+
+	btrfs_init_free_space_ctl(cache);
+
+	return cache;
+}
+
+/*
+ * Checks to see if the given range is in the free space cache.  This is really
+ * just used to check the absence of space, so if there is free space in the
+ * range at all we will return 1.
+ */
+static int check_exists(struct btrfs_block_group_cache *cache, u64 offset,
+			u64 bytes)
+{
+	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+	struct btrfs_free_space *info;
+	int ret = 0;
+
+	spin_lock(&ctl->tree_lock);
+	info = tree_search_offset(ctl, offset, 0, 0);
+	if (!info) {
+		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+					  1, 0);
+		if (!info)
+			goto out;
+	}
+
+have_info:
+	if (info->bitmap) {
+		u64 bit_off, bit_bytes;
+		struct rb_node *n;
+		struct btrfs_free_space *tmp;
+
+		bit_off = offset;
+		bit_bytes = ctl->unit;
+		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
+		if (!ret) {
+			if (bit_off == offset) {
+				ret = 1;
+				goto out;
+			} else if (bit_off > offset &&
+				   offset + bytes > bit_off) {
+				ret = 1;
+				goto out;
+			}
+		}
+
+		n = rb_prev(&info->offset_index);
+		while (n) {
+			tmp = rb_entry(n, struct btrfs_free_space,
+				       offset_index);
+			if (tmp->offset + tmp->bytes < offset)
+				break;
+			if (offset + bytes < tmp->offset) {
+				n = rb_prev(&info->offset_index);
+				continue;
+			}
+			info = tmp;
+			goto have_info;
+		}
+
+		n = rb_next(&info->offset_index);
+		while (n) {
+			tmp = rb_entry(n, struct btrfs_free_space,
+				       offset_index);
+			if (offset + bytes < tmp->offset)
+				break;
+			if (tmp->offset + tmp->bytes < offset) {
+				n = rb_next(&info->offset_index);
+				continue;
+			}
+			info = tmp;
+			goto have_info;
+		}
+
+		goto out;
+	}
+
+	if (info->offset == offset) {
+		ret = 1;
+		goto out;
+	}
+
+	if (offset > info->offset && offset < info->offset + info->bytes)
+		ret = 1;
+out:
+	spin_unlock(&ctl->tree_lock);
+	return ret;
+}
+
+/*
+ * Use this if you need to make a bitmap or extent entry specifically, it
+ * doesn't do any of the merging that add_free_space does, this acts a lot like
+ * how the free space cache loading stuff works, so you can get really weird
+ * configurations.
+ */
+static int add_free_space_entry(struct btrfs_block_group_cache *cache,
+				u64 offset, u64 bytes, bool bitmap)
+{
+	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+	struct btrfs_free_space *info = NULL, *bitmap_info;
+	void *map = NULL;
+	u64 bytes_added;
+	int ret;
+
+again:
+	if (!info) {
+		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
+		if (!info)
+			return -ENOMEM;
+	}
+
+	if (!bitmap) {
+		spin_lock(&ctl->tree_lock);
+		info->offset = offset;
+		info->bytes = bytes;
+		ret = link_free_space(ctl, info);
+		spin_unlock(&ctl->tree_lock);
+		if (ret)
+			kmem_cache_free(btrfs_free_space_cachep, info);
+		return ret;
+	}
+
+	if (!map) {
+		map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+		if (!map) {
+			kmem_cache_free(btrfs_free_space_cachep, info);
+			return -ENOMEM;
+		}
+	}
+
+	spin_lock(&ctl->tree_lock);
+	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+					 1, 0);
+	if (!bitmap_info) {
+		info->bitmap = map;
+		map = NULL;
+		add_new_bitmap(ctl, info, offset);
+		bitmap_info = info;
+	}
+
+	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+	bytes -= bytes_added;
+	offset += bytes_added;
+	spin_unlock(&ctl->tree_lock);
+
+	if (bytes)
+		goto again;
+
+	if (map)
+		kfree(map);
+	return 0;
+}
+
+#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__)
+
+/*
+ * This test just does basic sanity checking, making sure we can add an exten
+ * entry and remove space from either end and the middle, and make sure we can
+ * remove space that covers adjacent extent entries.
+ */
+static int test_extents(struct btrfs_block_group_cache *cache)
+{
+	int ret = 0;
+
+	test_msg("Running extent only tests\n");
+
+	/* First just make sure we can remove an entire entry */
+	ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error adding initial extents %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing extent %d\n", ret);
+		return ret;
+	}
+
+	if (check_exists(cache, 0, 4 * 1024 * 1024)) {
+		test_msg("Full remove left some lingering space\n");
+		return -1;
+	}
+
+	/* Ok edge and middle cases now */
+	ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error adding half extent %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing tail end %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing front end %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
+	if (ret) {
+		test_msg("Error removing middle piece %d\n", ret);
+		return ret;
+	}
+
+	if (check_exists(cache, 0, 1 * 1024 * 1024)) {
+		test_msg("Still have space at the front\n");
+		return -1;
+	}
+
+	if (check_exists(cache, 2 * 1024 * 1024, 4096)) {
+		test_msg("Still have space in the middle\n");
+		return -1;
+	}
+
+	if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
+		test_msg("Still have space at the end\n");
+		return -1;
+	}
+
+	/* Cleanup */
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	return 0;
+}
+
+static int test_bitmaps(struct btrfs_block_group_cache *cache)
+{
+	u64 next_bitmap_offset;
+	int ret;
+
+	test_msg("Running bitmap only tests\n");
+
+	ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't create a bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing bitmap full range %d\n", ret);
+		return ret;
+	}
+
+	if (check_exists(cache, 0, 4 * 1024 * 1024)) {
+		test_msg("Left some space in bitmap\n");
+		return -1;
+	}
+
+	ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add to our bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove middle chunk %d\n", ret);
+		return ret;
+	}
+
+	/*
+	 * The first bitmap we have starts at offset 0 so the next one is just
+	 * at the end of the first bitmap.
+	 */
+	next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+
+	/* Test a bit straddling two bitmaps */
+	ret = add_free_space_entry(cache, next_bitmap_offset -
+				   (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add space that straddles two bitmaps %d\n",
+				ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, next_bitmap_offset -
+				      (1 * 1024 * 1024), 2 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove overlapping space %d\n", ret);
+		return ret;
+	}
+
+	if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
+			 2 * 1024 * 1024)) {
+		test_msg("Left some space when removing overlapping\n");
+		return -1;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	return 0;
+}
+
+/* This is the high grade jackassery */
+static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
+{
+	u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+	int ret;
+
+	test_msg("Running bitmap and extent tests\n");
+
+	/*
+	 * First let's do something simple, an extent at the same offset as the
+	 * bitmap, but the free space completely in the extent and then
+	 * completely in the bitmap.
+	 */
+	ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't create bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove extent entry %d\n", ret);
+		return ret;
+	}
+
+	if (check_exists(cache, 0, 1 * 1024 * 1024)) {
+		test_msg("Left remnants after our remove\n");
+		return -1;
+	}
+
+	/* Now to add back the extent entry and remove from the bitmap */
+	ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't re-add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove from bitmap %d\n", ret);
+		return ret;
+	}
+
+	if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
+		test_msg("Left remnants in the bitmap\n");
+		return -1;
+	}
+
+	/*
+	 * Ok so a little more evil, extent entry and bitmap at the same offset,
+	 * removing an overlapping chunk.
+	 */
+	ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add to a bitmap %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove overlapping space %d\n", ret);
+		return ret;
+	}
+
+	if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
+		test_msg("Left over peices after removing overlapping\n");
+		return -1;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	/* Now with the extent entry offset into the bitmap */
+	ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add space to the bitmap %d\n", ret);
+		return ret;
+	}
+
+	ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent to the cache %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Problem removing overlapping space %d\n", ret);
+		return ret;
+	}
+
+	if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
+		test_msg("Left something behind when removing space");
+		return -1;
+	}
+
+	/*
+	 * This has blown up in the past, the extent entry starts before the
+	 * bitmap entry, but we're trying to remove an offset that falls
+	 * completely within the bitmap range and is in both the extent entry
+	 * and the bitmap entry, looks like this
+	 *
+	 *   [ extent ]
+	 *      [ bitmap ]
+	 *        [ del ]
+	 */
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
+				   4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add bitmap %d\n", ret);
+		return ret;
+	}
+
+	ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
+				   5 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
+				      5 * 1024 * 1024);
+	if (ret) {
+		test_msg("Failed to free our space %d\n", ret);
+		return ret;
+	}
+
+	if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
+			 5 * 1024 * 1024)) {
+		test_msg("Left stuff over\n");
+		return -1;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	/*
+	 * This blew up before, we have part of the free space in a bitmap and
+	 * then the entirety of the rest of the space in an extent.  This used
+	 * to return -EAGAIN back from btrfs_remove_extent, make sure this
+	 * doesn't happen.
+	 */
+	ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing bitmap and extent overlapping %d\n", ret);
+		return ret;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	return 0;
+}
+
+void btrfs_test_free_space_cache(void)
+{
+	struct btrfs_block_group_cache *cache;
+
+	test_msg("Running btrfs free space cache tests\n");
+
+	cache = init_test_block_group();
+	if (!cache) {
+		test_msg("Couldn't run the tests\n");
+		return;
+	}
+
+	if (test_extents(cache))
+		goto out;
+	if (test_bitmaps(cache))
+		goto out;
+	if (test_bitmaps_and_extents(cache))
+		goto out;
+out:
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	kfree(cache->free_space_ctl);
+	kfree(cache);
+	test_msg("Free space cache tests finished\n");
+}
+#undef test_msg
+#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+void btrfs_test_free_space_cache(void) {}
+#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 8f2613f779e..894116b7130 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -54,6 +54,8 @@ int create_free_space_inode(struct btrfs_root *root,
 			    struct btrfs_block_group_cache *block_group,
 			    struct btrfs_path *path);
 
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+				       struct btrfs_block_rsv *rsv);
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_trans_handle *trans,
 				    struct btrfs_path *path,
@@ -110,4 +112,7 @@ int btrfs_return_cluster_to_free_space(
 			       struct btrfs_free_cluster *cluster);
 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 			   u64 *trimmed, u64 start, u64 end, u64 minlen);
+
+void btrfs_test_free_space_cache(void);
+
 #endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 48b8fda9313..e0b7034d634 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -183,10 +183,11 @@ int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
 	return -ENOENT;
 }
 
-int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root,
-			   const char *name, int name_len,
-			   u64 inode_objectid, u64 ref_objectid, u64 *index)
+static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  const char *name, int name_len,
+				  u64 inode_objectid, u64 ref_objectid,
+				  u64 *index)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -246,7 +247,7 @@ int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
 	memmove_extent_buffer(leaf, ptr, ptr + del_len,
 			      item_size - (ptr + del_len - item_start));
 
-	btrfs_truncate_item(trans, root, path, item_size - del_len, 1);
+	btrfs_truncate_item(root, path, item_size - del_len, 1);
 
 out:
 	btrfs_free_path(path);
@@ -309,7 +310,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
 	memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
 			      item_size - (ptr + sub_item_len - item_start));
-	btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1);
+	btrfs_truncate_item(root, path, item_size - sub_item_len, 1);
 out:
 	btrfs_free_path(path);
 
@@ -361,7 +362,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
 						   name, name_len, NULL))
 			goto out;
 
-		btrfs_extend_item(trans, root, path, ins_len);
+		btrfs_extend_item(root, path, ins_len);
 		ret = 0;
 	}
 	if (ret < 0)
@@ -417,7 +418,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 			goto out;
 
 		old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
-		btrfs_extend_item(trans, root, path, ins_len);
+		btrfs_extend_item(root, path, ins_len);
 		ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				     struct btrfs_inode_ref);
 		ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d26f67a59e3..2c66ddbbe67 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -429,11 +429,12 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 	num_bytes = trans->bytes_reserved;
 	/*
 	 * 1 item for inode item insertion if need
-	 * 3 items for inode item update (in the worst case)
+	 * 4 items for inode item update (in the worst case)
+	 * 1 items for slack space if we need do truncation
 	 * 1 item for free space object
 	 * 3 items for pre-allocation
 	 */
-	trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
+	trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10);
 	ret = btrfs_block_rsv_add(root, trans->block_rsv,
 				  trans->bytes_reserved,
 				  BTRFS_RESERVE_NO_FLUSH);
@@ -468,7 +469,8 @@ again:
 	if (i_size_read(inode) > 0) {
 		ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
 		if (ret) {
-			btrfs_abort_transaction(trans, root, ret);
+			if (ret != -ENOSPC)
+				btrfs_abort_transaction(trans, root, ret);
 			goto out_put;
 		}
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cc93b23ca35..6d1b93c8aaf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,6 +32,7 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
+#include <linux/aio.h>
 #include <linux/bit_spinlock.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
@@ -39,12 +40,14 @@
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/mount.h>
+#include <linux/btrfs.h>
+#include <linux/blkdev.h>
+#include <linux/posix_acl_xattr.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ioctl.h"
 #include "print-tree.h"
 #include "ordered-data.h"
 #include "xattr.h"
@@ -54,6 +57,8 @@
 #include "locking.h"
 #include "free-space-cache.h"
 #include "inode-map.h"
+#include "backref.h"
+#include "hash.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -98,7 +103,10 @@ static noinline int cow_file_range(struct inode *inode,
 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 					   u64 len, u64 orig_start,
 					   u64 block_start, u64 block_len,
-					   u64 orig_block_len, int type);
+					   u64 orig_block_len, u64 ram_bytes,
+					   int type);
+
+static int btrfs_dirty_inode(struct inode *inode);
 
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
 				     struct inode *inode,  struct inode *dir,
@@ -231,8 +239,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 	u64 isize = i_size_read(inode);
 	u64 actual_end = min(end + 1, isize);
 	u64 inline_len = actual_end - start;
-	u64 aligned_end = (end + root->sectorsize - 1) &
-			~((u64)root->sectorsize - 1);
+	u64 aligned_end = ALIGN(end, root->sectorsize);
 	u64 data_len = inline_len;
 	int ret;
 
@@ -265,6 +272,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 		return 1;
 	}
 
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
 	btrfs_delalloc_release_metadata(inode, end + 1 - start);
 	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 	return 0;
@@ -351,6 +359,7 @@ static noinline int compress_file_range(struct inode *inode,
 	int i;
 	int will_compress;
 	int compress_type = root->fs_info->compress_type;
+	int redirty = 0;
 
 	/* if this is a small write inside eof, kick off a defrag */
 	if ((end - start + 1) < 16 * 1024 &&
@@ -389,7 +398,7 @@ again:
 	 * a compressed extent to 128k.
 	 */
 	total_compressed = min(total_compressed, max_uncompressed);
-	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+	num_bytes = ALIGN(end - start + 1, blocksize);
 	num_bytes = max(blocksize,  num_bytes);
 	total_in = 0;
 	ret = 0;
@@ -413,6 +422,17 @@ again:
 		if (BTRFS_I(inode)->force_compress)
 			compress_type = BTRFS_I(inode)->force_compress;
 
+		/*
+		 * we need to call clear_page_dirty_for_io on each
+		 * page in the range.  Otherwise applications with the file
+		 * mmap'd can wander in and change the page contents while
+		 * we are compressing them.
+		 *
+		 * If the compression fails for any reason, we set the pages
+		 * dirty again later on.
+		 */
+		extent_range_clear_dirty_for_io(inode, start, end);
+		redirty = 1;
 		ret = btrfs_compress_pages(compress_type,
 					   inode->i_mapping, start,
 					   total_compressed, pages,
@@ -488,15 +508,13 @@ cont:
 		 * up to a block size boundary so the allocator does sane
 		 * things
 		 */
-		total_compressed = (total_compressed + blocksize - 1) &
-			~(blocksize - 1);
+		total_compressed = ALIGN(total_compressed, blocksize);
 
 		/*
 		 * one last check to make sure the compression is really a
 		 * win, compare the page count read with the blocks on disk
 		 */
-		total_in = (total_in + PAGE_CACHE_SIZE - 1) &
-			~(PAGE_CACHE_SIZE - 1);
+		total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
 		if (total_compressed >= total_in) {
 			will_compress = 0;
 		} else {
@@ -554,6 +572,8 @@ cleanup_and_bail_uncompressed:
 			__set_page_dirty_nobuffers(locked_page);
 			/* unlocked later on in the async handlers */
 		}
+		if (redirty)
+			extent_range_redirty_for_io(inode, start, end);
 		add_async_extent(async_cow, start, end - start + 1,
 				 0, NULL, 0, BTRFS_COMPRESS_NONE);
 		*num_added += 1;
@@ -608,7 +628,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 	if (list_empty(&async_cow->extents))
 		return 0;
 
-
+again:
 	while (!list_empty(&async_cow->extents)) {
 		async_extent = list_entry(async_cow->extents.next,
 					  struct async_extent, list);
@@ -648,6 +668,8 @@ retry:
 						  async_extent->ram_size - 1,
 						  btrfs_get_extent,
 						  WB_SYNC_ALL);
+			else if (ret)
+				unlock_page(async_cow->locked_page);
 			kfree(async_extent);
 			cond_resched();
 			continue;
@@ -672,6 +694,7 @@ retry:
 
 		if (ret) {
 			int i;
+
 			for (i = 0; i < async_extent->nr_pages; i++) {
 				WARN_ON(async_extent->pages[i]->mapping);
 				page_cache_release(async_extent->pages[i]);
@@ -679,12 +702,14 @@ retry:
 			kfree(async_extent->pages);
 			async_extent->nr_pages = 0;
 			async_extent->pages = NULL;
-			unlock_extent(io_tree, async_extent->start,
-				      async_extent->start +
-				      async_extent->ram_size - 1);
-			if (ret == -ENOSPC)
+
+			if (ret == -ENOSPC) {
+				unlock_extent(io_tree, async_extent->start,
+					      async_extent->start +
+					      async_extent->ram_size - 1);
 				goto retry;
-			goto out_free; /* JDM: Requeue? */
+			}
+			goto out_free;
 		}
 
 		/*
@@ -696,14 +721,20 @@ retry:
 					async_extent->ram_size - 1, 0);
 
 		em = alloc_extent_map();
-		BUG_ON(!em); /* -ENOMEM */
+		if (!em) {
+			ret = -ENOMEM;
+			goto out_free_reserve;
+		}
 		em->start = async_extent->start;
 		em->len = async_extent->ram_size;
 		em->orig_start = em->start;
+		em->mod_start = em->start;
+		em->mod_len = em->len;
 
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
 		em->orig_block_len = ins.offset;
+		em->ram_bytes = async_extent->ram_size;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		em->compress_type = async_extent->compress_type;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
@@ -712,10 +743,7 @@ retry:
 
 		while (1) {
 			write_lock(&em_tree->lock);
-			ret = add_extent_mapping(em_tree, em);
-			if (!ret)
-				list_move(&em->list,
-					  &em_tree->modified_extents);
+			ret = add_extent_mapping(em_tree, em, 1);
 			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
@@ -726,6 +754,9 @@ retry:
 						async_extent->ram_size - 1, 0);
 		}
 
+		if (ret)
+			goto out_free_reserve;
+
 		ret = btrfs_add_ordered_extent_compress(inode,
 						async_extent->start,
 						ins.objectid,
@@ -733,7 +764,8 @@ retry:
 						ins.offset,
 						BTRFS_ORDERED_COMPRESSED,
 						async_extent->compress_type);
-		BUG_ON(ret); /* -ENOMEM */
+		if (ret)
+			goto out_free_reserve;
 
 		/*
 		 * clear dirty, set writeback and unlock the pages.
@@ -754,18 +786,30 @@ retry:
 				    ins.objectid,
 				    ins.offset, async_extent->pages,
 				    async_extent->nr_pages);
-
-		BUG_ON(ret); /* -ENOMEM */
 		alloc_hint = ins.objectid + ins.offset;
 		kfree(async_extent);
+		if (ret)
+			goto out;
 		cond_resched();
 	}
 	ret = 0;
 out:
 	return ret;
+out_free_reserve:
+	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
 out_free:
+	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+				     async_extent->start,
+				     async_extent->start +
+				     async_extent->ram_size - 1,
+				     NULL, EXTENT_CLEAR_UNLOCK_PAGE |
+				     EXTENT_CLEAR_UNLOCK |
+				     EXTENT_CLEAR_DELALLOC |
+				     EXTENT_CLEAR_DIRTY |
+				     EXTENT_SET_WRITEBACK |
+				     EXTENT_END_WRITEBACK);
 	kfree(async_extent);
-	goto out;
+	goto again;
 }
 
 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
@@ -834,7 +878,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
 
 	BUG_ON(btrfs_is_free_space_inode(inode));
 
-	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+	num_bytes = ALIGN(end - start + 1, blocksize);
 	num_bytes = max(blocksize,  num_bytes);
 	disk_num_bytes = num_bytes;
 
@@ -887,25 +931,28 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
 		}
 
 		em = alloc_extent_map();
-		BUG_ON(!em); /* -ENOMEM */
+		if (!em) {
+			ret = -ENOMEM;
+			goto out_reserve;
+		}
 		em->start = start;
 		em->orig_start = em->start;
 		ram_size = ins.offset;
 		em->len = ins.offset;
+		em->mod_start = em->start;
+		em->mod_len = em->len;
 
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
 		em->orig_block_len = ins.offset;
+		em->ram_bytes = ram_size;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 		em->generation = -1;
 
 		while (1) {
 			write_lock(&em_tree->lock);
-			ret = add_extent_mapping(em_tree, em);
-			if (!ret)
-				list_move(&em->list,
-					  &em_tree->modified_extents);
+			ret = add_extent_mapping(em_tree, em, 1);
 			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
@@ -914,11 +961,14 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
 			btrfs_drop_extent_cache(inode, start,
 						start + ram_size - 1, 0);
 		}
+		if (ret)
+			goto out_reserve;
 
 		cur_alloc_size = ins.offset;
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
 					       ram_size, cur_alloc_size, 0);
-		BUG_ON(ret); /* -ENOMEM */
+		if (ret)
+			goto out_reserve;
 
 		if (root->root_key.objectid ==
 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
@@ -926,7 +976,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
 						      cur_alloc_size);
 			if (ret) {
 				btrfs_abort_transaction(trans, root, ret);
-				goto out_unlock;
+				goto out_reserve;
 			}
 		}
 
@@ -955,6 +1005,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
 out:
 	return ret;
 
+out_reserve:
+	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
 out_unlock:
 	extent_clear_unlock_delalloc(inode,
 		     &BTRFS_I(inode)->io_tree,
@@ -1158,6 +1210,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 	u64 disk_bytenr;
 	u64 num_bytes;
 	u64 disk_num_bytes;
+	u64 ram_bytes;
 	int extent_type;
 	int ret, err;
 	int type;
@@ -1254,6 +1307,7 @@ next_slot:
 				    struct btrfs_file_extent_item);
 		extent_type = btrfs_file_extent_type(leaf, fi);
 
+		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
 		if (extent_type == BTRFS_FILE_EXTENT_REG ||
 		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
@@ -1337,16 +1391,16 @@ out_check:
 			em->block_len = num_bytes;
 			em->block_start = disk_bytenr;
 			em->orig_block_len = disk_num_bytes;
+			em->ram_bytes = ram_bytes;
 			em->bdev = root->fs_info->fs_devices->latest_bdev;
+			em->mod_start = em->start;
+			em->mod_len = em->len;
 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
 			set_bit(EXTENT_FLAG_FILLING, &em->flags);
 			em->generation = -1;
 			while (1) {
 				write_lock(&em_tree->lock);
-				ret = add_extent_mapping(em_tree, em);
-				if (!ret)
-					list_move(&em->list,
-						  &em_tree->modified_extents);
+				ret = add_extent_mapping(em_tree, em, 1);
 				write_unlock(&em_tree->lock);
 				if (ret != -EEXIST) {
 					free_extent_map(em);
@@ -1481,13 +1535,53 @@ static void btrfs_merge_extent_hook(struct inode *inode,
 	spin_unlock(&BTRFS_I(inode)->lock);
 }
 
+static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+				      struct inode *inode)
+{
+	spin_lock(&root->delalloc_lock);
+	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+			      &root->delalloc_inodes);
+		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			&BTRFS_I(inode)->runtime_flags);
+		root->nr_delalloc_inodes++;
+		if (root->nr_delalloc_inodes == 1) {
+			spin_lock(&root->fs_info->delalloc_root_lock);
+			BUG_ON(!list_empty(&root->delalloc_root));
+			list_add_tail(&root->delalloc_root,
+				      &root->fs_info->delalloc_roots);
+			spin_unlock(&root->fs_info->delalloc_root_lock);
+		}
+	}
+	spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+				     struct inode *inode)
+{
+	spin_lock(&root->delalloc_lock);
+	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+		list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			  &BTRFS_I(inode)->runtime_flags);
+		root->nr_delalloc_inodes--;
+		if (!root->nr_delalloc_inodes) {
+			spin_lock(&root->fs_info->delalloc_root_lock);
+			BUG_ON(list_empty(&root->delalloc_root));
+			list_del_init(&root->delalloc_root);
+			spin_unlock(&root->fs_info->delalloc_root_lock);
+		}
+	}
+	spin_unlock(&root->delalloc_lock);
+}
+
 /*
  * extent_io.c set_bit_hook, used to track delayed allocation
  * bytes in this file, and to maintain the list of inodes that
  * have pending delalloc work to be done.
  */
 static void btrfs_set_bit_hook(struct inode *inode,
-			       struct extent_state *state, int *bits)
+			       struct extent_state *state, unsigned long *bits)
 {
 
 	/*
@@ -1508,14 +1602,14 @@ static void btrfs_set_bit_hook(struct inode *inode,
 			spin_unlock(&BTRFS_I(inode)->lock);
 		}
 
-		spin_lock(&root->fs_info->delalloc_lock);
+		__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
+				     root->fs_info->delalloc_batch);
+		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->delalloc_bytes += len;
-		root->fs_info->delalloc_bytes += len;
-		if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
-				      &root->fs_info->delalloc_inodes);
-		}
-		spin_unlock(&root->fs_info->delalloc_lock);
+		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+					 &BTRFS_I(inode)->runtime_flags))
+			btrfs_add_delalloc_inodes(root, inode);
+		spin_unlock(&BTRFS_I(inode)->lock);
 	}
 }
 
@@ -1523,7 +1617,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
  * extent_io.c clear_bit_hook, see set_bit_hook for why
  */
 static void btrfs_clear_bit_hook(struct inode *inode,
-				 struct extent_state *state, int *bits)
+				 struct extent_state *state,
+				 unsigned long *bits)
 {
 	/*
 	 * set_bit and clear bit hooks normally require _irqsave/restore
@@ -1547,18 +1642,18 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 			btrfs_delalloc_release_metadata(inode, len);
 
 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
-		    && do_list)
+		    && do_list && !(state->state & EXTENT_NORESERVE))
 			btrfs_free_reserved_data_space(inode, len);
 
-		spin_lock(&root->fs_info->delalloc_lock);
-		root->fs_info->delalloc_bytes -= len;
+		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
+				     root->fs_info->delalloc_batch);
+		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->delalloc_bytes -= len;
-
 		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
-		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
-		}
-		spin_unlock(&root->fs_info->delalloc_lock);
+		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			     &BTRFS_I(inode)->runtime_flags))
+			btrfs_del_delalloc_inode(root, inode);
+		spin_unlock(&BTRFS_I(inode)->lock);
 	}
 }
 
@@ -1566,7 +1661,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
  * we don't create bios that span stripes or chunks
  */
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio,
 			 unsigned long bio_flags)
 {
@@ -1581,7 +1676,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 
 	length = bio->bi_size;
 	map_length = length;
-	ret = btrfs_map_block(root->fs_info, READ, logical,
+	ret = btrfs_map_block(root->fs_info, rw, logical,
 			      &map_length, NULL, 0);
 	/* Will always return 0 with map_multi == NULL */
 	BUG_ON(ret < 0);
@@ -1704,8 +1799,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 	struct btrfs_ordered_sum *sum;
 
 	list_for_each_entry(sum, list, list) {
+		trans->adding_csums = 1;
 		btrfs_csum_file_blocks(trans,
 		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
+		trans->adding_csums = 0;
 	}
 	return 0;
 }
@@ -1892,6 +1989,638 @@ out:
 	return ret;
 }
 
+/* snapshot-aware defrag */
+struct sa_defrag_extent_backref {
+	struct rb_node node;
+	struct old_sa_defrag_extent *old;
+	u64 root_id;
+	u64 inum;
+	u64 file_pos;
+	u64 extent_offset;
+	u64 num_bytes;
+	u64 generation;
+};
+
+struct old_sa_defrag_extent {
+	struct list_head list;
+	struct new_sa_defrag_extent *new;
+
+	u64 extent_offset;
+	u64 bytenr;
+	u64 offset;
+	u64 len;
+	int count;
+};
+
+struct new_sa_defrag_extent {
+	struct rb_root root;
+	struct list_head head;
+	struct btrfs_path *path;
+	struct inode *inode;
+	u64 file_pos;
+	u64 len;
+	u64 bytenr;
+	u64 disk_len;
+	u8 compress_type;
+};
+
+static int backref_comp(struct sa_defrag_extent_backref *b1,
+			struct sa_defrag_extent_backref *b2)
+{
+	if (b1->root_id < b2->root_id)
+		return -1;
+	else if (b1->root_id > b2->root_id)
+		return 1;
+
+	if (b1->inum < b2->inum)
+		return -1;
+	else if (b1->inum > b2->inum)
+		return 1;
+
+	if (b1->file_pos < b2->file_pos)
+		return -1;
+	else if (b1->file_pos > b2->file_pos)
+		return 1;
+
+	/*
+	 * [------------------------------] ===> (a range of space)
+	 *     |<--->|   |<---->| =============> (fs/file tree A)
+	 * |<---------------------------->| ===> (fs/file tree B)
+	 *
+	 * A range of space can refer to two file extents in one tree while
+	 * refer to only one file extent in another tree.
+	 *
+	 * So we may process a disk offset more than one time(two extents in A)
+	 * and locate at the same extent(one extent in B), then insert two same
+	 * backrefs(both refer to the extent in B).
+	 */
+	return 0;
+}
+
+static void backref_insert(struct rb_root *root,
+			   struct sa_defrag_extent_backref *backref)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct sa_defrag_extent_backref *entry;
+	int ret;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
+
+		ret = backref_comp(backref, entry);
+		if (ret < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&backref->node, parent, p);
+	rb_insert_color(&backref->node, root);
+}
+
+/*
+ * Note the backref might has changed, and in this case we just return 0.
+ */
+static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
+				       void *ctx)
+{
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_fs_info *fs_info;
+	struct old_sa_defrag_extent *old = ctx;
+	struct new_sa_defrag_extent *new = old->new;
+	struct btrfs_path *path = new->path;
+	struct btrfs_key key;
+	struct btrfs_root *root;
+	struct sa_defrag_extent_backref *backref;
+	struct extent_buffer *leaf;
+	struct inode *inode = new->inode;
+	int slot;
+	int ret;
+	u64 extent_offset;
+	u64 num_bytes;
+
+	if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
+	    inum == btrfs_ino(inode))
+		return 0;
+
+	key.objectid = root_id;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	fs_info = BTRFS_I(inode)->root->fs_info;
+	root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(root)) {
+		if (PTR_ERR(root) == -ENOENT)
+			return 0;
+		WARN_ON(1);
+		pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
+			 inum, offset, root_id);
+		return PTR_ERR(root);
+	}
+
+	key.objectid = inum;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	if (offset > (u64)-1 << 32)
+		key.offset = 0;
+	else
+		key.offset = offset;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		WARN_ON(1);
+		return ret;
+	}
+
+	while (1) {
+		cond_resched();
+
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0) {
+				goto out;
+			} else if (ret > 0) {
+				ret = 0;
+				goto out;
+			}
+			continue;
+		}
+
+		path->slots[0]++;
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		if (key.objectid > inum)
+			goto out;
+
+		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		extent = btrfs_item_ptr(leaf, slot,
+					struct btrfs_file_extent_item);
+
+		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
+			continue;
+
+		extent_offset = btrfs_file_extent_offset(leaf, extent);
+		if (key.offset - extent_offset != offset)
+			continue;
+
+		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
+		if (extent_offset >= old->extent_offset + old->offset +
+		    old->len || extent_offset + num_bytes <=
+		    old->extent_offset + old->offset)
+			continue;
+
+		break;
+	}
+
+	backref = kmalloc(sizeof(*backref), GFP_NOFS);
+	if (!backref) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	backref->root_id = root_id;
+	backref->inum = inum;
+	backref->file_pos = offset + extent_offset;
+	backref->num_bytes = num_bytes;
+	backref->extent_offset = extent_offset;
+	backref->generation = btrfs_file_extent_generation(leaf, extent);
+	backref->old = old;
+	backref_insert(&new->root, backref);
+	old->count++;
+out:
+	btrfs_release_path(path);
+	WARN_ON(ret);
+	return ret;
+}
+
+static noinline bool record_extent_backrefs(struct btrfs_path *path,
+				   struct new_sa_defrag_extent *new)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
+	struct old_sa_defrag_extent *old, *tmp;
+	int ret;
+
+	new->path = path;
+
+	list_for_each_entry_safe(old, tmp, &new->head, list) {
+		ret = iterate_inodes_from_logical(old->bytenr, fs_info,
+						  path, record_one_backref,
+						  old);
+		BUG_ON(ret < 0 && ret != -ENOENT);
+
+		/* no backref to be processed for this extent */
+		if (!old->count) {
+			list_del(&old->list);
+			kfree(old);
+		}
+	}
+
+	if (list_empty(&new->head))
+		return false;
+
+	return true;
+}
+
+static int relink_is_mergable(struct extent_buffer *leaf,
+			      struct btrfs_file_extent_item *fi,
+			      u64 disk_bytenr)
+{
+	if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr)
+		return 0;
+
+	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+		return 0;
+
+	if (btrfs_file_extent_compression(leaf, fi) ||
+	    btrfs_file_extent_encryption(leaf, fi) ||
+	    btrfs_file_extent_other_encoding(leaf, fi))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Note the backref might has changed, and in this case we just return 0.
+ */
+static noinline int relink_extent_backref(struct btrfs_path *path,
+				 struct sa_defrag_extent_backref *prev,
+				 struct sa_defrag_extent_backref *backref)
+{
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_file_extent_item *item;
+	struct btrfs_ordered_extent *ordered;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct old_sa_defrag_extent *old = backref->old;
+	struct new_sa_defrag_extent *new = old->new;
+	struct inode *src_inode = new->inode;
+	struct inode *inode;
+	struct extent_state *cached = NULL;
+	int ret = 0;
+	u64 start;
+	u64 len;
+	u64 lock_start;
+	u64 lock_end;
+	bool merge = false;
+	int index;
+
+	if (prev && prev->root_id == backref->root_id &&
+	    prev->inum == backref->inum &&
+	    prev->file_pos + prev->num_bytes == backref->file_pos)
+		merge = true;
+
+	/* step 1: get root */
+	key.objectid = backref->root_id;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	fs_info = BTRFS_I(src_inode)->root->fs_info;
+	index = srcu_read_lock(&fs_info->subvol_srcu);
+
+	root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(root)) {
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
+		if (PTR_ERR(root) == -ENOENT)
+			return 0;
+		return PTR_ERR(root);
+	}
+
+	/* step 2: get inode */
+	key.objectid = backref->inum;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+	if (IS_ERR(inode)) {
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
+		return 0;
+	}
+
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+	/* step 3: relink backref */
+	lock_start = backref->file_pos;
+	lock_end = backref->file_pos + backref->num_bytes - 1;
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
+			 0, &cached);
+
+	ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
+	if (ordered) {
+		btrfs_put_ordered_extent(ordered);
+		goto out_unlock;
+	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_unlock;
+	}
+
+	key.objectid = backref->inum;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = backref->file_pos;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		goto out_free_path;
+	} else if (ret > 0) {
+		ret = 0;
+		goto out_free_path;
+	}
+
+	extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				struct btrfs_file_extent_item);
+
+	if (btrfs_file_extent_generation(path->nodes[0], extent) !=
+	    backref->generation)
+		goto out_free_path;
+
+	btrfs_release_path(path);
+
+	start = backref->file_pos;
+	if (backref->extent_offset < old->extent_offset + old->offset)
+		start += old->extent_offset + old->offset -
+			 backref->extent_offset;
+
+	len = min(backref->extent_offset + backref->num_bytes,
+		  old->extent_offset + old->offset + old->len);
+	len -= max(backref->extent_offset, old->extent_offset + old->offset);
+
+	ret = btrfs_drop_extents(trans, root, inode, start,
+				 start + len, 1);
+	if (ret)
+		goto out_free_path;
+again:
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = start;
+
+	path->leave_spinning = 1;
+	if (merge) {
+		struct btrfs_file_extent_item *fi;
+		u64 extent_len;
+		struct btrfs_key found_key;
+
+		ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+		if (ret < 0)
+			goto out_free_path;
+
+		path->slots[0]--;
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
+
+		if (relink_is_mergable(leaf, fi, new->bytenr) &&
+		    extent_len + found_key.offset == start) {
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							extent_len + len);
+			btrfs_mark_buffer_dirty(leaf);
+			inode_add_bytes(inode, len);
+
+			ret = 1;
+			goto out_free_path;
+		} else {
+			merge = false;
+			btrfs_release_path(path);
+			goto again;
+		}
+	}
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+					sizeof(*extent));
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_free_path;
+	}
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_file_extent_item);
+	btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
+	btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
+	btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
+	btrfs_set_file_extent_num_bytes(leaf, item, len);
+	btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
+	btrfs_set_file_extent_generation(leaf, item, trans->transid);
+	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+	btrfs_set_file_extent_compression(leaf, item, new->compress_type);
+	btrfs_set_file_extent_encryption(leaf, item, 0);
+	btrfs_set_file_extent_other_encoding(leaf, item, 0);
+
+	btrfs_mark_buffer_dirty(leaf);
+	inode_add_bytes(inode, len);
+	btrfs_release_path(path);
+
+	ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
+			new->disk_len, 0,
+			backref->root_id, backref->inum,
+			new->file_pos, 0);	/* start - extent_offset */
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_free_path;
+	}
+
+	ret = 1;
+out_free_path:
+	btrfs_release_path(path);
+	path->leave_spinning = 0;
+	btrfs_end_transaction(trans, root);
+out_unlock:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
+			     &cached, GFP_NOFS);
+	iput(inode);
+	return ret;
+}
+
+static void relink_file_extents(struct new_sa_defrag_extent *new)
+{
+	struct btrfs_path *path;
+	struct old_sa_defrag_extent *old, *tmp;
+	struct sa_defrag_extent_backref *backref;
+	struct sa_defrag_extent_backref *prev = NULL;
+	struct inode *inode;
+	struct btrfs_root *root;
+	struct rb_node *node;
+	int ret;
+
+	inode = new->inode;
+	root = BTRFS_I(inode)->root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return;
+
+	if (!record_extent_backrefs(path, new)) {
+		btrfs_free_path(path);
+		goto out;
+	}
+	btrfs_release_path(path);
+
+	while (1) {
+		node = rb_first(&new->root);
+		if (!node)
+			break;
+		rb_erase(node, &new->root);
+
+		backref = rb_entry(node, struct sa_defrag_extent_backref, node);
+
+		ret = relink_extent_backref(path, prev, backref);
+		WARN_ON(ret < 0);
+
+		kfree(prev);
+
+		if (ret == 1)
+			prev = backref;
+		else
+			prev = NULL;
+		cond_resched();
+	}
+	kfree(prev);
+
+	btrfs_free_path(path);
+
+	list_for_each_entry_safe(old, tmp, &new->head, list) {
+		list_del(&old->list);
+		kfree(old);
+	}
+out:
+	atomic_dec(&root->fs_info->defrag_running);
+	wake_up(&root->fs_info->transaction_wait);
+
+	kfree(new);
+}
+
+static struct new_sa_defrag_extent *
+record_old_file_extents(struct inode *inode,
+			struct btrfs_ordered_extent *ordered)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct old_sa_defrag_extent *old, *tmp;
+	struct new_sa_defrag_extent *new;
+	int ret;
+
+	new = kmalloc(sizeof(*new), GFP_NOFS);
+	if (!new)
+		return NULL;
+
+	new->inode = inode;
+	new->file_pos = ordered->file_offset;
+	new->len = ordered->len;
+	new->bytenr = ordered->start;
+	new->disk_len = ordered->disk_len;
+	new->compress_type = ordered->compress_type;
+	new->root = RB_ROOT;
+	INIT_LIST_HEAD(&new->head);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		goto out_kfree;
+
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = new->file_pos;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out_free_path;
+	if (ret > 0 && path->slots[0] > 0)
+		path->slots[0]--;
+
+	/* find out all the old extents for the file range */
+	while (1) {
+		struct btrfs_file_extent_item *extent;
+		struct extent_buffer *l;
+		int slot;
+		u64 num_bytes;
+		u64 offset;
+		u64 end;
+		u64 disk_bytenr;
+		u64 extent_offset;
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+
+		if (slot >= btrfs_header_nritems(l)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out_free_list;
+			else if (ret > 0)
+				break;
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(l, &key, slot);
+
+		if (key.objectid != btrfs_ino(inode))
+			break;
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			break;
+		if (key.offset >= new->file_pos + new->len)
+			break;
+
+		extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
+
+		num_bytes = btrfs_file_extent_num_bytes(l, extent);
+		if (key.offset + num_bytes < new->file_pos)
+			goto next;
+
+		disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
+		if (!disk_bytenr)
+			goto next;
+
+		extent_offset = btrfs_file_extent_offset(l, extent);
+
+		old = kmalloc(sizeof(*old), GFP_NOFS);
+		if (!old)
+			goto out_free_list;
+
+		offset = max(new->file_pos, key.offset);
+		end = min(new->file_pos + new->len, key.offset + num_bytes);
+
+		old->bytenr = disk_bytenr;
+		old->extent_offset = extent_offset;
+		old->offset = offset - key.offset;
+		old->len = end - offset;
+		old->new = new;
+		old->count = 0;
+		list_add_tail(&old->list, &new->head);
+next:
+		path->slots[0]++;
+		cond_resched();
+	}
+
+	btrfs_free_path(path);
+	atomic_inc(&root->fs_info->defrag_running);
+
+	return new;
+
+out_free_list:
+	list_for_each_entry_safe(old, tmp, &new->head, list) {
+		list_del(&old->list);
+		kfree(old);
+	}
+out_free_path:
+	btrfs_free_path(path);
+out_kfree:
+	kfree(new);
+	return NULL;
+}
+
 /*
  * helper function for btrfs_finish_ordered_io, this
  * just reads in some of the csum leaves to prime them into ram
@@ -1909,6 +2638,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 	struct btrfs_trans_handle *trans = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_state *cached_state = NULL;
+	struct new_sa_defrag_extent *new = NULL;
 	int compress_type = 0;
 	int ret;
 	bool nolock;
@@ -1943,6 +2673,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 			 ordered_extent->file_offset + ordered_extent->len - 1,
 			 0, &cached_state);
 
+	ret = test_range_bit(io_tree, ordered_extent->file_offset,
+			ordered_extent->file_offset + ordered_extent->len - 1,
+			EXTENT_DEFRAG, 1, cached_state);
+	if (ret) {
+		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+		if (last_snapshot >= BTRFS_I(inode)->generation)
+			/* the inode is shared */
+			new = record_old_file_extents(inode, ordered_extent);
+
+		clear_extent_bit(io_tree, ordered_extent->file_offset,
+			ordered_extent->file_offset + ordered_extent->len - 1,
+			EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
+	}
+
 	if (nolock)
 		trans = btrfs_join_transaction_nolock(root);
 	else
@@ -2001,17 +2745,33 @@ out:
 	if (trans)
 		btrfs_end_transaction(trans, root);
 
-	if (ret)
+	if (ret) {
 		clear_extent_uptodate(io_tree, ordered_extent->file_offset,
 				      ordered_extent->file_offset +
 				      ordered_extent->len - 1, NULL, GFP_NOFS);
 
+		/*
+		 * If the ordered extent had an IOERR or something else went
+		 * wrong we need to return the space for this ordered extent
+		 * back to the allocator.
+		 */
+		if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
+			btrfs_free_reserved_extent(root, ordered_extent->start,
+						   ordered_extent->disk_len);
+	}
+
+
 	/*
 	 * This needs to be done to make sure anybody waiting knows we are done
 	 * updating everything for this ordered extent.
 	 */
 	btrfs_remove_ordered_extent(inode, ordered_extent);
 
+	/* for snapshot-aware defrag */
+	if (new)
+		relink_file_extents(new);
+
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
 	/* once for the tree */
@@ -2062,7 +2822,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state, int mirror)
 {
-	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
+	size_t offset = start - page_offset(page);
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	char *kaddr;
@@ -2070,6 +2830,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	int ret;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u32 csum = ~(u32)0;
+	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+	                              DEFAULT_RATELIMIT_BURST);
 
 	if (PageChecked(page)) {
 		ClearPageChecked(page);
@@ -2096,7 +2858,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	if (ret)
 		goto zeroit;
 
-	csum = btrfs_csum_data(root, kaddr + offset, csum,  end - start + 1);
+	csum = btrfs_csum_data(kaddr + offset, csum,  end - start + 1);
 	btrfs_csum_final(csum, (char *)&csum);
 	if (csum != private)
 		goto zeroit;
@@ -2106,11 +2868,11 @@ good:
 	return 0;
 
 zeroit:
-	printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u "
-		       "private %llu\n",
-		       (unsigned long long)btrfs_ino(page->mapping->host),
-		       (unsigned long long)start, csum,
-		       (unsigned long long)private);
+	if (__ratelimit(&_rs))
+		btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u private %llu",
+			(unsigned long long)btrfs_ino(page->mapping->host),
+			(unsigned long long)start, csum,
+			(unsigned long long)private);
 	memset(kaddr + offset, 1, end - start + 1);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr);
@@ -2167,11 +2929,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 	}
 }
 
-enum btrfs_orphan_cleanup_state {
-	ORPHAN_CLEANUP_STARTED	= 1,
-	ORPHAN_CLEANUP_DONE	= 2,
-};
-
 /*
  * This is called in transaction commit time. If there are no orphan
  * files in the subvolume, it removes orphan item and frees block_rsv
@@ -2301,7 +3058,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
  * We have done the truncate/delete so we can go ahead and remove the orphan
  * item for this particular inode.
  */
-int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
+static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
+			    struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int delete_item = 0;
@@ -2396,8 +3154,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		 */
 
 		if (found_key.offset == last_objectid) {
-			printk(KERN_ERR "btrfs: Error removing orphan entry, "
-			       "stopping orphan cleanup\n");
+			btrfs_err(root->fs_info,
+				"Error removing orphan entry, stopping orphan cleanup");
 			ret = -EINVAL;
 			goto out;
 		}
@@ -2454,8 +3212,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 				ret = PTR_ERR(trans);
 				goto out;
 			}
-			printk(KERN_ERR "auto deleting %Lu\n",
-			       found_key.objectid);
+			btrfs_debug(root->fs_info, "auto deleting %Lu",
+				found_key.objectid);
 			ret = btrfs_del_orphan_item(trans, root,
 						    found_key.objectid);
 			BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
@@ -2469,6 +3227,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		 */
 		set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
 			&BTRFS_I(inode)->runtime_flags);
+		atomic_inc(&root->orphan_inodes);
 
 		/* if we have links, this was a truncate, lets do that */
 		if (inode->i_nlink) {
@@ -2482,15 +3241,20 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 			/* 1 for the orphan item deletion. */
 			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
+				iput(inode);
 				ret = PTR_ERR(trans);
 				goto out;
 			}
 			ret = btrfs_orphan_add(trans, inode);
 			btrfs_end_transaction(trans, root);
-			if (ret)
+			if (ret) {
+				iput(inode);
 				goto out;
+			}
 
 			ret = btrfs_truncate(inode);
+			if (ret)
+				btrfs_orphan_del(NULL, inode);
 		} else {
 			nr_unlink++;
 		}
@@ -2516,13 +3280,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 	}
 
 	if (nr_unlink)
-		printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
+		btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
 	if (nr_truncate)
-		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
+		btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
 
 out:
 	if (ret)
-		printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret);
+		btrfs_crit(root->fs_info,
+			"could not do orphan cleanup %d", ret);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -2538,8 +3303,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 {
 	u32 nritems = btrfs_header_nritems(leaf);
 	struct btrfs_key found_key;
+	static u64 xattr_access = 0;
+	static u64 xattr_default = 0;
 	int scanned = 0;
 
+	if (!xattr_access) {
+		xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
+					strlen(POSIX_ACL_XATTR_ACCESS));
+		xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
+					strlen(POSIX_ACL_XATTR_DEFAULT));
+	}
+
 	slot++;
 	while (slot < nritems) {
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -2549,8 +3323,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 			return 0;
 
 		/* we found an xattr, assume we've got an acl */
-		if (found_key.type == BTRFS_XATTR_ITEM_KEY)
-			return 1;
+		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+			if (found_key.offset == xattr_access ||
+			    found_key.offset == xattr_default)
+				return 1;
+		}
 
 		/*
 		 * we found a key greater than an xattr key, there can't
@@ -2709,34 +3486,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 			    struct btrfs_inode_item *item,
 			    struct inode *inode)
 {
-	btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
-	btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
-	btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
-	btrfs_set_inode_mode(leaf, item, inode->i_mode);
-	btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
+
+	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
+	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
+	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
+				   &token);
+	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
 
-	btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
-			       inode->i_atime.tv_sec);
-	btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
-				inode->i_atime.tv_nsec);
+	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+				     inode->i_atime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+				      inode->i_atime.tv_nsec, &token);
 
-	btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
-			       inode->i_mtime.tv_sec);
-	btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
-				inode->i_mtime.tv_nsec);
+	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+				     inode->i_mtime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+				      inode->i_mtime.tv_nsec, &token);
 
-	btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
-			       inode->i_ctime.tv_sec);
-	btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
-				inode->i_ctime.tv_nsec);
+	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+				     inode->i_ctime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+				      inode->i_ctime.tv_nsec, &token);
 
-	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
-	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
-	btrfs_set_inode_sequence(leaf, item, inode->i_version);
-	btrfs_set_inode_transid(leaf, item, trans->transid);
-	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
-	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
-	btrfs_set_inode_block_group(leaf, item, 0);
+	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+				     &token);
+	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
+					 &token);
+	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
 }
 
 /*
@@ -2863,9 +3647,10 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
 				  dir_ino, &index);
 	if (ret) {
-		printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
-		       "inode %llu parent %llu\n", name_len, name,
-		       (unsigned long long)ino, (unsigned long long)dir_ino);
+		btrfs_info(root->fs_info,
+			"failed to delete reference to %.*s, inode %llu parent %llu",
+			name_len, name,
+			(unsigned long long)ino, (unsigned long long)dir_ino);
 		btrfs_abort_transaction(trans, root, ret);
 		goto err;
 	}
@@ -2887,6 +3672,8 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 					   dir, index);
 	if (ret == -ENOENT)
 		ret = 0;
+	else if (ret)
+		btrfs_abort_transaction(trans, root, ret);
 err:
 	btrfs_free_path(path);
 	if (ret)
@@ -2914,219 +3701,49 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	}
 	return ret;
 }
-		
-
-/* helper to check if there is any shared block in the path */
-static int check_path_shared(struct btrfs_root *root,
-			     struct btrfs_path *path)
-{
-	struct extent_buffer *eb;
-	int level;
-	u64 refs = 1;
-
-	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
-		int ret;
-
-		if (!path->nodes[level])
-			break;
-		eb = path->nodes[level];
-		if (!btrfs_block_can_be_shared(root, eb))
-			continue;
-		ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
-					       &refs, NULL);
-		if (refs > 1)
-			return 1;
-	}
-	return 0;
-}
 
 /*
  * helper to start transaction for unlink and rmdir.
  *
- * unlink and rmdir are special in btrfs, they do not always free space.
- * so in enospc case, we should make sure they will free space before
- * allowing them to use the global metadata reservation.
+ * unlink and rmdir are special in btrfs, they do not always free space, so
+ * if we cannot make our reservations the normal way try and see if there is
+ * plenty of slack room in the global reserve to migrate, otherwise we cannot
+ * allow the unlink to occur.
  */
-static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
-						       struct dentry *dentry)
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct btrfs_path *path;
-	struct btrfs_dir_item *di;
-	struct inode *inode = dentry->d_inode;
-	u64 index;
-	int check_link = 1;
-	int err = -ENOSPC;
 	int ret;
-	u64 ino = btrfs_ino(inode);
-	u64 dir_ino = btrfs_ino(dir);
 
 	/*
 	 * 1 for the possible orphan item
 	 * 1 for the dir item
 	 * 1 for the dir index
 	 * 1 for the inode ref
-	 * 1 for the inode ref in the tree log
-	 * 2 for the dir entries in the log
 	 * 1 for the inode
 	 */
-	trans = btrfs_start_transaction(root, 8);
+	trans = btrfs_start_transaction(root, 5);
 	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
 		return trans;
 
-	if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
-		return ERR_PTR(-ENOSPC);
-
-	/* check if there is someone else holds reference */
-	if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
-		return ERR_PTR(-ENOSPC);
-
-	if (atomic_read(&inode->i_count) > 2)
-		return ERR_PTR(-ENOSPC);
-
-	if (xchg(&root->fs_info->enospc_unlink, 1))
-		return ERR_PTR(-ENOSPC);
-
-	path = btrfs_alloc_path();
-	if (!path) {
-		root->fs_info->enospc_unlink = 0;
-		return ERR_PTR(-ENOMEM);
-	}
-
-	/* 1 for the orphan item */
-	trans = btrfs_start_transaction(root, 1);
-	if (IS_ERR(trans)) {
-		btrfs_free_path(path);
-		root->fs_info->enospc_unlink = 0;
-		return trans;
-	}
-
-	path->skip_locking = 1;
-	path->search_commit_root = 1;
-
-	ret = btrfs_lookup_inode(trans, root, path,
-				&BTRFS_I(dir)->location, 0);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
-	if (ret == 0) {
-		if (check_path_shared(root, path))
-			goto out;
-	} else {
-		check_link = 0;
-	}
-	btrfs_release_path(path);
-
-	ret = btrfs_lookup_inode(trans, root, path,
-				&BTRFS_I(inode)->location, 0);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
-	if (ret == 0) {
-		if (check_path_shared(root, path))
-			goto out;
-	} else {
-		check_link = 0;
-	}
-	btrfs_release_path(path);
+	if (PTR_ERR(trans) == -ENOSPC) {
+		u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
 
-	if (ret == 0 && S_ISREG(inode->i_mode)) {
-		ret = btrfs_lookup_file_extent(trans, root, path,
-					       ino, (u64)-1, 0);
-		if (ret < 0) {
-			err = ret;
-			goto out;
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans))
+			return trans;
+		ret = btrfs_cond_migrate_bytes(root->fs_info,
+					       &root->fs_info->trans_block_rsv,
+					       num_bytes, 5);
+		if (ret) {
+			btrfs_end_transaction(trans, root);
+			return ERR_PTR(ret);
 		}
-		BUG_ON(ret == 0); /* Corruption */
-		if (check_path_shared(root, path))
-			goto out;
-		btrfs_release_path(path);
-	}
-
-	if (!check_link) {
-		err = 0;
-		goto out;
-	}
-
-	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
-				dentry->d_name.name, dentry->d_name.len, 0);
-	if (IS_ERR(di)) {
-		err = PTR_ERR(di);
-		goto out;
-	}
-	if (di) {
-		if (check_path_shared(root, path))
-			goto out;
-	} else {
-		err = 0;
-		goto out;
-	}
-	btrfs_release_path(path);
-
-	ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
-					dentry->d_name.len, ino, dir_ino, 0,
-					&index);
-	if (ret) {
-		err = ret;
-		goto out;
-	}
-
-	if (check_path_shared(root, path))
-		goto out;
-
-	btrfs_release_path(path);
-
-	/*
-	 * This is a commit root search, if we can lookup inode item and other
-	 * relative items in the commit root, it means the transaction of
-	 * dir/file creation has been committed, and the dir index item that we
-	 * delay to insert has also been inserted into the commit root. So
-	 * we needn't worry about the delayed insertion of the dir index item
-	 * here.
-	 */
-	di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
-				dentry->d_name.name, dentry->d_name.len, 0);
-	if (IS_ERR(di)) {
-		err = PTR_ERR(di);
-		goto out;
-	}
-	BUG_ON(ret == -ENOENT);
-	if (check_path_shared(root, path))
-		goto out;
-
-	err = 0;
-out:
-	btrfs_free_path(path);
-	/* Migrate the orphan reservation over */
-	if (!err)
-		err = btrfs_block_rsv_migrate(trans->block_rsv,
-				&root->fs_info->global_block_rsv,
-				trans->bytes_reserved);
-
-	if (err) {
-		btrfs_end_transaction(trans, root);
-		root->fs_info->enospc_unlink = 0;
-		return ERR_PTR(err);
-	}
-
-	trans->block_rsv = &root->fs_info->global_block_rsv;
-	return trans;
-}
-
-static void __unlink_end_trans(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root)
-{
-	if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
-		btrfs_block_rsv_release(root, trans->block_rsv,
-					trans->bytes_reserved);
 		trans->block_rsv = &root->fs_info->trans_block_rsv;
-		BUG_ON(!root->fs_info->enospc_unlink);
-		root->fs_info->enospc_unlink = 0;
+		trans->bytes_reserved = num_bytes;
 	}
-	btrfs_end_transaction(trans, root);
+	return trans;
 }
 
 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3136,7 +3753,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	int ret;
 
-	trans = __unlink_start_trans(dir, dentry);
+	trans = __unlink_start_trans(dir);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
@@ -3154,7 +3771,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	}
 
 out:
-	__unlink_end_trans(trans, root);
+	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
 	return ret;
 }
@@ -3251,7 +3868,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
 		return -EPERM;
 
-	trans = __unlink_start_trans(dir, dentry);
+	trans = __unlink_start_trans(dir);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
@@ -3273,7 +3890,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!err)
 		btrfs_i_size_write(inode, 0);
 out:
-	__unlink_end_trans(trans, root);
+	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
 
 	return err;
@@ -3304,7 +3921,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	u64 extent_num_bytes = 0;
 	u64 extent_offset = 0;
 	u64 item_end = 0;
-	u64 mask = root->sectorsize - 1;
 	u32 found_type = (u8)-1;
 	int found_extent;
 	int del_item;
@@ -3328,7 +3944,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	 * extent just the way it is.
 	 */
 	if (root->ref_cows || root == root->fs_info->tree_root)
-		btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
+		btrfs_drop_extent_cache(inode, ALIGN(new_size,
+					root->sectorsize), (u64)-1, 0);
 
 	/*
 	 * This function is also used to drop the items in the log tree before
@@ -3407,10 +4024,9 @@ search_again:
 			if (!del_item) {
 				u64 orig_num_bytes =
 					btrfs_file_extent_num_bytes(leaf, fi);
-				extent_num_bytes = new_size -
-					found_key.offset + root->sectorsize - 1;
-				extent_num_bytes = extent_num_bytes &
-					~((u64)root->sectorsize - 1);
+				extent_num_bytes = ALIGN(new_size -
+						found_key.offset,
+						root->sectorsize);
 				btrfs_set_file_extent_num_bytes(leaf, fi,
 							 extent_num_bytes);
 				num_dec = (orig_num_bytes -
@@ -3450,8 +4066,7 @@ search_again:
 				}
 				size =
 				    btrfs_file_extent_calc_inline_size(size);
-				btrfs_truncate_item(trans, root, path,
-						    size, 1);
+				btrfs_truncate_item(root, path, size, 1);
 			} else if (root->ref_cows) {
 				inode_sub_bytes(inode, item_end + 1 -
 						found_key.offset);
@@ -3646,14 +4261,22 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	u64 mask = root->sectorsize - 1;
-	u64 hole_start = (oldsize + mask) & ~mask;
-	u64 block_end = (size + mask) & ~mask;
+	u64 hole_start = ALIGN(oldsize, root->sectorsize);
+	u64 block_end = ALIGN(size, root->sectorsize);
 	u64 last_byte;
 	u64 cur_offset;
 	u64 hole_size;
 	int err = 0;
 
+	/*
+	 * If our size started in the middle of a page we need to zero out the
+	 * rest of the page before we expand the i_size, otherwise we could
+	 * expose stale data.
+	 */
+	err = btrfs_truncate_page(inode, oldsize, 0, 0);
+	if (err)
+		return err;
+
 	if (size <= hole_start)
 		return 0;
 
@@ -3681,7 +4304,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 			break;
 		}
 		last_byte = min(extent_map_end(em), block_end);
-		last_byte = (last_byte + mask) & ~mask;
+		last_byte = ALIGN(last_byte , root->sectorsize);
 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
 			struct extent_map *hole_em;
 			hole_size = last_byte - cur_offset;
@@ -3726,16 +4349,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 			hole_em->block_start = EXTENT_MAP_HOLE;
 			hole_em->block_len = 0;
 			hole_em->orig_block_len = 0;
+			hole_em->ram_bytes = hole_size;
 			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
 			hole_em->generation = trans->transid;
 
 			while (1) {
 				write_lock(&em_tree->lock);
-				err = add_extent_mapping(em_tree, hole_em);
-				if (!err)
-					list_move(&hole_em->list,
-						  &em_tree->modified_extents);
+				err = add_extent_mapping(em_tree, hole_em, 1);
 				write_unlock(&em_tree->lock);
 				if (err != -EEXIST)
 					break;
@@ -3832,6 +4453,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 
 		/* we don't support swapfiles, so vmtruncate shouldn't fail */
 		truncate_setsize(inode, newsize);
+
+		/* Disable nonlocked read DIO to avoid the end less truncate */
+		btrfs_inode_block_unlocked_dio(inode);
+		inode_dio_wait(inode);
+		btrfs_inode_resume_unlocked_dio(inode);
+
 		ret = btrfs_truncate(inode);
 		if (ret && inode->i_nlink)
 			btrfs_orphan_del(NULL, inode);
@@ -3904,6 +4531,12 @@ void btrfs_evict_inode(struct inode *inode)
 		goto no_delete;
 	}
 
+	ret = btrfs_commit_inode_delayed_inode(inode);
+	if (ret) {
+		btrfs_orphan_del(NULL, inode);
+		goto no_delete;
+	}
+
 	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv) {
 		btrfs_orphan_del(NULL, inode);
@@ -3934,14 +4567,15 @@ void btrfs_evict_inode(struct inode *inode)
 			ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
 
 		if (ret) {
-			printk(KERN_WARNING "Could not get space for a "
-			       "delete, will truncate on mount %d\n", ret);
+			btrfs_warn(root->fs_info,
+				"Could not get space for a delete, will truncate on mount %d",
+				ret);
 			btrfs_orphan_del(NULL, inode);
 			btrfs_free_block_rsv(root, rsv);
 			goto no_delete;
 		}
 
-		trans = btrfs_start_transaction_lflush(root, 1);
+		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans)) {
 			btrfs_orphan_del(NULL, inode);
 			btrfs_free_block_rsv(root, rsv);
@@ -3955,9 +4589,6 @@ void btrfs_evict_inode(struct inode *inode)
 			break;
 
 		trans->block_rsv = &root->fs_info->trans_block_rsv;
-		ret = btrfs_update_inode(trans, root, inode);
-		BUG_ON(ret);
-
 		btrfs_end_transaction(trans, root);
 		trans = NULL;
 		btrfs_btree_balance_dirty(root);
@@ -3979,6 +4610,7 @@ void btrfs_evict_inode(struct inode *inode)
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
 no_delete:
+	btrfs_remove_delayed_node(inode);
 	clear_inode(inode);
 	return;
 }
@@ -4072,11 +4704,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 		goto out;
 	}
 
-	if (btrfs_root_refs(&new_root->root_item) == 0) {
-		err = -ENOENT;
-		goto out;
-	}
-
 	*sub_root = new_root;
 	location->objectid = btrfs_root_dirid(&new_root->root_item);
 	location->type = BTRFS_INODE_ITEM_KEY;
@@ -4094,14 +4721,13 @@ static void inode_tree_add(struct inode *inode)
 	struct rb_node **p;
 	struct rb_node *parent;
 	u64 ino = btrfs_ino(inode);
-again:
-	p = &root->inode_tree.rb_node;
-	parent = NULL;
 
 	if (inode_unhashed(inode))
 		return;
-
+again:
+	parent = NULL;
 	spin_lock(&root->inode_lock);
+	p = &root->inode_tree.rb_node;
 	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct btrfs_inode, rb_node);
@@ -4343,8 +4969,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 		if (!(inode->i_sb->s_flags & MS_RDONLY))
 			ret = btrfs_orphan_cleanup(sub_root);
 		up_read(&root->fs_info->cleanup_work_sem);
-		if (ret)
+		if (ret) {
+			iput(inode);
 			inode = ERR_PTR(ret);
+		}
 	}
 
 	return inode;
@@ -4388,10 +5016,9 @@ unsigned char btrfs_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
-static int btrfs_real_readdir(struct file *filp, void *dirent,
-			      filldir_t filldir)
+static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_item *item;
 	struct btrfs_dir_item *di;
@@ -4412,29 +5039,15 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 	char tmp_name[32];
 	char *name_ptr;
 	int name_len;
-	int is_curr = 0;	/* filp->f_pos points to the current index? */
+	int is_curr = 0;	/* ctx->pos points to the current index? */
 
 	/* FIXME, use a real flag for deciding about the key type */
 	if (root->fs_info->tree_root == root)
 		key_type = BTRFS_DIR_ITEM_KEY;
 
-	/* special case for "." */
-	if (filp->f_pos == 0) {
-		over = filldir(dirent, ".", 1,
-			       filp->f_pos, btrfs_ino(inode), DT_DIR);
-		if (over)
-			return 0;
-		filp->f_pos = 1;
-	}
-	/* special case for .., just use the back ref */
-	if (filp->f_pos == 1) {
-		u64 pino = parent_ino(filp->f_path.dentry);
-		over = filldir(dirent, "..", 2,
-			       filp->f_pos, pino, DT_DIR);
-		if (over)
-			return 0;
-		filp->f_pos = 2;
-	}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -4448,7 +5061,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 	}
 
 	btrfs_set_key_type(&key, key_type);
-	key.offset = filp->f_pos;
+	key.offset = ctx->pos;
 	key.objectid = btrfs_ino(inode);
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -4474,14 +5087,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			break;
 		if (btrfs_key_type(&found_key) != key_type)
 			break;
-		if (found_key.offset < filp->f_pos)
+		if (found_key.offset < ctx->pos)
 			goto next;
 		if (key_type == BTRFS_DIR_INDEX_KEY &&
 		    btrfs_should_delete_dir_index(&del_list,
 						  found_key.offset))
 			goto next;
 
-		filp->f_pos = found_key.offset;
+		ctx->pos = found_key.offset;
 		is_curr = 1;
 
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
@@ -4525,9 +5138,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 				over = 0;
 				goto skip;
 			}
-			over = filldir(dirent, name_ptr, name_len,
-				       found_key.offset, location.objectid,
-				       d_type);
+			over = !dir_emit(ctx, name_ptr, name_len,
+				       location.objectid, d_type);
 
 skip:
 			if (name_ptr != tmp_name)
@@ -4546,9 +5158,8 @@ next:
 
 	if (key_type == BTRFS_DIR_INDEX_KEY) {
 		if (is_curr)
-			filp->f_pos++;
-		ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
-						      &ins_list);
+			ctx->pos++;
+		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
 		if (ret)
 			goto nopos;
 	}
@@ -4559,9 +5170,9 @@ next:
 		 * 32-bit glibc will use getdents64, but then strtol -
 		 * so the last number we can serve is this.
 		 */
-		filp->f_pos = 0x7fffffff;
+		ctx->pos = 0x7fffffff;
 	else
-		filp->f_pos++;
+		ctx->pos++;
 nopos:
 	ret = 0;
 err:
@@ -4602,7 +5213,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
  * FIXME, needs more benchmarking...there are no reasons other than performance
  * to keep or drop this code.
  */
-int btrfs_dirty_inode(struct inode *inode)
+static int btrfs_dirty_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
@@ -4854,7 +5465,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		if (btrfs_test_opt(root, NODATASUM))
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
 		if (btrfs_test_opt(root, NODATACOW))
-			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
+				BTRFS_INODE_NODATASUM;
 	}
 
 	insert_inode_hash(inode);
@@ -5006,12 +5618,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	err = btrfs_update_inode(trans, root, inode);
-	if (err) {
-		drop_inode = 1;
-		goto out_unlock;
-	}
-
 	/*
 	* If the active LSM wants to access the inode during
 	* d_instantiate it needs these. Smack checks to see
@@ -5249,7 +5855,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
 		em->block_start += start_diff;
 		em->block_len -= start_diff;
 	}
-	return add_extent_mapping(em_tree, em);
+	return add_extent_mapping(em_tree, em, 0);
 }
 
 static noinline int uncompress_inline(struct btrfs_path *path,
@@ -5396,8 +6002,7 @@ again:
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		size_t size;
 		size = btrfs_file_extent_inline_len(leaf, item);
-		extent_end = (extent_start + size + root->sectorsize - 1) &
-			~((u64)root->sectorsize - 1);
+		extent_end = ALIGN(extent_start + size, root->sectorsize);
 	}
 
 	if (start >= extent_end) {
@@ -5424,6 +6029,7 @@ again:
 		goto not_found_em;
 	}
 
+	em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		em->start = extent_start;
@@ -5469,8 +6075,7 @@ again:
 		copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
 				size - extent_offset);
 		em->start = extent_start + extent_offset;
-		em->len = (copy_size + root->sectorsize - 1) &
-			~((u64)root->sectorsize - 1);
+		em->len = ALIGN(copy_size, root->sectorsize);
 		em->orig_block_len = em->len;
 		em->orig_start = em->start;
 		if (compress_type) {
@@ -5533,18 +6138,18 @@ not_found_em:
 insert:
 	btrfs_release_path(path);
 	if (em->start > start || extent_map_end(em) <= start) {
-		printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
-		       "[%llu %llu]\n", (unsigned long long)em->start,
-		       (unsigned long long)em->len,
-		       (unsigned long long)start,
-		       (unsigned long long)len);
+		btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
+			(unsigned long long)em->start,
+			(unsigned long long)em->len,
+			(unsigned long long)start,
+			(unsigned long long)len);
 		err = -EIO;
 		goto out;
 	}
 
 	err = 0;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em);
+	ret = add_extent_mapping(em_tree, em, 0);
 	/* it is possible that someone inserted the extent into the tree
 	 * while we had the lock dropped.  It is also possible that
 	 * an overlapping map exists in the tree
@@ -5756,7 +6361,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 	}
 
 	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
-			      ins.offset, ins.offset, 0);
+			      ins.offset, ins.offset, ins.offset, 0);
 	if (IS_ERR(em))
 		goto out;
 
@@ -5775,8 +6380,10 @@ out:
  * returns 1 when the nocow is safe, < 1 on error, 0 if the
  * block must be cow'd
  */
-static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
-				      struct inode *inode, u64 offset, u64 len)
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+			      struct inode *inode, u64 offset, u64 *len,
+			      u64 *orig_start, u64 *orig_block_len,
+			      u64 *ram_bytes)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -5790,7 +6397,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
 	u64 num_bytes;
 	int slot;
 	int found_type;
-
+	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -5830,15 +6437,29 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
 		/* not a regular extent, must cow */
 		goto out;
 	}
+
+	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+		goto out;
+
 	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
-	backref_offset = btrfs_file_extent_offset(leaf, fi);
+	if (disk_bytenr == 0)
+		goto out;
 
-	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
-	if (extent_end < offset + len) {
-		/* extent doesn't include our full range, must cow */
+	if (btrfs_file_extent_compression(leaf, fi) ||
+	    btrfs_file_extent_encryption(leaf, fi) ||
+	    btrfs_file_extent_other_encoding(leaf, fi))
 		goto out;
+
+	backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+	if (orig_start) {
+		*orig_start = key.offset - backref_offset;
+		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
 	}
 
+	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+
 	if (btrfs_extent_readonly(root, disk_bytenr))
 		goto out;
 
@@ -5858,13 +6479,14 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
 	 */
 	disk_bytenr += backref_offset;
 	disk_bytenr += offset - key.offset;
-	num_bytes = min(offset + len, extent_end) - offset;
+	num_bytes = min(offset + *len, extent_end) - offset;
 	if (csum_exist_in_range(root, disk_bytenr, num_bytes))
 				goto out;
 	/*
 	 * all of the above have passed, it is safe to overwrite this extent
 	 * without cow
 	 */
+	*len = num_bytes;
 	ret = 1;
 out:
 	btrfs_free_path(path);
@@ -5935,7 +6557,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 					   u64 len, u64 orig_start,
 					   u64 block_start, u64 block_len,
-					   u64 orig_block_len, int type)
+					   u64 orig_block_len, u64 ram_bytes,
+					   int type)
 {
 	struct extent_map_tree *em_tree;
 	struct extent_map *em;
@@ -5949,11 +6572,14 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 
 	em->start = start;
 	em->orig_start = orig_start;
+	em->mod_start = start;
+	em->mod_len = len;
 	em->len = len;
 	em->block_len = block_len;
 	em->block_start = block_start;
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	em->orig_block_len = orig_block_len;
+	em->ram_bytes = ram_bytes;
 	em->generation = -1;
 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
 	if (type == BTRFS_ORDERED_PREALLOC)
@@ -5963,10 +6589,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 		btrfs_drop_extent_cache(inode, em->start,
 				em->start + em->len - 1, 0);
 		write_lock(&em_tree->lock);
-		ret = add_extent_mapping(em_tree, em);
-		if (!ret)
-			list_move(&em->list,
-				  &em_tree->modified_extents);
+		ret = add_extent_mapping(em_tree, em, 1);
 		write_unlock(&em_tree->lock);
 	} while (ret == -EEXIST);
 
@@ -5990,16 +6613,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	u64 len = bh_result->b_size;
 	struct btrfs_trans_handle *trans;
 	int unlock_bits = EXTENT_LOCKED;
-	int ret;
+	int ret = 0;
 
-	if (create) {
-		ret = btrfs_delalloc_reserve_space(inode, len);
-		if (ret)
-			return ret;
+	if (create)
 		unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
-	} else {
+	else
 		len = min_t(u64, len, root->sectorsize);
-	}
 
 	lockstart = start;
 	lockend = start + len - 1;
@@ -6011,14 +6630,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
 		return -ENOTBLK;
 
-	if (create) {
-		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-				     lockend, EXTENT_DELALLOC, NULL,
-				     &cached_state, GFP_NOFS);
-		if (ret)
-			goto unlock_err;
-	}
-
 	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
 	if (IS_ERR(em)) {
 		ret = PTR_ERR(em);
@@ -6050,7 +6661,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
 			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
 		free_extent_map(em);
-		ret = 0;
 		goto unlock_err;
 	}
 
@@ -6074,7 +6684,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	     em->block_start != EXTENT_MAP_HOLE)) {
 		int type;
 		int ret;
-		u64 block_start;
+		u64 block_start, orig_start, orig_block_len, ram_bytes;
 
 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
 			type = BTRFS_ORDERED_PREALLOC;
@@ -6092,16 +6702,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 		if (IS_ERR(trans))
 			goto must_cow;
 
-		if (can_nocow_odirect(trans, inode, start, len) == 1) {
-			u64 orig_start = em->orig_start;
-			u64 orig_block_len = em->orig_block_len;
-
+		if (can_nocow_extent(trans, inode, start, &len, &orig_start,
+				     &orig_block_len, &ram_bytes) == 1) {
 			if (type == BTRFS_ORDERED_PREALLOC) {
 				free_extent_map(em);
 				em = create_pinned_em(inode, start, len,
 						       orig_start,
 						       block_start, len,
-						       orig_block_len, type);
+						       orig_block_len,
+						       ram_bytes, type);
 				if (IS_ERR(em)) {
 					btrfs_end_transaction(trans, root);
 					goto unlock_err;
@@ -6148,6 +6757,15 @@ unlock:
 		 */
 		if (start + len > i_size_read(inode))
 			i_size_write(inode, start + len);
+
+		spin_lock(&BTRFS_I(inode)->lock);
+		BTRFS_I(inode)->outstanding_extents++;
+		spin_unlock(&BTRFS_I(inode)->lock);
+
+		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockstart + len - 1, EXTENT_DELALLOC, NULL,
+				     &cached_state, GFP_NOFS);
+		BUG_ON(ret);
 	}
 
 	/*
@@ -6156,24 +6774,9 @@ unlock:
 	 * aren't using if there is any left over space.
 	 */
 	if (lockstart < lockend) {
-		if (create && len < lockend - lockstart) {
-			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-					 lockstart + len - 1,
-					 unlock_bits | EXTENT_DEFRAG, 1, 0,
-					 &cached_state, GFP_NOFS);
-			/*
-			 * Beside unlock, we also need to cleanup reserved space
-			 * for the left range by attaching EXTENT_DO_ACCOUNTING.
-			 */
-			clear_extent_bit(&BTRFS_I(inode)->io_tree,
-					 lockstart + len, lockend,
-					 unlock_bits | EXTENT_DO_ACCOUNTING |
-					 EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
-		} else {
-			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-					 lockend, unlock_bits, 1, 0,
-					 &cached_state, GFP_NOFS);
-		}
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				 lockend, unlock_bits, 1, 0,
+				 &cached_state, GFP_NOFS);
 	} else {
 		free_extent_state(cached_state);
 	}
@@ -6183,9 +6786,6 @@ unlock:
 	return 0;
 
 unlock_err:
-	if (create)
-		unlock_bits |= EXTENT_DO_ACCOUNTING;
-
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
 	return ret;
@@ -6204,7 +6804,11 @@ struct btrfs_dio_private {
 	/* IO errors */
 	int errors;
 
+	/* orig_bio is our btrfs_io_bio */
 	struct bio *orig_bio;
+
+	/* dio_bio came from fs/direct-io.c */
+	struct bio *dio_bio;
 };
 
 static void btrfs_endio_direct_read(struct bio *bio, int err)
@@ -6214,6 +6818,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 	struct bio_vec *bvec = bio->bi_io_vec;
 	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct bio *dio_bio;
 	u64 start;
 
 	start = dip->logical_offset;
@@ -6230,7 +6835,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 				goto failed;
 			local_irq_save(flags);
 			kaddr = kmap_atomic(page);
-			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+			csum = btrfs_csum_data(kaddr + bvec->bv_offset,
 					       csum, bvec->bv_len);
 			btrfs_csum_final(csum, (char *)&csum);
 			kunmap_atomic(kaddr);
@@ -6239,11 +6844,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 			flush_dcache_page(bvec->bv_page);
 			if (csum != private) {
 failed:
-				printk(KERN_ERR "btrfs csum failed ino %llu off"
-				      " %llu csum %u private %u\n",
-				      (unsigned long long)btrfs_ino(inode),
-				      (unsigned long long)start,
-				      csum, (unsigned)private);
+				btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u private %u",
+					(unsigned long long)btrfs_ino(inode),
+					(unsigned long long)start,
+					csum, (unsigned)private);
 				err = -EIO;
 			}
 		}
@@ -6254,14 +6858,15 @@ failed:
 
 	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
 		      dip->logical_offset + dip->bytes - 1);
-	bio->bi_private = dip->private;
+	dio_bio = dip->dio_bio;
 
 	kfree(dip);
 
 	/* If we had a csum failure make sure to clear the uptodate flag */
 	if (err)
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	dio_end_io(bio, err);
+		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+	dio_end_io(dio_bio, err);
+	bio_put(bio);
 }
 
 static void btrfs_endio_direct_write(struct bio *bio, int err)
@@ -6272,6 +6877,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
 	struct btrfs_ordered_extent *ordered = NULL;
 	u64 ordered_offset = dip->logical_offset;
 	u64 ordered_bytes = dip->bytes;
+	struct bio *dio_bio;
 	int ret;
 
 	if (err)
@@ -6299,14 +6905,15 @@ out_test:
 		goto again;
 	}
 out_done:
-	bio->bi_private = dip->private;
+	dio_bio = dip->dio_bio;
 
 	kfree(dip);
 
 	/* If we had an error make sure to clear the uptodate flag */
 	if (err)
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	dio_end_io(bio, err);
+		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+	dio_end_io(dio_bio, err);
+	bio_put(bio);
 }
 
 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
@@ -6342,10 +6949,10 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
 	if (!atomic_dec_and_test(&dip->pending_bios))
 		goto out;
 
-	if (dip->errors)
+	if (dip->errors) {
 		bio_io_error(dip->orig_bio);
-	else {
-		set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
+	} else {
+		set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
 		bio_endio(dip->orig_bio, 0);
 	}
 out:
@@ -6426,19 +7033,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 	int async_submit = 0;
 
 	map_length = orig_bio->bi_size;
-	ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
+	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
 			      &map_length, NULL, 0);
 	if (ret) {
 		bio_put(orig_bio);
 		return -EIO;
 	}
-
 	if (map_length >= orig_bio->bi_size) {
 		bio = orig_bio;
 		goto submit;
 	}
 
-	async_submit = 1;
+	/* async crcs make it difficult to collect full stripe writes. */
+	if (btrfs_get_alloc_profile(root, 1) &
+	    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+		async_submit = 0;
+	else
+		async_submit = 1;
+
 	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
 	if (!bio)
 		return -ENOMEM;
@@ -6480,7 +7092,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 			bio->bi_end_io = btrfs_end_dio_bio;
 
 			map_length = orig_bio->bi_size;
-			ret = btrfs_map_block(root->fs_info, READ,
+			ret = btrfs_map_block(root->fs_info, rw,
 					      start_sector << 9,
 					      &map_length, NULL, 0);
 			if (ret) {
@@ -6515,48 +7127,54 @@ out_err:
 	return 0;
 }
 
-static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
-				loff_t file_offset)
+static void btrfs_submit_direct(int rw, struct bio *dio_bio,
+				struct inode *inode, loff_t file_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_dio_private *dip;
-	struct bio_vec *bvec = bio->bi_io_vec;
+	struct bio *io_bio;
 	int skip_sum;
 	int write = rw & REQ_WRITE;
 	int ret = 0;
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
+	io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
+
+	if (!io_bio) {
+		ret = -ENOMEM;
+		goto free_ordered;
+	}
+
 	dip = kmalloc(sizeof(*dip), GFP_NOFS);
 	if (!dip) {
 		ret = -ENOMEM;
-		goto free_ordered;
+		goto free_io_bio;
 	}
 
-	dip->private = bio->bi_private;
+	dip->private = dio_bio->bi_private;
 	dip->inode = inode;
 	dip->logical_offset = file_offset;
-
-	dip->bytes = 0;
-	do {
-		dip->bytes += bvec->bv_len;
-		bvec++;
-	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
-
-	dip->disk_bytenr = (u64)bio->bi_sector << 9;
-	bio->bi_private = dip;
+	dip->bytes = dio_bio->bi_size;
+	dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
+	io_bio->bi_private = dip;
 	dip->errors = 0;
-	dip->orig_bio = bio;
+	dip->orig_bio = io_bio;
+	dip->dio_bio = dio_bio;
 	atomic_set(&dip->pending_bios, 0);
 
 	if (write)
-		bio->bi_end_io = btrfs_endio_direct_write;
+		io_bio->bi_end_io = btrfs_endio_direct_write;
 	else
-		bio->bi_end_io = btrfs_endio_direct_read;
+		io_bio->bi_end_io = btrfs_endio_direct_read;
 
 	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
 	if (!ret)
 		return;
+
+free_io_bio:
+	bio_put(io_bio);
+
 free_ordered:
 	/*
 	 * If this is a write, we need to clean up the reserved space and kill
@@ -6572,7 +7190,7 @@ free_ordered:
 		btrfs_put_ordered_extent(ordered);
 		btrfs_put_ordered_extent(ordered);
 	}
-	bio_endio(bio, ret);
+	bio_endio(dio_bio, ret);
 }
 
 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
@@ -6623,15 +7241,68 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	size_t count = 0;
+	int flags = 0;
+	bool wakeup = true;
+	bool relock = false;
+	ssize_t ret;
 
 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
 			    offset, nr_segs))
 		return 0;
 
-	return __blockdev_direct_IO(rw, iocb, inode,
-		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
-		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
-		   btrfs_submit_direct, 0);
+	atomic_inc(&inode->i_dio_count);
+	smp_mb__after_atomic_inc();
+
+	/*
+	 * The generic stuff only does filemap_write_and_wait_range, which isn't
+	 * enough if we've written compressed pages to this area, so we need to
+	 * call btrfs_wait_ordered_range to make absolutely sure that any
+	 * outstanding dirty pages are on disk.
+	 */
+	count = iov_length(iov, nr_segs);
+	btrfs_wait_ordered_range(inode, offset, count);
+
+	if (rw & WRITE) {
+		/*
+		 * If the write DIO is beyond the EOF, we need update
+		 * the isize, but it is protected by i_mutex. So we can
+		 * not unlock the i_mutex at this case.
+		 */
+		if (offset + count <= inode->i_size) {
+			mutex_unlock(&inode->i_mutex);
+			relock = true;
+		}
+		ret = btrfs_delalloc_reserve_space(inode, count);
+		if (ret)
+			goto out;
+	} else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+				     &BTRFS_I(inode)->runtime_flags))) {
+		inode_dio_done(inode);
+		flags = DIO_LOCKING | DIO_SKIP_HOLES;
+		wakeup = false;
+	}
+
+	ret = __blockdev_direct_IO(rw, iocb, inode,
+			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+			iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+			btrfs_submit_direct, flags);
+	if (rw & WRITE) {
+		if (ret < 0 && ret != -EIOCBQUEUED)
+			btrfs_delalloc_release_space(inode, count);
+		else if (ret >= 0 && (size_t)ret < count)
+			btrfs_delalloc_release_space(inode,
+						     count - (size_t)ret);
+		else
+			btrfs_delalloc_release_metadata(inode, 0);
+	}
+out:
+	if (wakeup)
+		inode_dio_done(inode);
+	if (relock)
+		mutex_lock(&inode->i_mutex);
+
+	return ret;
 }
 
 #define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
@@ -6669,8 +7340,8 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
 }
 
-int btrfs_writepages(struct address_space *mapping,
-		     struct writeback_control *wbc)
+static int btrfs_writepages(struct address_space *mapping,
+			    struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
 
@@ -6711,7 +7382,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
 }
 
-static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+static void btrfs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *tree;
@@ -6735,8 +7407,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		return;
 	}
 	lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
-	ordered = btrfs_lookup_ordered_extent(inode,
-					   page_offset(page));
+	ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
 	if (ordered) {
 		/*
 		 * IO on this page will never be started, so we need
@@ -6791,7 +7462,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = fdentry(vma->vm_file)->d_inode;
+	struct inode *inode = file_inode(vma->vm_file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
@@ -6912,16 +7583,12 @@ static int btrfs_truncate(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv;
-	int ret;
+	int ret = 0;
 	int err = 0;
 	struct btrfs_trans_handle *trans;
 	u64 mask = root->sectorsize - 1;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
-	ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
-	if (ret)
-		return ret;
-
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
@@ -7179,15 +7846,15 @@ void btrfs_destroy_inode(struct inode *inode)
 	 */
 	smp_mb();
 	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
-		spin_lock(&root->fs_info->ordered_extent_lock);
+		spin_lock(&root->fs_info->ordered_root_lock);
 		list_del_init(&BTRFS_I(inode)->ordered_operations);
-		spin_unlock(&root->fs_info->ordered_extent_lock);
+		spin_unlock(&root->fs_info->ordered_root_lock);
 	}
 
 	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
 		     &BTRFS_I(inode)->runtime_flags)) {
-		printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
-		       (unsigned long long)btrfs_ino(inode));
+		btrfs_info(root->fs_info, "inode %llu still on the orphan list",
+			(unsigned long long)btrfs_ino(inode));
 		atomic_dec(&root->orphan_inodes);
 	}
 
@@ -7196,10 +7863,9 @@ void btrfs_destroy_inode(struct inode *inode)
 		if (!ordered)
 			break;
 		else {
-			printk(KERN_ERR "btrfs found ordered "
-			       "extent %llu %llu on inode cleanup\n",
-			       (unsigned long long)ordered->file_offset,
-			       (unsigned long long)ordered->len);
+			btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
+				(unsigned long long)ordered->file_offset,
+				(unsigned long long)ordered->len);
 			btrfs_remove_ordered_extent(inode, ordered);
 			btrfs_put_ordered_extent(ordered);
 			btrfs_put_ordered_extent(ordered);
@@ -7208,7 +7874,6 @@ void btrfs_destroy_inode(struct inode *inode)
 	inode_tree_del(inode);
 	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
-	btrfs_remove_delayed_node(inode);
 	call_rcu(&inode->i_rcu, btrfs_i_callback);
 }
 
@@ -7216,8 +7881,12 @@ int btrfs_drop_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 
+	if (root == NULL)
+		return 1;
+
+	/* the snap/subvol tree is on deleting */
 	if (btrfs_root_refs(&root->root_item) == 0 &&
-	    !btrfs_is_free_space_inode(inode))
+	    root != root->fs_info->tree_root)
 		return 1;
 	else
 		return generic_drop_inode(inode);
@@ -7299,40 +7968,22 @@ fail:
 static int btrfs_getattr(struct vfsmount *mnt,
 			 struct dentry *dentry, struct kstat *stat)
 {
+	u64 delalloc_bytes;
 	struct inode *inode = dentry->d_inode;
 	u32 blocksize = inode->i_sb->s_blocksize;
 
 	generic_fillattr(inode, stat);
 	stat->dev = BTRFS_I(inode)->root->anon_dev;
 	stat->blksize = PAGE_CACHE_SIZE;
+
+	spin_lock(&BTRFS_I(inode)->lock);
+	delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
+	spin_unlock(&BTRFS_I(inode)->lock);
 	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
-		ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
+			ALIGN(delalloc_bytes, blocksize)) >> 9;
 	return 0;
 }
 
-/*
- * If a file is moved, it will inherit the cow and compression flags of the new
- * directory.
- */
-static void fixup_inode_flags(struct inode *dir, struct inode *inode)
-{
-	struct btrfs_inode *b_dir = BTRFS_I(dir);
-	struct btrfs_inode *b_inode = BTRFS_I(inode);
-
-	if (b_dir->flags & BTRFS_INODE_NODATACOW)
-		b_inode->flags |= BTRFS_INODE_NODATACOW;
-	else
-		b_inode->flags &= ~BTRFS_INODE_NODATACOW;
-
-	if (b_dir->flags & BTRFS_INODE_COMPRESS) {
-		b_inode->flags |= BTRFS_INODE_COMPRESS;
-		b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
-	} else {
-		b_inode->flags &= ~(BTRFS_INODE_COMPRESS |
-				    BTRFS_INODE_NOCOMPRESS);
-	}
-}
-
 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			   struct inode *new_dir, struct dentry *new_dentry)
 {
@@ -7403,7 +8054,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
 	 * should cover the worst case number of items we'll modify.
 	 */
-	trans = btrfs_start_transaction(root, 20);
+	trans = btrfs_start_transaction(root, 11);
 	if (IS_ERR(trans)) {
                 ret = PTR_ERR(trans);
                 goto out_notrans;
@@ -7498,8 +8149,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		}
 	}
 
-	fixup_inode_flags(new_dir, old_inode);
-
 	ret = btrfs_add_link(trans, new_dir, old_inode,
 			     new_dentry->d_name.name,
 			     new_dentry->d_name.len, 0, index);
@@ -7569,7 +8218,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
 	struct btrfs_inode *binode;
 	struct inode *inode;
@@ -7578,27 +8227,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	struct list_head splice;
 	int ret = 0;
 
-	if (root->fs_info->sb->s_flags & MS_RDONLY)
-		return -EROFS;
-
 	INIT_LIST_HEAD(&works);
 	INIT_LIST_HEAD(&splice);
-again:
-	spin_lock(&root->fs_info->delalloc_lock);
-	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+
+	spin_lock(&root->delalloc_lock);
+	list_splice_init(&root->delalloc_inodes, &splice);
 	while (!list_empty(&splice)) {
 		binode = list_entry(splice.next, struct btrfs_inode,
 				    delalloc_inodes);
 
-		list_del_init(&binode->delalloc_inodes);
-
+		list_move_tail(&binode->delalloc_inodes,
+			       &root->delalloc_inodes);
 		inode = igrab(&binode->vfs_inode);
-		if (!inode)
+		if (!inode) {
+			cond_resched_lock(&root->delalloc_lock);
 			continue;
-
-		list_add_tail(&binode->delalloc_inodes,
-			      &root->fs_info->delalloc_inodes);
-		spin_unlock(&root->fs_info->delalloc_lock);
+		}
+		spin_unlock(&root->delalloc_lock);
 
 		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
 		if (unlikely(!work)) {
@@ -7610,23 +8255,39 @@ again:
 				   &work->work);
 
 		cond_resched();
-		spin_lock(&root->fs_info->delalloc_lock);
+		spin_lock(&root->delalloc_lock);
 	}
-	spin_unlock(&root->fs_info->delalloc_lock);
+	spin_unlock(&root->delalloc_lock);
 
 	list_for_each_entry_safe(work, next, &works, list) {
 		list_del_init(&work->list);
 		btrfs_wait_and_free_delalloc_work(work);
 	}
+	return 0;
+out:
+	list_for_each_entry_safe(work, next, &works, list) {
+		list_del_init(&work->list);
+		btrfs_wait_and_free_delalloc_work(work);
+	}
 
-	spin_lock(&root->fs_info->delalloc_lock);
-	if (!list_empty(&root->fs_info->delalloc_inodes)) {
-		spin_unlock(&root->fs_info->delalloc_lock);
-		goto again;
+	if (!list_empty_careful(&splice)) {
+		spin_lock(&root->delalloc_lock);
+		list_splice_tail(&splice, &root->delalloc_inodes);
+		spin_unlock(&root->delalloc_lock);
 	}
-	spin_unlock(&root->fs_info->delalloc_lock);
+	return ret;
+}
 
-	/* the filemap_flush will queue IO into the worker threads, but
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+{
+	int ret;
+
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	ret = __start_delalloc_inodes(root, delay_iput);
+	/*
+	 * the filemap_flush will queue IO into the worker threads, but
 	 * we have to make sure the IO is actually started and that
 	 * ordered extents get created before we return
 	 */
@@ -7638,17 +8299,55 @@ again:
 		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 	}
 	atomic_dec(&root->fs_info->async_submit_draining);
-	return 0;
-out:
-	list_for_each_entry_safe(work, next, &works, list) {
-		list_del_init(&work->list);
-		btrfs_wait_and_free_delalloc_work(work);
+	return ret;
+}
+
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+				    int delay_iput)
+{
+	struct btrfs_root *root;
+	struct list_head splice;
+	int ret;
+
+	if (fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&fs_info->delalloc_root_lock);
+	list_splice_init(&fs_info->delalloc_roots, &splice);
+	while (!list_empty(&splice)) {
+		root = list_first_entry(&splice, struct btrfs_root,
+					delalloc_root);
+		root = btrfs_grab_fs_root(root);
+		BUG_ON(!root);
+		list_move_tail(&root->delalloc_root,
+			       &fs_info->delalloc_roots);
+		spin_unlock(&fs_info->delalloc_root_lock);
+
+		ret = __start_delalloc_inodes(root, delay_iput);
+		btrfs_put_fs_root(root);
+		if (ret)
+			goto out;
+
+		spin_lock(&fs_info->delalloc_root_lock);
 	}
+	spin_unlock(&fs_info->delalloc_root_lock);
 
+	atomic_inc(&fs_info->async_submit_draining);
+	while (atomic_read(&fs_info->nr_async_submits) ||
+	      atomic_read(&fs_info->async_delalloc_pages)) {
+		wait_event(fs_info->async_submit_wait,
+		   (atomic_read(&fs_info->nr_async_submits) == 0 &&
+		    atomic_read(&fs_info->async_delalloc_pages) == 0));
+	}
+	atomic_dec(&fs_info->async_submit_draining);
+	return 0;
+out:
 	if (!list_empty_careful(&splice)) {
-		spin_lock(&root->fs_info->delalloc_lock);
-		list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
-		spin_unlock(&root->fs_info->delalloc_lock);
+		spin_lock(&fs_info->delalloc_root_lock);
+		list_splice_tail(&splice, &fs_info->delalloc_roots);
+		spin_unlock(&fs_info->delalloc_root_lock);
 	}
 	return ret;
 }
@@ -7787,6 +8486,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 	struct btrfs_key ins;
 	u64 cur_offset = start;
 	u64 i_size;
+	u64 cur_bytes;
 	int ret = 0;
 	bool own_trans = true;
 
@@ -7801,8 +8501,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 			}
 		}
 
-		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
-					   0, *alloc_hint, &ins, 1);
+		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
+		cur_bytes = max(cur_bytes, min_size);
+		ret = btrfs_reserve_extent(trans, root, cur_bytes,
+					   min_size, 0, *alloc_hint, &ins, 1);
 		if (ret) {
 			if (own_trans)
 				btrfs_end_transaction(trans, root);
@@ -7836,16 +8538,14 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
 		em->orig_block_len = ins.offset;
+		em->ram_bytes = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
 		em->generation = trans->transid;
 
 		while (1) {
 			write_lock(&em_tree->lock);
-			ret = add_extent_mapping(em_tree, em);
-			if (!ret)
-				list_move(&em->list,
-					  &em_tree->modified_extents);
+			ret = add_extent_mapping(em_tree, em, 1);
 			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST)
 				break;
@@ -7954,7 +8654,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
 static const struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= btrfs_real_readdir,
+	.iterate	= btrfs_real_readdir,
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_ioctl,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 338f2597bf7..238a05545ee 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -42,12 +42,12 @@
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/uuid.h>
+#include <linux/btrfs.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ioctl.h"
 #include "print-tree.h"
 #include "volumes.h"
 #include "locking.h"
@@ -152,7 +152,7 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
 
 static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
 {
-	struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode);
+	struct btrfs_inode *ip = BTRFS_I(file_inode(file));
 	unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
 
 	if (copy_to_user(arg, &flags, sizeof(flags)))
@@ -177,7 +177,7 @@ static int check_flags(unsigned int flags)
 
 static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct btrfs_inode *ip = BTRFS_I(inode);
 	struct btrfs_root *root = ip->root;
 	struct btrfs_trans_handle *trans;
@@ -310,7 +310,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
 static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 
 	return put_user(inode->i_generation, arg);
 }
@@ -363,46 +363,52 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 	return 0;
 }
 
-static noinline int create_subvol(struct btrfs_root *root,
+static noinline int create_subvol(struct inode *dir,
 				  struct dentry *dentry,
 				  char *name, int namelen,
 				  u64 *async_transid,
-				  struct btrfs_qgroup_inherit **inherit)
+				  struct btrfs_qgroup_inherit *inherit)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key key;
 	struct btrfs_root_item root_item;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_root *new_root;
-	struct dentry *parent = dentry->d_parent;
-	struct inode *dir;
+	struct btrfs_block_rsv block_rsv;
 	struct timespec cur_time = CURRENT_TIME;
 	int ret;
 	int err;
 	u64 objectid;
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
 	u64 index = 0;
+	u64 qgroup_reserved;
 	uuid_le new_uuid;
 
 	ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
 	if (ret)
 		return ret;
 
-	dir = parent->d_inode;
-
+	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
 	/*
-	 * 1 - inode item
-	 * 2 - refs
-	 * 1 - root item
-	 * 2 - dir items
+	 * The same as the snapshot creation, please see the comment
+	 * of create_snapshot().
 	 */
-	trans = btrfs_start_transaction(root, 6);
-	if (IS_ERR(trans))
-		return PTR_ERR(trans);
+	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+					       7, &qgroup_reserved);
+	if (ret)
+		return ret;
 
-	ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid,
-				   inherit ? *inherit : NULL);
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+	trans->block_rsv = &block_rsv;
+	trans->bytes_reserved = block_rsv.size;
+
+	ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
 	if (ret)
 		goto fail;
 
@@ -516,9 +522,13 @@ static noinline int create_subvol(struct btrfs_root *root,
 	BUG_ON(ret);
 
 fail:
+	trans->block_rsv = NULL;
+	trans->bytes_reserved = 0;
 	if (async_transid) {
 		*async_transid = trans->transid;
 		err = btrfs_commit_transaction_async(trans, root, 1);
+		if (err)
+			err = btrfs_commit_transaction(trans, root);
 	} else {
 		err = btrfs_commit_transaction(trans, root);
 	}
@@ -527,13 +537,15 @@ fail:
 
 	if (!ret)
 		d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
-
+out:
+	btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
 	return ret;
 }
 
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
-			   char *name, int namelen, u64 *async_transid,
-			   bool readonly, struct btrfs_qgroup_inherit **inherit)
+static int create_snapshot(struct btrfs_root *root, struct inode *dir,
+			   struct dentry *dentry, char *name, int namelen,
+			   u64 *async_transid, bool readonly,
+			   struct btrfs_qgroup_inherit *inherit)
 {
 	struct inode *inode;
 	struct btrfs_pending_snapshot *pending_snapshot;
@@ -543,29 +555,43 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	if (!root->ref_cows)
 		return -EINVAL;
 
+	ret = btrfs_start_delalloc_inodes(root, 0);
+	if (ret)
+		return ret;
+
+	btrfs_wait_ordered_extents(root, 0);
+
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
 	if (!pending_snapshot)
 		return -ENOMEM;
 
 	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
 			     BTRFS_BLOCK_RSV_TEMP);
+	/*
+	 * 1 - parent dir inode
+	 * 2 - dir entries
+	 * 1 - root item
+	 * 2 - root ref/backref
+	 * 1 - root of snapshot
+	 */
+	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
+					&pending_snapshot->block_rsv, 7,
+					&pending_snapshot->qgroup_reserved);
+	if (ret)
+		goto out;
+
 	pending_snapshot->dentry = dentry;
 	pending_snapshot->root = root;
 	pending_snapshot->readonly = readonly;
-	if (inherit) {
-		pending_snapshot->inherit = *inherit;
-		*inherit = NULL;	/* take responsibility to free it */
-	}
+	pending_snapshot->dir = dir;
+	pending_snapshot->inherit = inherit;
 
-	trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
+	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		goto fail;
 	}
 
-	ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
-	BUG_ON(ret);
-
 	spin_lock(&root->fs_info->trans_lock);
 	list_add(&pending_snapshot->list,
 		 &trans->transaction->pending_snapshots);
@@ -574,16 +600,14 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 		*async_transid = trans->transid;
 		ret = btrfs_commit_transaction_async(trans,
 				     root->fs_info->extent_root, 1);
+		if (ret)
+			ret = btrfs_commit_transaction(trans, root);
 	} else {
 		ret = btrfs_commit_transaction(trans,
 					       root->fs_info->extent_root);
 	}
-	if (ret) {
-		/* cleanup_transaction has freed this for us */
-		if (trans->aborted)
-			pending_snapshot = NULL;
+	if (ret)
 		goto fail;
-	}
 
 	ret = pending_snapshot->error;
 	if (ret)
@@ -602,6 +626,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	d_instantiate(dentry, inode);
 	ret = 0;
 fail:
+	btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
+					 &pending_snapshot->block_rsv,
+					 pending_snapshot->qgroup_reserved);
+out:
 	kfree(pending_snapshot);
 	return ret;
 }
@@ -695,13 +723,15 @@ static noinline int btrfs_mksubvol(struct path *parent,
 				   char *name, int namelen,
 				   struct btrfs_root *snap_src,
 				   u64 *async_transid, bool readonly,
-				   struct btrfs_qgroup_inherit **inherit)
+				   struct btrfs_qgroup_inherit *inherit)
 {
 	struct inode *dir  = parent->dentry->d_inode;
 	struct dentry *dentry;
 	int error;
 
-	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	if (error == -EINTR)
+		return error;
 
 	dentry = lookup_one_len(name, parent->dentry, namelen);
 	error = PTR_ERR(dentry);
@@ -732,11 +762,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
 		goto out_up_read;
 
 	if (snap_src) {
-		error = create_snapshot(snap_src, dentry, name, namelen,
+		error = create_snapshot(snap_src, dir, dentry, name, namelen,
 					async_transid, readonly, inherit);
 	} else {
-		error = create_subvol(BTRFS_I(dir)->root, dentry,
-				      name, namelen, async_transid, inherit);
+		error = create_subvol(dir, dentry, name, namelen,
+				      async_transid, inherit);
 	}
 	if (!error)
 		fsnotify_mkdir(dir, dentry);
@@ -818,7 +848,7 @@ static int find_new_extents(struct btrfs_root *root,
 
 	while(1) {
 		ret = btrfs_search_forward(root, &min_key, &max_key,
-					   path, 0, newer_than);
+					   path, newer_than);
 		if (ret != 0)
 			goto none;
 		if (min_key.objectid != ino)
@@ -1130,8 +1160,11 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	u64 new_align = ~((u64)128 * 1024 - 1);
 	struct page **pages = NULL;
 
-	if (extent_thresh == 0)
-		extent_thresh = 256 * 1024;
+	if (isize == 0)
+		return 0;
+
+	if (range->start >= isize)
+		return -EINVAL;
 
 	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
 		if (range->compress_type > BTRFS_COMPRESS_TYPES)
@@ -1140,8 +1173,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			compress_type = range->compress_type;
 	}
 
-	if (isize == 0)
-		return 0;
+	if (extent_thresh == 0)
+		extent_thresh = 256 * 1024;
 
 	/*
 	 * if we were not given a file, allocate a readahead
@@ -1206,6 +1239,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		if (!(inode->i_sb->s_flags & MS_ACTIVE))
 			break;
 
+		if (btrfs_defrag_cancelled(root->fs_info)) {
+			printk(KERN_DEBUG "btrfs: defrag_file cancelled\n");
+			ret = -EAGAIN;
+			break;
+		}
+
 		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
 					 extent_thresh, &last_len, &skip,
 					 &defrag_end, range->flags &
@@ -1320,7 +1359,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 	u64 new_size;
 	u64 old_size;
 	u64 devid = 1;
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 	struct btrfs_ioctl_vol_args *vol_args;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_device *device = NULL;
@@ -1329,9 +1368,6 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 	int ret = 0;
 	int mod = 0;
 
-	if (root->fs_info->sb->s_flags & MS_RDONLY)
-		return -EROFS;
-
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
@@ -1363,6 +1399,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 		*devstr = '\0';
 		devstr = vol_args->name;
 		devid = simple_strtoull(devstr, &end, 10);
+		if (!devid) {
+			ret = -EINVAL;
+			goto out_free;
+		}
 		printk(KERN_INFO "btrfs: resizing devid %llu\n",
 		       (unsigned long long)devid);
 	}
@@ -1371,7 +1411,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 	if (!device) {
 		printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
 		       (unsigned long long)devid);
-		ret = -EINVAL;
+		ret = -ENODEV;
 		goto out_free;
 	}
 
@@ -1379,7 +1419,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 		printk(KERN_INFO "btrfs: resizer unable to apply on "
 		       "readonly device %llu\n",
 		       (unsigned long long)devid);
-		ret = -EINVAL;
+		ret = -EPERM;
 		goto out_free;
 	}
 
@@ -1401,7 +1441,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 	}
 
 	if (device->is_tgtdev_for_dev_replace) {
-		ret = -EINVAL;
+		ret = -EPERM;
 		goto out_free;
 	}
 
@@ -1457,7 +1497,7 @@ out:
 static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 				char *name, unsigned long fd, int subvol,
 				u64 *transid, bool readonly,
-				struct btrfs_qgroup_inherit **inherit)
+				struct btrfs_qgroup_inherit *inherit)
 {
 	int namelen;
 	int ret = 0;
@@ -1489,8 +1529,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 			goto out_drop_write;
 		}
 
-		src_inode = src.file->f_path.dentry->d_inode;
-		if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
+		src_inode = file_inode(src.file);
+		if (src_inode->i_sb != file_inode(file)->i_sb) {
 			printk(KERN_INFO "btrfs: Snapshot src from "
 			       "another FS\n");
 			ret = -EINVAL;
@@ -1566,7 +1606,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
 
 	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
 					      vol_args->fd, subvol, ptr,
-					      readonly, &inherit);
+					      readonly, inherit);
 
 	if (ret == 0 && ptr &&
 	    copy_to_user(arg +
@@ -1582,7 +1622,7 @@ out:
 static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
 						void __user *arg)
 {
-	struct inode *inode = fdentry(file)->d_inode;
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 	u64 flags = 0;
@@ -1604,7 +1644,7 @@ static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
 static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
 					      void __user *arg)
 {
-	struct inode *inode = fdentry(file)->d_inode;
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	u64 root_flags;
@@ -1767,7 +1807,11 @@ static noinline int copy_to_sk(struct btrfs_root *root,
 		item_off = btrfs_item_ptr_offset(leaf, i);
 		item_len = btrfs_item_size_nr(leaf, i);
 
-		if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
+		btrfs_item_key_to_cpu(leaf, key, i);
+		if (!key_in_sk(key, sk))
+			continue;
+
+		if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
 			item_len = 0;
 
 		if (sizeof(sh) + item_len + *sk_offset >
@@ -1776,10 +1820,6 @@ static noinline int copy_to_sk(struct btrfs_root *root,
 			goto overflow;
 		}
 
-		btrfs_item_key_to_cpu(leaf, key, i);
-		if (!key_in_sk(key, sk))
-			continue;
-
 		sh.objectid = key->objectid;
 		sh.offset = key->offset;
 		sh.type = key->type;
@@ -1863,7 +1903,7 @@ static noinline int search_ioctl(struct inode *inode,
 	path->keep_locks = 1;
 
 	while(1) {
-		ret = btrfs_search_forward(root, &key, &max_key, path, 0,
+		ret = btrfs_search_forward(root, &key, &max_key, path,
 					   sk->min_transid);
 		if (ret != 0) {
 			if (ret > 0)
@@ -1898,7 +1938,7 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
 	if (IS_ERR(args))
 		return PTR_ERR(args);
 
-	inode = fdentry(file)->d_inode;
+	inode = file_inode(file);
 	ret = search_ioctl(inode, args);
 	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
 		ret = -EFAULT;
@@ -2008,7 +2048,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 	if (IS_ERR(args))
 		return PTR_ERR(args);
 
-	inode = fdentry(file)->d_inode;
+	inode = file_inode(file);
 
 	if (args->treeid == 0)
 		args->treeid = BTRFS_I(inode)->root->root_key.objectid;
@@ -2035,6 +2075,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	struct btrfs_root *dest = NULL;
 	struct btrfs_ioctl_vol_args *vol_args;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_block_rsv block_rsv;
+	u64 qgroup_reserved;
 	int namelen;
 	int ret;
 	int err = 0;
@@ -2055,7 +2097,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	if (err)
 		goto out;
 
-	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	if (err == -EINTR)
+		goto out;
 	dentry = lookup_one_len(vol_args->name, parent, namelen);
 	if (IS_ERR(dentry)) {
 		err = PTR_ERR(dentry);
@@ -2124,12 +2168,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	if (err)
 		goto out_up_write;
 
+	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+	/*
+	 * One for dir inode, two for dir entries, two for root
+	 * ref/backref.
+	 */
+	err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+					       5, &qgroup_reserved);
+	if (err)
+		goto out_up_write;
+
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
-		goto out_up_write;
+		goto out_release;
 	}
-	trans->block_rsv = &root->fs_info->global_block_rsv;
+	trans->block_rsv = &block_rsv;
+	trans->bytes_reserved = block_rsv.size;
 
 	ret = btrfs_unlink_subvol(trans, root, dir,
 				dest->root_key.objectid,
@@ -2159,10 +2214,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		}
 	}
 out_end_trans:
+	trans->block_rsv = NULL;
+	trans->bytes_reserved = 0;
 	ret = btrfs_end_transaction(trans, root);
 	if (ret && !err)
 		err = ret;
 	inode->i_flags |= S_DEAD;
+out_release:
+	btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
 out_up_write:
 	up_write(&root->fs_info->subvol_sem);
 out_unlock:
@@ -2171,6 +2230,12 @@ out_unlock:
 		shrink_dcache_sb(root->fs_info->sb);
 		btrfs_invalidate_inodes(dest);
 		d_delete(dentry);
+
+		/* the last ref */
+		if (dest->cache_inode) {
+			iput(dest->cache_inode);
+			dest->cache_inode = NULL;
+		}
 	}
 out_dput:
 	dput(dentry);
@@ -2184,7 +2249,7 @@ out:
 
 static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 {
-	struct inode *inode = fdentry(file)->d_inode;
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ioctl_defrag_range_args *range;
 	int ret;
@@ -2193,13 +2258,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 	if (ret)
 		return ret;
 
-	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
-			1)) {
-		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-		mnt_drop_write_file(file);
-		return -EINVAL;
-	}
-
 	if (btrfs_root_readonly(root)) {
 		ret = -EROFS;
 		goto out;
@@ -2211,10 +2269,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 			ret = -EPERM;
 			goto out;
 		}
-		ret = btrfs_defrag_root(root, 0);
+		ret = btrfs_defrag_root(root);
 		if (ret)
 			goto out;
-		ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
+		ret = btrfs_defrag_root(root->fs_info->extent_root);
 		break;
 	case S_IFREG:
 		if (!(file->f_mode & FMODE_WRITE)) {
@@ -2244,7 +2302,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 			/* the rest are all set to zero by kzalloc */
 			range->len = (u64)-1;
 		}
-		ret = btrfs_defrag_file(fdentry(file)->d_inode, file,
+		ret = btrfs_defrag_file(file_inode(file), file,
 					range, 0, 0);
 		if (ret > 0)
 			ret = 0;
@@ -2254,7 +2312,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 		ret = -EINVAL;
 	}
 out:
-	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
 	mnt_drop_write_file(file);
 	return ret;
 }
@@ -2292,7 +2349,7 @@ out:
 
 static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 {
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 	struct btrfs_ioctl_vol_args *vol_args;
 	int ret;
 
@@ -2303,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	if (ret)
 		return ret;
 
-	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
-			1)) {
-		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-		mnt_drop_write_file(file);
-		return -EINVAL;
-	}
-
-	mutex_lock(&root->fs_info->volume_mutex);
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args)) {
 		ret = PTR_ERR(vol_args);
@@ -2318,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	}
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-	ret = btrfs_rm_device(root, vol_args->name);
 
-	kfree(vol_args);
-out:
+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+			1)) {
+		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+		goto out;
+	}
+
+	mutex_lock(&root->fs_info->volume_mutex);
+	ret = btrfs_rm_device(root, vol_args->name);
 	mutex_unlock(&root->fs_info->volume_mutex);
 	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+
+out:
+	kfree(vol_args);
 	mnt_drop_write_file(file);
 	return ret;
 }
@@ -2381,7 +2438,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
 
 	mutex_lock(&fs_devices->device_list_mutex);
 	dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
-	mutex_unlock(&fs_devices->device_list_mutex);
 
 	if (!dev) {
 		ret = -ENODEV;
@@ -2405,6 +2461,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
 	}
 
 out:
+	mutex_unlock(&fs_devices->device_list_mutex);
 	if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
 		ret = -EFAULT;
 
@@ -2415,7 +2472,7 @@ out:
 static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				       u64 off, u64 olen, u64 destoff)
 {
-	struct inode *inode = fdentry(file)->d_inode;
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct fd src_file;
 	struct inode *src;
@@ -2429,6 +2486,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	int ret;
 	u64 len = olen;
 	u64 bs = root->fs_info->sb->s_blocksize;
+	int same_inode = 0;
 
 	/*
 	 * TODO:
@@ -2461,11 +2519,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	if (src_file.file->f_path.mnt != file->f_path.mnt)
 		goto out_fput;
 
-	src = src_file.file->f_dentry->d_inode;
+	src = file_inode(src_file.file);
 
 	ret = -EINVAL;
 	if (src == inode)
-		goto out_fput;
+		same_inode = 1;
 
 	/* the src must be open for reading */
 	if (!(src_file.file->f_mode & FMODE_READ))
@@ -2496,12 +2554,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	}
 	path->reada = 2;
 
-	if (inode < src) {
-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+	if (!same_inode) {
+		if (inode < src) {
+			mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+			mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+		} else {
+			mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+			mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		}
 	} else {
-		mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		mutex_lock(&src->i_mutex);
 	}
 
 	/* determine range to clone */
@@ -2519,6 +2581,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	    !IS_ALIGNED(destoff, bs))
 		goto out_unlock;
 
+	/* verify if ranges are overlapped within the same file */
+	if (same_inode) {
+		if (destoff + len > off && destoff < off + len)
+			goto out_unlock;
+	}
+
 	if (destoff > inode->i_size) {
 		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
 		if (ret)
@@ -2795,7 +2863,8 @@ out:
 	unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
 out_unlock:
 	mutex_unlock(&src->i_mutex);
-	mutex_unlock(&inode->i_mutex);
+	if (!same_inode)
+		mutex_unlock(&inode->i_mutex);
 	vfree(buf);
 	btrfs_free_path(path);
 out_fput:
@@ -2823,7 +2892,7 @@ static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
  */
 static long btrfs_ioctl_trans_start(struct file *file)
 {
-	struct inode *inode = fdentry(file)->d_inode;
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret;
@@ -2863,7 +2932,7 @@ out:
 
 static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 {
-	struct inode *inode = fdentry(file)->d_inode;
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_root *new_root;
 	struct btrfs_dir_item *di;
@@ -2900,11 +2969,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 		goto out;
 	}
 
-	if (btrfs_root_refs(&new_root->root_item) == 0) {
-		ret = -ENOENT;
-		goto out;
-	}
-
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
@@ -2959,7 +3023,7 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
 	}
 }
 
-long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 {
 	struct btrfs_ioctl_space_args space_args;
 	struct btrfs_ioctl_space_info space;
@@ -3087,7 +3151,7 @@ out:
  */
 long btrfs_ioctl_trans_end(struct file *file)
 {
-	struct inode *inode = fdentry(file)->d_inode;
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 
@@ -3111,7 +3175,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
 	u64 transid;
 	int ret;
 
-	trans = btrfs_attach_transaction(root);
+	trans = btrfs_attach_transaction_barrier(root);
 	if (IS_ERR(trans)) {
 		if (PTR_ERR(trans) != -ENOENT)
 			return PTR_ERR(trans);
@@ -3149,7 +3213,7 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
 
 static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
 {
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 	struct btrfs_ioctl_scrub_args *sa;
 	int ret;
 
@@ -3289,7 +3353,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
 	struct inode_fs_paths *ipath = NULL;
 	struct btrfs_path *path;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_DAC_READ_SEARCH))
 		return -EPERM;
 
 	path = btrfs_alloc_path();
@@ -3440,7 +3504,7 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
 
 static long btrfs_ioctl_balance(struct file *file, void __user *arg)
 {
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_ioctl_balance_args *bargs;
 	struct btrfs_balance_control *bctl;
@@ -3630,7 +3694,7 @@ out:
 
 static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
 {
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 	struct btrfs_ioctl_quota_ctl_args *sa;
 	struct btrfs_trans_handle *trans = NULL;
 	int ret;
@@ -3649,12 +3713,11 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
 		goto drop_write;
 	}
 
-	if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
-		trans = btrfs_start_transaction(root, 2);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
-			goto out;
-		}
+	down_write(&root->fs_info->subvol_sem);
+	trans = btrfs_start_transaction(root->fs_info->tree_root, 2);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
 	}
 
 	switch (sa->cmd) {
@@ -3664,24 +3727,17 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
 	case BTRFS_QUOTA_CTL_DISABLE:
 		ret = btrfs_quota_disable(trans, root->fs_info);
 		break;
-	case BTRFS_QUOTA_CTL_RESCAN:
-		ret = btrfs_quota_rescan(root->fs_info);
-		break;
 	default:
 		ret = -EINVAL;
 		break;
 	}
 
-	if (copy_to_user(arg, sa, sizeof(*sa)))
-		ret = -EFAULT;
-
-	if (trans) {
-		err = btrfs_commit_transaction(trans, root);
-		if (err && !ret)
-			ret = err;
-	}
+	err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
+	if (err && !ret)
+		ret = err;
 out:
 	kfree(sa);
+	up_write(&root->fs_info->subvol_sem);
 drop_write:
 	mnt_drop_write_file(file);
 	return ret;
@@ -3689,7 +3745,7 @@ drop_write:
 
 static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
 {
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 	struct btrfs_ioctl_qgroup_assign_args *sa;
 	struct btrfs_trans_handle *trans;
 	int ret;
@@ -3736,7 +3792,7 @@ drop_write:
 
 static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
 {
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 	struct btrfs_ioctl_qgroup_create_args *sa;
 	struct btrfs_trans_handle *trans;
 	int ret;
@@ -3787,7 +3843,7 @@ drop_write:
 
 static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
 {
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 	struct btrfs_ioctl_qgroup_limit_args *sa;
 	struct btrfs_trans_handle *trans;
 	int ret;
@@ -3833,11 +3889,79 @@ drop_write:
 	return ret;
 }
 
+static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_ioctl_quota_rescan_args *qsa;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	qsa = memdup_user(arg, sizeof(*qsa));
+	if (IS_ERR(qsa)) {
+		ret = PTR_ERR(qsa);
+		goto drop_write;
+	}
+
+	if (qsa->flags) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = btrfs_qgroup_rescan(root->fs_info);
+
+out:
+	kfree(qsa);
+drop_write:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_ioctl_quota_rescan_args *qsa;
+	int ret = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	qsa = kzalloc(sizeof(*qsa), GFP_NOFS);
+	if (!qsa)
+		return -ENOMEM;
+
+	if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+		qsa->flags = 1;
+		qsa->progress = root->fs_info->qgroup_rescan_progress.objectid;
+	}
+
+	if (copy_to_user(arg, qsa, sizeof(*qsa)))
+		ret = -EFAULT;
+
+	kfree(qsa);
+	return ret;
+}
+
+static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btrfs_qgroup_wait_for_completion(root->fs_info);
+}
+
 static long btrfs_ioctl_set_received_subvol(struct file *file,
 					    void __user *arg)
 {
 	struct btrfs_ioctl_received_subvol_args *sa = NULL;
-	struct inode *inode = fdentry(file)->d_inode;
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_root_item *root_item = &root->root_item;
 	struct btrfs_trans_handle *trans;
@@ -3914,10 +4038,69 @@ out:
 	return ret;
 }
 
+static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	const char *label = root->fs_info->super_copy->label;
+	size_t len = strnlen(label, BTRFS_LABEL_SIZE);
+	int ret;
+
+	if (len == BTRFS_LABEL_SIZE) {
+		pr_warn("btrfs: label is too long, return the first %zu bytes\n",
+			--len);
+	}
+
+	mutex_lock(&root->fs_info->volume_mutex);
+	ret = copy_to_user(arg, label, len);
+	mutex_unlock(&root->fs_info->volume_mutex);
+
+	return ret ? -EFAULT : 0;
+}
+
+static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+	struct btrfs_super_block *super_block = root->fs_info->super_copy;
+	struct btrfs_trans_handle *trans;
+	char label[BTRFS_LABEL_SIZE];
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(label, arg, sizeof(label)))
+		return -EFAULT;
+
+	if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
+		pr_err("btrfs: unable to set label with more than %d bytes\n",
+		       BTRFS_LABEL_SIZE - 1);
+		return -EINVAL;
+	}
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	mutex_lock(&root->fs_info->volume_mutex);
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_unlock;
+	}
+
+	strcpy(super_block->label, label);
+	ret = btrfs_end_transaction(trans, root);
+
+out_unlock:
+	mutex_unlock(&root->fs_info->volume_mutex);
+	mnt_drop_write_file(file);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
 	void __user *argp = (void __user *)arg;
 
 	switch (cmd) {
@@ -4012,8 +4195,18 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_qgroup_create(file, argp);
 	case BTRFS_IOC_QGROUP_LIMIT:
 		return btrfs_ioctl_qgroup_limit(file, argp);
+	case BTRFS_IOC_QUOTA_RESCAN:
+		return btrfs_ioctl_quota_rescan(file, argp);
+	case BTRFS_IOC_QUOTA_RESCAN_STATUS:
+		return btrfs_ioctl_quota_rescan_status(file, argp);
+	case BTRFS_IOC_QUOTA_RESCAN_WAIT:
+		return btrfs_ioctl_quota_rescan_wait(file, argp);
 	case BTRFS_IOC_DEV_REPLACE:
 		return btrfs_ioctl_dev_replace(root, argp);
+	case BTRFS_IOC_GET_FSLABEL:
+		return btrfs_ioctl_get_fslabel(file, argp);
+	case BTRFS_IOC_SET_FSLABEL:
+		return btrfs_ioctl_set_fslabel(file, argp);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
deleted file mode 100644
index dabca9cc8c2..00000000000
--- a/fs/btrfs/ioctl.h
+++ /dev/null
@@ -1,502 +0,0 @@
-/*
- * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef __IOCTL_
-#define __IOCTL_
-#include <linux/ioctl.h>
-
-#define BTRFS_IOCTL_MAGIC 0x94
-#define BTRFS_VOL_NAME_MAX 255
-
-/* this should be 4k */
-#define BTRFS_PATH_NAME_MAX 4087
-struct btrfs_ioctl_vol_args {
-	__s64 fd;
-	char name[BTRFS_PATH_NAME_MAX + 1];
-};
-
-#define BTRFS_DEVICE_PATH_NAME_MAX 1024
-
-#define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0)
-#define BTRFS_SUBVOL_RDONLY		(1ULL << 1)
-#define BTRFS_SUBVOL_QGROUP_INHERIT	(1ULL << 2)
-#define BTRFS_FSID_SIZE 16
-#define BTRFS_UUID_SIZE 16
-
-#define BTRFS_QGROUP_INHERIT_SET_LIMITS	(1ULL << 0)
-
-struct btrfs_qgroup_limit {
-	__u64	flags;
-	__u64	max_rfer;
-	__u64	max_excl;
-	__u64	rsv_rfer;
-	__u64	rsv_excl;
-};
-
-struct btrfs_qgroup_inherit {
-	__u64	flags;
-	__u64	num_qgroups;
-	__u64	num_ref_copies;
-	__u64	num_excl_copies;
-	struct btrfs_qgroup_limit lim;
-	__u64	qgroups[0];
-};
-
-struct btrfs_ioctl_qgroup_limit_args {
-	__u64	qgroupid;
-	struct btrfs_qgroup_limit lim;
-};
-
-#define BTRFS_SUBVOL_NAME_MAX 4039
-struct btrfs_ioctl_vol_args_v2 {
-	__s64 fd;
-	__u64 transid;
-	__u64 flags;
-	union {
-		struct {
-			__u64 size;
-			struct btrfs_qgroup_inherit __user *qgroup_inherit;
-		};
-		__u64 unused[4];
-	};
-	char name[BTRFS_SUBVOL_NAME_MAX + 1];
-};
-
-/*
- * structure to report errors and progress to userspace, either as a
- * result of a finished scrub, a canceled scrub or a progress inquiry
- */
-struct btrfs_scrub_progress {
-	__u64 data_extents_scrubbed;	/* # of data extents scrubbed */
-	__u64 tree_extents_scrubbed;	/* # of tree extents scrubbed */
-	__u64 data_bytes_scrubbed;	/* # of data bytes scrubbed */
-	__u64 tree_bytes_scrubbed;	/* # of tree bytes scrubbed */
-	__u64 read_errors;		/* # of read errors encountered (EIO) */
-	__u64 csum_errors;		/* # of failed csum checks */
-	__u64 verify_errors;		/* # of occurences, where the metadata
-					 * of a tree block did not match the
-					 * expected values, like generation or
-					 * logical */
-	__u64 no_csum;			/* # of 4k data block for which no csum
-					 * is present, probably the result of
-					 * data written with nodatasum */
-	__u64 csum_discards;		/* # of csum for which no data was found
-					 * in the extent tree. */
-	__u64 super_errors;		/* # of bad super blocks encountered */
-	__u64 malloc_errors;		/* # of internal kmalloc errors. These
-					 * will likely cause an incomplete
-					 * scrub */
-	__u64 uncorrectable_errors;	/* # of errors where either no intact
-					 * copy was found or the writeback
-					 * failed */
-	__u64 corrected_errors;		/* # of errors corrected */
-	__u64 last_physical;		/* last physical address scrubbed. In
-					 * case a scrub was aborted, this can
-					 * be used to restart the scrub */
-	__u64 unverified_errors;	/* # of occurences where a read for a
-					 * full (64k) bio failed, but the re-
-					 * check succeeded for each 4k piece.
-					 * Intermittent error. */
-};
-
-#define BTRFS_SCRUB_READONLY	1
-struct btrfs_ioctl_scrub_args {
-	__u64 devid;				/* in */
-	__u64 start;				/* in */
-	__u64 end;				/* in */
-	__u64 flags;				/* in */
-	struct btrfs_scrub_progress progress;	/* out */
-	/* pad to 1k */
-	__u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
-};
-
-#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS	0
-#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID	1
-struct btrfs_ioctl_dev_replace_start_params {
-	__u64 srcdevid;	/* in, if 0, use srcdev_name instead */
-	__u64 cont_reading_from_srcdev_mode;	/* in, see #define
-						 * above */
-	__u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1];	/* in */
-	__u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1];	/* in */
-};
-
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED	0
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED		1
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED		2
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED		3
-#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED		4
-struct btrfs_ioctl_dev_replace_status_params {
-	__u64 replace_state;	/* out, see #define above */
-	__u64 progress_1000;	/* out, 0 <= x <= 1000 */
-	__u64 time_started;	/* out, seconds since 1-Jan-1970 */
-	__u64 time_stopped;	/* out, seconds since 1-Jan-1970 */
-	__u64 num_write_errors;	/* out */
-	__u64 num_uncorrectable_read_errors;	/* out */
-};
-
-#define BTRFS_IOCTL_DEV_REPLACE_CMD_START			0
-#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS			1
-#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL			2
-#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR			0
-#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED		1
-#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED		2
-struct btrfs_ioctl_dev_replace_args {
-	__u64 cmd;	/* in */
-	__u64 result;	/* out */
-
-	union {
-		struct btrfs_ioctl_dev_replace_start_params start;
-		struct btrfs_ioctl_dev_replace_status_params status;
-	};	/* in/out */
-
-	__u64 spare[64];
-};
-
-struct btrfs_ioctl_dev_info_args {
-	__u64 devid;				/* in/out */
-	__u8 uuid[BTRFS_UUID_SIZE];		/* in/out */
-	__u64 bytes_used;			/* out */
-	__u64 total_bytes;			/* out */
-	__u64 unused[379];			/* pad to 4k */
-	__u8 path[BTRFS_DEVICE_PATH_NAME_MAX];	/* out */
-};
-
-struct btrfs_ioctl_fs_info_args {
-	__u64 max_id;				/* out */
-	__u64 num_devices;			/* out */
-	__u8 fsid[BTRFS_FSID_SIZE];		/* out */
-	__u64 reserved[124];			/* pad to 1k */
-};
-
-/* balance control ioctl modes */
-#define BTRFS_BALANCE_CTL_PAUSE		1
-#define BTRFS_BALANCE_CTL_CANCEL	2
-
-/*
- * this is packed, because it should be exactly the same as its disk
- * byte order counterpart (struct btrfs_disk_balance_args)
- */
-struct btrfs_balance_args {
-	__u64 profiles;
-	__u64 usage;
-	__u64 devid;
-	__u64 pstart;
-	__u64 pend;
-	__u64 vstart;
-	__u64 vend;
-
-	__u64 target;
-
-	__u64 flags;
-
-	__u64 unused[8];
-} __attribute__ ((__packed__));
-
-/* report balance progress to userspace */
-struct btrfs_balance_progress {
-	__u64 expected;		/* estimated # of chunks that will be
-				 * relocated to fulfill the request */
-	__u64 considered;	/* # of chunks we have considered so far */
-	__u64 completed;	/* # of chunks relocated so far */
-};
-
-#define BTRFS_BALANCE_STATE_RUNNING	(1ULL << 0)
-#define BTRFS_BALANCE_STATE_PAUSE_REQ	(1ULL << 1)
-#define BTRFS_BALANCE_STATE_CANCEL_REQ	(1ULL << 2)
-
-struct btrfs_ioctl_balance_args {
-	__u64 flags;				/* in/out */
-	__u64 state;				/* out */
-
-	struct btrfs_balance_args data;		/* in/out */
-	struct btrfs_balance_args meta;		/* in/out */
-	struct btrfs_balance_args sys;		/* in/out */
-
-	struct btrfs_balance_progress stat;	/* out */
-
-	__u64 unused[72];			/* pad to 1k */
-};
-
-#define BTRFS_INO_LOOKUP_PATH_MAX 4080
-struct btrfs_ioctl_ino_lookup_args {
-	__u64 treeid;
-	__u64 objectid;
-	char name[BTRFS_INO_LOOKUP_PATH_MAX];
-};
-
-struct btrfs_ioctl_search_key {
-	/* which root are we searching.  0 is the tree of tree roots */
-	__u64 tree_id;
-
-	/* keys returned will be >= min and <= max */
-	__u64 min_objectid;
-	__u64 max_objectid;
-
-	/* keys returned will be >= min and <= max */
-	__u64 min_offset;
-	__u64 max_offset;
-
-	/* max and min transids to search for */
-	__u64 min_transid;
-	__u64 max_transid;
-
-	/* keys returned will be >= min and <= max */
-	__u32 min_type;
-	__u32 max_type;
-
-	/*
-	 * how many items did userland ask for, and how many are we
-	 * returning
-	 */
-	__u32 nr_items;
-
-	/* align to 64 bits */
-	__u32 unused;
-
-	/* some extra for later */
-	__u64 unused1;
-	__u64 unused2;
-	__u64 unused3;
-	__u64 unused4;
-};
-
-struct btrfs_ioctl_search_header {
-	__u64 transid;
-	__u64 objectid;
-	__u64 offset;
-	__u32 type;
-	__u32 len;
-};
-
-#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
-/*
- * the buf is an array of search headers where
- * each header is followed by the actual item
- * the type field is expanded to 32 bits for alignment
- */
-struct btrfs_ioctl_search_args {
-	struct btrfs_ioctl_search_key key;
-	char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
-};
-
-struct btrfs_ioctl_clone_range_args {
-  __s64 src_fd;
-  __u64 src_offset, src_length;
-  __u64 dest_offset;
-};
-
-/* flags for the defrag range ioctl */
-#define BTRFS_DEFRAG_RANGE_COMPRESS 1
-#define BTRFS_DEFRAG_RANGE_START_IO 2
-
-struct btrfs_ioctl_space_info {
-	__u64 flags;
-	__u64 total_bytes;
-	__u64 used_bytes;
-};
-
-struct btrfs_ioctl_space_args {
-	__u64 space_slots;
-	__u64 total_spaces;
-	struct btrfs_ioctl_space_info spaces[0];
-};
-
-struct btrfs_data_container {
-	__u32	bytes_left;	/* out -- bytes not needed to deliver output */
-	__u32	bytes_missing;	/* out -- additional bytes needed for result */
-	__u32	elem_cnt;	/* out */
-	__u32	elem_missed;	/* out */
-	__u64	val[0];		/* out */
-};
-
-struct btrfs_ioctl_ino_path_args {
-	__u64				inum;		/* in */
-	__u64				size;		/* in */
-	__u64				reserved[4];
-	/* struct btrfs_data_container	*fspath;	   out */
-	__u64				fspath;		/* out */
-};
-
-struct btrfs_ioctl_logical_ino_args {
-	__u64				logical;	/* in */
-	__u64				size;		/* in */
-	__u64				reserved[4];
-	/* struct btrfs_data_container	*inodes;	out   */
-	__u64				inodes;
-};
-
-enum btrfs_dev_stat_values {
-	/* disk I/O failure stats */
-	BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */
-	BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */
-	BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */
-
-	/* stats for indirect indications for I/O failures */
-	BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or
-					 * contents is illegal: this is an
-					 * indication that the block was damaged
-					 * during read or write, or written to
-					 * wrong location or read from wrong
-					 * location */
-	BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not
-					 * been written */
-
-	BTRFS_DEV_STAT_VALUES_MAX
-};
-
-/* Reset statistics after reading; needs SYS_ADMIN capability */
-#define	BTRFS_DEV_STATS_RESET		(1ULL << 0)
-
-struct btrfs_ioctl_get_dev_stats {
-	__u64 devid;				/* in */
-	__u64 nr_items;				/* in/out */
-	__u64 flags;				/* in/out */
-
-	/* out values: */
-	__u64 values[BTRFS_DEV_STAT_VALUES_MAX];
-
-	__u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
-};
-
-#define BTRFS_QUOTA_CTL_ENABLE	1
-#define BTRFS_QUOTA_CTL_DISABLE	2
-#define BTRFS_QUOTA_CTL_RESCAN	3
-struct btrfs_ioctl_quota_ctl_args {
-	__u64 cmd;
-	__u64 status;
-};
-
-struct btrfs_ioctl_qgroup_assign_args {
-	__u64 assign;
-	__u64 src;
-	__u64 dst;
-};
-
-struct btrfs_ioctl_qgroup_create_args {
-	__u64 create;
-	__u64 qgroupid;
-};
-struct btrfs_ioctl_timespec {
-	__u64 sec;
-	__u32 nsec;
-};
-
-struct btrfs_ioctl_received_subvol_args {
-	char	uuid[BTRFS_UUID_SIZE];	/* in */
-	__u64	stransid;		/* in */
-	__u64	rtransid;		/* out */
-	struct btrfs_ioctl_timespec stime; /* in */
-	struct btrfs_ioctl_timespec rtime; /* out */
-	__u64	flags;			/* in */
-	__u64	reserved[16];		/* in */
-};
-
-struct btrfs_ioctl_send_args {
-	__s64 send_fd;			/* in */
-	__u64 clone_sources_count;	/* in */
-	__u64 __user *clone_sources;	/* in */
-	__u64 parent_root;		/* in */
-	__u64 flags;			/* in */
-	__u64 reserved[4];		/* in */
-};
-
-#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
-				   struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
-				   struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
-				   struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
-				   struct btrfs_ioctl_vol_args)
-/* trans start and trans end are dangerous, and only for
- * use by applications that know how to avoid the
- * resulting deadlocks
- */
-#define BTRFS_IOC_TRANS_START  _IO(BTRFS_IOCTL_MAGIC, 6)
-#define BTRFS_IOC_TRANS_END    _IO(BTRFS_IOCTL_MAGIC, 7)
-#define BTRFS_IOC_SYNC         _IO(BTRFS_IOCTL_MAGIC, 8)
-
-#define BTRFS_IOC_CLONE        _IOW(BTRFS_IOCTL_MAGIC, 9, int)
-#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
-				   struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
-				   struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
-				   struct btrfs_ioctl_vol_args)
-
-#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
-				  struct btrfs_ioctl_clone_range_args)
-
-#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
-				   struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
-				struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
-				struct btrfs_ioctl_defrag_range_args)
-#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
-				   struct btrfs_ioctl_search_args)
-#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
-				   struct btrfs_ioctl_ino_lookup_args)
-#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
-#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
-				    struct btrfs_ioctl_space_args)
-#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
-#define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
-#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
-				   struct btrfs_ioctl_vol_args_v2)
-#define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \
-				   struct btrfs_ioctl_vol_args_v2)
-#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
-#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
-#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
-			      struct btrfs_ioctl_scrub_args)
-#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28)
-#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \
-				       struct btrfs_ioctl_scrub_args)
-#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \
-				 struct btrfs_ioctl_dev_info_args)
-#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
-			       struct btrfs_ioctl_fs_info_args)
-#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
-				   struct btrfs_ioctl_balance_args)
-#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
-#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
-					struct btrfs_ioctl_balance_args)
-#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
-					struct btrfs_ioctl_ino_path_args)
-#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
-					struct btrfs_ioctl_ino_path_args)
-#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
-				struct btrfs_ioctl_received_subvol_args)
-#define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args)
-#define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \
-				     struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \
-			       struct btrfs_ioctl_quota_ctl_args)
-#define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \
-			       struct btrfs_ioctl_qgroup_assign_args)
-#define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \
-			       struct btrfs_ioctl_qgroup_create_args)
-#define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \
-			       struct btrfs_ioctl_qgroup_limit_args)
-#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
-				      struct btrfs_ioctl_get_dev_stats)
-#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
-				    struct btrfs_ioctl_dev_replace_args)
-
-#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 2a1762c6604..01277b8f237 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -24,7 +24,7 @@
 #include "extent_io.h"
 #include "locking.h"
 
-void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
 
 /*
  * if we currently have a spinning reader or writer lock
@@ -113,11 +113,10 @@ again:
 		read_unlock(&eb->lock);
 		return;
 	}
-	read_unlock(&eb->lock);
-	wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
-	read_lock(&eb->lock);
 	if (atomic_read(&eb->blocking_writers)) {
 		read_unlock(&eb->lock);
+		wait_event(eb->write_lock_wq,
+			   atomic_read(&eb->blocking_writers) == 0);
 		goto again;
 	}
 	atomic_inc(&eb->read_locks);
@@ -265,7 +264,7 @@ void btrfs_assert_tree_locked(struct extent_buffer *eb)
 	BUG_ON(!atomic_read(&eb->write_locks));
 }
 
-void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
 {
 	BUG_ON(!atomic_read(&eb->read_locks));
 }
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index ca52681e5f4..b81e0e9a489 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -26,7 +26,6 @@
 
 void btrfs_tree_lock(struct extent_buffer *eb);
 void btrfs_tree_unlock(struct extent_buffer *eb);
-int btrfs_try_spin_lock(struct extent_buffer *eb);
 
 void btrfs_tree_read_lock(struct extent_buffer *eb);
 void btrfs_tree_read_unlock(struct extent_buffer *eb);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 743b86fa4fc..f93151a9888 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -31,8 +31,8 @@
 
 struct workspace {
 	void *mem;
-	void *buf;	/* where compressed data goes */
-	void *cbuf;	/* where decompressed data goes */
+	void *buf;	/* where decompressed data goes */
+	void *cbuf;	/* where compressed data goes */
 	struct list_head list;
 };
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e5ed5672960..81369827e51 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -24,6 +24,7 @@
 #include "transaction.h"
 #include "btrfs_inode.h"
 #include "extent_io.h"
+#include "disk-io.h"
 
 static struct kmem_cache *btrfs_ordered_extent_cache;
 
@@ -184,6 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 				      u64 start, u64 len, u64 disk_len,
 				      int type, int dio, int compress_type)
 {
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry;
@@ -196,6 +198,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->file_offset = file_offset;
 	entry->start = start;
 	entry->len = len;
+	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
+	    !(type == BTRFS_ORDERED_NOCOW))
+		entry->csum_bytes_left = disk_len;
 	entry->disk_len = disk_len;
 	entry->bytes_left = len;
 	entry->inode = igrab(inode);
@@ -213,6 +218,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	INIT_LIST_HEAD(&entry->root_extent_list);
 	INIT_LIST_HEAD(&entry->work_list);
 	init_completion(&entry->completion);
+	INIT_LIST_HEAD(&entry->log_list);
 
 	trace_btrfs_ordered_extent_add(inode, entry);
 
@@ -223,10 +229,18 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 		ordered_data_tree_panic(inode, -EEXIST, file_offset);
 	spin_unlock_irq(&tree->lock);
 
-	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+	spin_lock(&root->ordered_extent_lock);
 	list_add_tail(&entry->root_extent_list,
-		      &BTRFS_I(inode)->root->fs_info->ordered_extents);
-	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+		      &root->ordered_extents);
+	root->nr_ordered_extents++;
+	if (root->nr_ordered_extents == 1) {
+		spin_lock(&root->fs_info->ordered_root_lock);
+		BUG_ON(!list_empty(&root->ordered_root));
+		list_add_tail(&root->ordered_root,
+			      &root->fs_info->ordered_roots);
+		spin_unlock(&root->fs_info->ordered_root_lock);
+	}
+	spin_unlock(&root->ordered_extent_lock);
 
 	return 0;
 }
@@ -270,6 +284,10 @@ void btrfs_add_ordered_sum(struct inode *inode,
 	tree = &BTRFS_I(inode)->ordered_tree;
 	spin_lock_irq(&tree->lock);
 	list_add_tail(&sum->list, &entry->list);
+	WARN_ON(entry->csum_bytes_left < sum->len);
+	entry->csum_bytes_left -= sum->len;
+	if (entry->csum_bytes_left == 0)
+		wake_up(&entry->wait);
 	spin_unlock_irq(&tree->lock);
 }
 
@@ -405,6 +423,66 @@ out:
 	return ret == 0;
 }
 
+/* Needs to either be called under a log transaction or the log_mutex */
+void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct btrfs_ordered_extent *ordered;
+	struct rb_node *n;
+	int index = log->log_transid % 2;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	spin_lock_irq(&tree->lock);
+	for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+		spin_lock(&log->log_extents_lock[index]);
+		if (list_empty(&ordered->log_list)) {
+			list_add_tail(&ordered->log_list, &log->logged_list[index]);
+			atomic_inc(&ordered->refs);
+		}
+		spin_unlock(&log->log_extents_lock[index]);
+	}
+	spin_unlock_irq(&tree->lock);
+}
+
+void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
+{
+	struct btrfs_ordered_extent *ordered;
+	int index = transid % 2;
+
+	spin_lock_irq(&log->log_extents_lock[index]);
+	while (!list_empty(&log->logged_list[index])) {
+		ordered = list_first_entry(&log->logged_list[index],
+					   struct btrfs_ordered_extent,
+					   log_list);
+		list_del_init(&ordered->log_list);
+		spin_unlock_irq(&log->log_extents_lock[index]);
+		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
+						   &ordered->flags));
+		btrfs_put_ordered_extent(ordered);
+		spin_lock_irq(&log->log_extents_lock[index]);
+	}
+	spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
+{
+	struct btrfs_ordered_extent *ordered;
+	int index = transid % 2;
+
+	spin_lock_irq(&log->log_extents_lock[index]);
+	while (!list_empty(&log->logged_list[index])) {
+		ordered = list_first_entry(&log->logged_list[index],
+					   struct btrfs_ordered_extent,
+					   log_list);
+		list_del_init(&ordered->log_list);
+		spin_unlock_irq(&log->log_extents_lock[index]);
+		btrfs_put_ordered_extent(ordered);
+		spin_lock_irq(&log->log_extents_lock[index]);
+	}
+	spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
 /*
  * used to drop a reference on an ordered extent.  This will free
  * the extent if the last reference is dropped
@@ -448,8 +526,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
 	spin_unlock_irq(&tree->lock);
 
-	spin_lock(&root->fs_info->ordered_extent_lock);
+	spin_lock(&root->ordered_extent_lock);
 	list_del_init(&entry->root_extent_list);
+	root->nr_ordered_extents--;
 
 	trace_btrfs_ordered_extent_remove(inode, entry);
 
@@ -462,7 +541,14 @@ void btrfs_remove_ordered_extent(struct inode *inode,
 	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
 		list_del_init(&BTRFS_I(inode)->ordered_operations);
 	}
-	spin_unlock(&root->fs_info->ordered_extent_lock);
+
+	if (!root->nr_ordered_extents) {
+		spin_lock(&root->fs_info->ordered_root_lock);
+		BUG_ON(list_empty(&root->ordered_root));
+		list_del_init(&root->ordered_root);
+		spin_unlock(&root->fs_info->ordered_root_lock);
+	}
+	spin_unlock(&root->ordered_extent_lock);
 	wake_up(&entry->wait);
 }
 
@@ -482,42 +568,41 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
 void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
 {
 	struct list_head splice, works;
-	struct list_head *cur;
 	struct btrfs_ordered_extent *ordered, *next;
 	struct inode *inode;
 
 	INIT_LIST_HEAD(&splice);
 	INIT_LIST_HEAD(&works);
 
-	spin_lock(&root->fs_info->ordered_extent_lock);
-	list_splice_init(&root->fs_info->ordered_extents, &splice);
+	mutex_lock(&root->fs_info->ordered_operations_mutex);
+	spin_lock(&root->ordered_extent_lock);
+	list_splice_init(&root->ordered_extents, &splice);
 	while (!list_empty(&splice)) {
-		cur = splice.next;
-		ordered = list_entry(cur, struct btrfs_ordered_extent,
-				     root_extent_list);
-		list_del_init(&ordered->root_extent_list);
-		atomic_inc(&ordered->refs);
-
+		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
+					   root_extent_list);
+		list_move_tail(&ordered->root_extent_list,
+			       &root->ordered_extents);
 		/*
 		 * the inode may be getting freed (in sys_unlink path).
 		 */
 		inode = igrab(ordered->inode);
+		if (!inode) {
+			cond_resched_lock(&root->ordered_extent_lock);
+			continue;
+		}
 
-		spin_unlock(&root->fs_info->ordered_extent_lock);
+		atomic_inc(&ordered->refs);
+		spin_unlock(&root->ordered_extent_lock);
 
-		if (inode) {
-			ordered->flush_work.func = btrfs_run_ordered_extent_work;
-			list_add_tail(&ordered->work_list, &works);
-			btrfs_queue_worker(&root->fs_info->flush_workers,
-					   &ordered->flush_work);
-		} else {
-			btrfs_put_ordered_extent(ordered);
-		}
+		ordered->flush_work.func = btrfs_run_ordered_extent_work;
+		list_add_tail(&ordered->work_list, &works);
+		btrfs_queue_worker(&root->fs_info->flush_workers,
+				   &ordered->flush_work);
 
 		cond_resched();
-		spin_lock(&root->fs_info->ordered_extent_lock);
+		spin_lock(&root->ordered_extent_lock);
 	}
-	spin_unlock(&root->fs_info->ordered_extent_lock);
+	spin_unlock(&root->ordered_extent_lock);
 
 	list_for_each_entry_safe(ordered, next, &works, work_list) {
 		list_del_init(&ordered->work_list);
@@ -532,6 +617,34 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
 
 		cond_resched();
 	}
+	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+}
+
+void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
+				    int delay_iput)
+{
+	struct btrfs_root *root;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&fs_info->ordered_root_lock);
+	list_splice_init(&fs_info->ordered_roots, &splice);
+	while (!list_empty(&splice)) {
+		root = list_first_entry(&splice, struct btrfs_root,
+					ordered_root);
+		root = btrfs_grab_fs_root(root);
+		BUG_ON(!root);
+		list_move_tail(&root->ordered_root,
+			       &fs_info->ordered_roots);
+		spin_unlock(&fs_info->ordered_root_lock);
+
+		btrfs_wait_ordered_extents(root, delay_iput);
+		btrfs_put_fs_root(root);
+
+		spin_lock(&fs_info->ordered_root_lock);
+	}
+	spin_unlock(&fs_info->ordered_root_lock);
 }
 
 /*
@@ -544,10 +657,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
  * extra check to make sure the ordered operation list really is empty
  * before we return
  */
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root, int wait)
 {
 	struct btrfs_inode *btrfs_inode;
 	struct inode *inode;
+	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct list_head splice;
 	struct list_head works;
 	struct btrfs_delalloc_work *work, *next;
@@ -557,15 +672,11 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
 	INIT_LIST_HEAD(&works);
 
 	mutex_lock(&root->fs_info->ordered_operations_mutex);
-	spin_lock(&root->fs_info->ordered_extent_lock);
-again:
-	list_splice_init(&root->fs_info->ordered_operations, &splice);
-
+	spin_lock(&root->fs_info->ordered_root_lock);
+	list_splice_init(&cur_trans->ordered_operations, &splice);
 	while (!list_empty(&splice)) {
-
 		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
 				   ordered_operations);
-
 		inode = &btrfs_inode->vfs_inode;
 
 		list_del_init(&btrfs_inode->ordered_operations);
@@ -574,25 +685,23 @@ again:
 		 * the inode may be getting freed (in sys_unlink path).
 		 */
 		inode = igrab(inode);
-
-		if (!wait && inode) {
-			list_add_tail(&BTRFS_I(inode)->ordered_operations,
-			      &root->fs_info->ordered_operations);
-		}
-
 		if (!inode)
 			continue;
-		spin_unlock(&root->fs_info->ordered_extent_lock);
+
+		if (!wait)
+			list_add_tail(&BTRFS_I(inode)->ordered_operations,
+				      &cur_trans->ordered_operations);
+		spin_unlock(&root->fs_info->ordered_root_lock);
 
 		work = btrfs_alloc_delalloc_work(inode, wait, 1);
 		if (!work) {
+			spin_lock(&root->fs_info->ordered_root_lock);
 			if (list_empty(&BTRFS_I(inode)->ordered_operations))
 				list_add_tail(&btrfs_inode->ordered_operations,
 					      &splice);
-			spin_lock(&root->fs_info->ordered_extent_lock);
 			list_splice_tail(&splice,
-					 &root->fs_info->ordered_operations);
-			spin_unlock(&root->fs_info->ordered_extent_lock);
+					 &cur_trans->ordered_operations);
+			spin_unlock(&root->fs_info->ordered_root_lock);
 			ret = -ENOMEM;
 			goto out;
 		}
@@ -601,12 +710,9 @@ again:
 				   &work->work);
 
 		cond_resched();
-		spin_lock(&root->fs_info->ordered_extent_lock);
+		spin_lock(&root->fs_info->ordered_root_lock);
 	}
-	if (wait && !list_empty(&root->fs_info->ordered_operations))
-		goto again;
-
-	spin_unlock(&root->fs_info->ordered_extent_lock);
+	spin_unlock(&root->fs_info->ordered_root_lock);
 out:
 	list_for_each_entry_safe(work, next, &works, list) {
 		list_del_init(&work->list);
@@ -923,39 +1029,42 @@ out:
  * be reclaimed before their checksum is actually put into the btree
  */
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
-			   u32 *sum)
+			   u32 *sum, int len)
 {
 	struct btrfs_ordered_sum *ordered_sum;
-	struct btrfs_sector_sum *sector_sums;
 	struct btrfs_ordered_extent *ordered;
 	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
 	unsigned long num_sectors;
 	unsigned long i;
 	u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
-	int ret = 1;
+	int index = 0;
 
 	ordered = btrfs_lookup_ordered_extent(inode, offset);
 	if (!ordered)
-		return 1;
+		return 0;
 
 	spin_lock_irq(&tree->lock);
 	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
-		if (disk_bytenr >= ordered_sum->bytenr) {
-			num_sectors = ordered_sum->len / sectorsize;
-			sector_sums = ordered_sum->sums;
-			for (i = 0; i < num_sectors; i++) {
-				if (sector_sums[i].bytenr == disk_bytenr) {
-					*sum = sector_sums[i].sum;
-					ret = 0;
-					goto out;
-				}
-			}
+		if (disk_bytenr >= ordered_sum->bytenr &&
+		    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
+			i = (disk_bytenr - ordered_sum->bytenr) >>
+			    inode->i_sb->s_blocksize_bits;
+			num_sectors = ordered_sum->len >>
+				      inode->i_sb->s_blocksize_bits;
+			num_sectors = min_t(int, len - index, num_sectors - i);
+			memcpy(sum + index, ordered_sum->sums + i,
+			       num_sectors);
+
+			index += (int)num_sectors;
+			if (index == len)
+				goto out;
+			disk_bytenr += num_sectors * sectorsize;
 		}
 	}
 out:
 	spin_unlock_irq(&tree->lock);
 	btrfs_put_ordered_extent(ordered);
-	return ret;
+	return index;
 }
 
 
@@ -974,6 +1083,7 @@ out:
 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root, struct inode *inode)
 {
+	struct btrfs_transaction *cur_trans = trans->transaction;
 	u64 last_mod;
 
 	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
@@ -985,12 +1095,12 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 	if (last_mod < root->fs_info->last_trans_committed)
 		return;
 
-	spin_lock(&root->fs_info->ordered_extent_lock);
+	spin_lock(&root->fs_info->ordered_root_lock);
 	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
 		list_add_tail(&BTRFS_I(inode)->ordered_operations,
-			      &root->fs_info->ordered_operations);
+			      &cur_trans->ordered_operations);
 	}
-	spin_unlock(&root->fs_info->ordered_extent_lock);
+	spin_unlock(&root->fs_info->ordered_root_lock);
 }
 
 int __init ordered_data_init(void)
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f29d4bf5fbe..68844d59ee6 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree {
 	struct rb_node *last;
 };
 
-/*
- * these are used to collect checksums done just before bios submission.
- * They are attached via a list into the ordered extent, and
- * checksum items are inserted into the tree after all the blocks in
- * the ordered extent are on disk
- */
-struct btrfs_sector_sum {
-	/* bytenr on disk */
-	u64 bytenr;
-	u32 sum;
-};
-
 struct btrfs_ordered_sum {
 	/* bytenr is the start of this extent on disk */
 	u64 bytenr;
@@ -45,10 +33,10 @@ struct btrfs_ordered_sum {
 	/*
 	 * this is the length in bytes covered by the sums array below.
 	 */
-	unsigned long len;
+	int len;
 	struct list_head list;
-	/* last field is a variable length array of btrfs_sector_sums */
-	struct btrfs_sector_sum sums[];
+	/* last field is a variable length array of csums */
+	u32 sums[];
 };
 
 /*
@@ -79,6 +67,8 @@ struct btrfs_ordered_sum {
 #define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
 				       * has done its due diligence in updating
 				       * the isize. */
+#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered
+				       ordered extent */
 
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
@@ -96,6 +86,9 @@ struct btrfs_ordered_extent {
 	/* number of bytes that still need writing */
 	u64 bytes_left;
 
+	/* number of bytes that still need csumming */
+	u64 csum_bytes_left;
+
 	/*
 	 * the end of the ordered extent which is behind it but
 	 * didn't update disk_i_size. Please see the comment of
@@ -118,6 +111,9 @@ struct btrfs_ordered_extent {
 	/* list of checksums for insertion when the extent io is done */
 	struct list_head list;
 
+	/* If we need to wait on this to be done */
+	struct list_head log_list;
+
 	/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
 	wait_queue_head_t wait;
 
@@ -141,11 +137,8 @@ struct btrfs_ordered_extent {
 static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
 					 unsigned long bytes)
 {
-	unsigned long num_sectors = (bytes + root->sectorsize - 1) /
-		root->sectorsize;
-	num_sectors++;
-	return sizeof(struct btrfs_ordered_sum) +
-		num_sectors * sizeof(struct btrfs_sector_sum);
+	int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
+	return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
 }
 
 static inline void
@@ -188,12 +181,19 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
 							u64 len);
 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 				struct btrfs_ordered_extent *ordered);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+			   u32 *sum, int len);
+int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root, int wait);
 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct inode *inode);
 void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
+void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
+				    int delay_iput);
+void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
+void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
 int __init ordered_data_init(void);
 void ordered_data_exit(void);
 #endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 50d95fd190a..dc0024f17c1 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -176,7 +176,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 
 	nr = btrfs_header_nritems(l);
 
-	printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
+	btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d",
 		(unsigned long long)btrfs_header_bytenr(l), nr,
 		btrfs_leaf_free_space(root, l));
 	for (i = 0 ; i < nr ; i++) {
@@ -294,6 +294,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			       btrfs_dev_extent_chunk_offset(l, dev_extent),
 			       (unsigned long long)
 			       btrfs_dev_extent_length(l, dev_extent));
+			break;
 		case BTRFS_DEV_STATS_KEY:
 			printk(KERN_INFO "\t\tdevice stats\n");
 			break;
@@ -318,10 +319,9 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 		btrfs_print_leaf(root, c);
 		return;
 	}
-	printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
-	       (unsigned long long)btrfs_header_bytenr(c),
-	      level, nr,
-	       (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
+	btrfs_info(root->fs_info, "node %llu level %d total ptrs %d free spc %u",
+		(unsigned long long)btrfs_header_bytenr(c),
+		level, nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	for (i = 0; i < nr; i++) {
 		btrfs_node_key_to_cpu(c, &key, i);
 		printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index da75efe534d..7faddfacc5b 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -19,5 +19,5 @@
 #ifndef __PRINT_TREE_
 #define __PRINT_TREE_
 void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
-void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c);
 #endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a5c85623432..1280eff8af5 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -23,21 +23,21 @@
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
+#include <linux/btrfs.h>
 
 #include "ctree.h"
 #include "transaction.h"
 #include "disk-io.h"
 #include "locking.h"
 #include "ulist.h"
-#include "ioctl.h"
 #include "backref.h"
+#include "extent_io.h"
 
 /* TODO XXX FIXME
  *  - subvol delete -> delete when ref goes to 0? delete limits also?
  *  - reorganize keys
  *  - compressed
  *  - sync
- *  - rescan
  *  - copy also limits on subvol creation
  *  - limit
  *  - caches fuer ulists
@@ -98,7 +98,12 @@ struct btrfs_qgroup_list {
 	struct btrfs_qgroup *member;
 };
 
-/* must be called with qgroup_lock held */
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+		   int init_flags);
+static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
+
+/* must be called with qgroup_ioctl_lock held */
 static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
 					   u64 qgroupid)
 {
@@ -247,10 +252,17 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
 	int slot;
 	int ret = 0;
 	u64 flags = 0;
+	u64 rescan_progress = 0;
 
 	if (!fs_info->quota_enabled)
 		return 0;
 
+	fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+	if (!fs_info->qgroup_ulist) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
@@ -298,7 +310,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
 			}
 			fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
 									  ptr);
-			/* FIXME read scan element */
+			rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
 			goto next1;
 		}
 
@@ -400,9 +412,18 @@ out:
 	if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
 		fs_info->quota_enabled = 0;
 		fs_info->pending_quota_state = 0;
+	} else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
+		   ret >= 0) {
+		ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
 	}
 	btrfs_free_path(path);
 
+	if (ret < 0) {
+		ulist_free(fs_info->qgroup_ulist);
+		fs_info->qgroup_ulist = NULL;
+		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+	}
+
 	return ret < 0 ? ret : 0;
 }
 
@@ -420,8 +441,6 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
 		qgroup = rb_entry(n, struct btrfs_qgroup, node);
 		rb_erase(n, &fs_info->qgroup_tree);
 
-		WARN_ON(!list_empty(&qgroup->dirty));
-
 		while (!list_empty(&qgroup->groups)) {
 			list = list_first_entry(&qgroup->groups,
 						struct btrfs_qgroup_list,
@@ -441,6 +460,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
 		}
 		kfree(qgroup);
 	}
+	ulist_free(fs_info->qgroup_ulist);
 }
 
 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
@@ -620,7 +640,9 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
 	key.offset = qgroupid;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
+
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret > 0)
 		ret = -ENOENT;
@@ -661,7 +683,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
 	key.offset = qgroup->qgroupid;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
+
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret > 0)
 		ret = -ENOENT;
@@ -702,7 +726,9 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
 	key.offset = 0;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
+
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret > 0)
 		ret = -ENOENT;
@@ -715,7 +741,8 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
 	ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
 	btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
 	btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
-	/* XXX scan */
+	btrfs_set_qgroup_status_rescan(l, ptr,
+				fs_info->qgroup_rescan_progress.objectid);
 
 	btrfs_mark_buffer_dirty(l);
 
@@ -732,33 +759,38 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
+	struct extent_buffer *leaf = NULL;
 	int ret;
-
-	if (!root)
-		return -EINVAL;
+	int nr = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	while (1) {
-		key.objectid = 0;
-		key.offset = 0;
-		key.type = 0;
+	path->leave_spinning = 1;
 
-		path->leave_spinning = 1;
+	key.objectid = 0;
+	key.offset = 0;
+	key.type = 0;
+
+	while (1) {
 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-		if (ret > 0) {
-			if (path->slots[0] == 0)
-				break;
-			path->slots[0]--;
-		} else if (ret < 0) {
+		if (ret < 0)
+			goto out;
+		leaf = path->nodes[0];
+		nr = btrfs_header_nritems(leaf);
+		if (!nr)
 			break;
-		}
-
-		ret = btrfs_del_item(trans, root, path);
+		/*
+		 * delete the leaf one by one
+		 * since the whole tree is going
+		 * to be deleted.
+		 */
+		path->slots[0] = 0;
+		ret = btrfs_del_items(trans, root, path, 0, nr);
 		if (ret)
 			goto out;
+
 		btrfs_release_path(path);
 	}
 	ret = 0;
@@ -772,19 +804,27 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
 		       struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *quota_root;
+	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_path *path = NULL;
 	struct btrfs_qgroup_status_item *ptr;
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_qgroup *qgroup = NULL;
 	int ret = 0;
+	int slot;
 
-	spin_lock(&fs_info->qgroup_lock);
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (fs_info->quota_root) {
 		fs_info->pending_quota_state = 1;
-		spin_unlock(&fs_info->qgroup_lock);
 		goto out;
 	}
-	spin_unlock(&fs_info->qgroup_lock);
+
+	fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
+	if (!fs_info->qgroup_ulist) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	/*
 	 * initially create the quota tree
@@ -819,10 +859,57 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
 	fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
 				BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
 	btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
-	btrfs_set_qgroup_status_scan(leaf, ptr, 0);
+	btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
 
 	btrfs_mark_buffer_dirty(leaf);
 
+	key.objectid = 0;
+	key.type = BTRFS_ROOT_REF_KEY;
+	key.offset = 0;
+
+	btrfs_release_path(path);
+	ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
+	if (ret > 0)
+		goto out_add_root;
+	if (ret < 0)
+		goto out_free_path;
+
+
+	while (1) {
+		slot = path->slots[0];
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		if (found_key.type == BTRFS_ROOT_REF_KEY) {
+			ret = add_qgroup_item(trans, quota_root,
+					      found_key.offset);
+			if (ret)
+				goto out_free_path;
+
+			qgroup = add_qgroup_rb(fs_info, found_key.offset);
+			if (IS_ERR(qgroup)) {
+				ret = PTR_ERR(qgroup);
+				goto out_free_path;
+			}
+		}
+		ret = btrfs_next_item(tree_root, path);
+		if (ret < 0)
+			goto out_free_path;
+		if (ret)
+			break;
+	}
+
+out_add_root:
+	btrfs_release_path(path);
+	ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
+	if (ret)
+		goto out_free_path;
+
+	qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
+	if (IS_ERR(qgroup)) {
+		ret = PTR_ERR(qgroup);
+		goto out_free_path;
+	}
 	spin_lock(&fs_info->qgroup_lock);
 	fs_info->quota_root = quota_root;
 	fs_info->pending_quota_state = 1;
@@ -836,6 +923,11 @@ out_free_root:
 		kfree(quota_root);
 	}
 out:
+	if (ret) {
+		ulist_free(fs_info->qgroup_ulist);
+		fs_info->qgroup_ulist = NULL;
+	}
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	return ret;
 }
 
@@ -846,6 +938,9 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 	struct btrfs_root *quota_root;
 	int ret = 0;
 
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
+	if (!fs_info->quota_root)
+		goto out;
 	spin_lock(&fs_info->qgroup_lock);
 	fs_info->quota_enabled = 0;
 	fs_info->pending_quota_state = 0;
@@ -854,8 +949,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 	btrfs_free_qgroup_config(fs_info);
 	spin_unlock(&fs_info->qgroup_lock);
 
-	if (!quota_root)
-		return -EINVAL;
+	if (!quota_root) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	ret = btrfs_clean_quota_tree(trans, quota_root);
 	if (ret)
@@ -876,39 +973,62 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 	free_extent_buffer(quota_root->commit_root);
 	kfree(quota_root);
 out:
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	return ret;
 }
 
-int btrfs_quota_rescan(struct btrfs_fs_info *fs_info)
+static void qgroup_dirty(struct btrfs_fs_info *fs_info,
+			 struct btrfs_qgroup *qgroup)
 {
-	/* FIXME */
-	return 0;
+	if (list_empty(&qgroup->dirty))
+		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
 }
 
 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 			      struct btrfs_fs_info *fs_info, u64 src, u64 dst)
 {
 	struct btrfs_root *quota_root;
+	struct btrfs_qgroup *parent;
+	struct btrfs_qgroup *member;
+	struct btrfs_qgroup_list *list;
 	int ret = 0;
 
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	quota_root = fs_info->quota_root;
-	if (!quota_root)
-		return -EINVAL;
+	if (!quota_root) {
+		ret = -EINVAL;
+		goto out;
+	}
+	member = find_qgroup_rb(fs_info, src);
+	parent = find_qgroup_rb(fs_info, dst);
+	if (!member || !parent) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* check if such qgroup relation exist firstly */
+	list_for_each_entry(list, &member->groups, next_group) {
+		if (list->group == parent) {
+			ret = -EEXIST;
+			goto out;
+		}
+	}
 
 	ret = add_qgroup_relation_item(trans, quota_root, src, dst);
 	if (ret)
-		return ret;
+		goto out;
 
 	ret = add_qgroup_relation_item(trans, quota_root, dst, src);
 	if (ret) {
 		del_qgroup_relation_item(trans, quota_root, src, dst);
-		return ret;
+		goto out;
 	}
 
 	spin_lock(&fs_info->qgroup_lock);
 	ret = add_relation_rb(quota_root->fs_info, src, dst);
 	spin_unlock(&fs_info->qgroup_lock);
-
+out:
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	return ret;
 }
 
@@ -916,13 +1036,34 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
 			      struct btrfs_fs_info *fs_info, u64 src, u64 dst)
 {
 	struct btrfs_root *quota_root;
+	struct btrfs_qgroup *parent;
+	struct btrfs_qgroup *member;
+	struct btrfs_qgroup_list *list;
 	int ret = 0;
 	int err;
 
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	quota_root = fs_info->quota_root;
-	if (!quota_root)
-		return -EINVAL;
+	if (!quota_root) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	member = find_qgroup_rb(fs_info, src);
+	parent = find_qgroup_rb(fs_info, dst);
+	if (!member || !parent) {
+		ret = -EINVAL;
+		goto out;
+	}
 
+	/* check if such qgroup relation exist firstly */
+	list_for_each_entry(list, &member->groups, next_group) {
+		if (list->group == parent)
+			goto exist;
+	}
+	ret = -ENOENT;
+	goto out;
+exist:
 	ret = del_qgroup_relation_item(trans, quota_root, src, dst);
 	err = del_qgroup_relation_item(trans, quota_root, dst, src);
 	if (err && !ret)
@@ -930,9 +1071,9 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
 
 	spin_lock(&fs_info->qgroup_lock);
 	del_relation_rb(fs_info, src, dst);
-
 	spin_unlock(&fs_info->qgroup_lock);
-
+out:
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	return ret;
 }
 
@@ -943,11 +1084,21 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
 	struct btrfs_qgroup *qgroup;
 	int ret = 0;
 
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	quota_root = fs_info->quota_root;
-	if (!quota_root)
-		return -EINVAL;
+	if (!quota_root) {
+		ret = -EINVAL;
+		goto out;
+	}
+	qgroup = find_qgroup_rb(fs_info, qgroupid);
+	if (qgroup) {
+		ret = -EEXIST;
+		goto out;
+	}
 
 	ret = add_qgroup_item(trans, quota_root, qgroupid);
+	if (ret)
+		goto out;
 
 	spin_lock(&fs_info->qgroup_lock);
 	qgroup = add_qgroup_rb(fs_info, qgroupid);
@@ -955,7 +1106,8 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
 
 	if (IS_ERR(qgroup))
 		ret = PTR_ERR(qgroup);
-
+out:
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	return ret;
 }
 
@@ -966,27 +1118,32 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
 	struct btrfs_qgroup *qgroup;
 	int ret = 0;
 
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	quota_root = fs_info->quota_root;
-	if (!quota_root)
-		return -EINVAL;
+	if (!quota_root) {
+		ret = -EINVAL;
+		goto out;
+	}
 
-	/* check if there are no relations to this qgroup */
-	spin_lock(&fs_info->qgroup_lock);
 	qgroup = find_qgroup_rb(fs_info, qgroupid);
-	if (qgroup) {
-		if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) {
-			spin_unlock(&fs_info->qgroup_lock);
-			return -EBUSY;
+	if (!qgroup) {
+		ret = -ENOENT;
+		goto out;
+	} else {
+		/* check if there are no relations to this qgroup */
+		if (!list_empty(&qgroup->groups) ||
+		    !list_empty(&qgroup->members)) {
+			ret = -EBUSY;
+			goto out;
 		}
 	}
-	spin_unlock(&fs_info->qgroup_lock);
-
 	ret = del_qgroup_item(trans, quota_root, qgroupid);
 
 	spin_lock(&fs_info->qgroup_lock);
 	del_qgroup_rb(quota_root->fs_info, qgroupid);
 	spin_unlock(&fs_info->qgroup_lock);
-
+out:
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	return ret;
 }
 
@@ -994,13 +1151,22 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
 		       struct btrfs_fs_info *fs_info, u64 qgroupid,
 		       struct btrfs_qgroup_limit *limit)
 {
-	struct btrfs_root *quota_root = fs_info->quota_root;
+	struct btrfs_root *quota_root;
 	struct btrfs_qgroup *qgroup;
 	int ret = 0;
 
-	if (!quota_root)
-		return -EINVAL;
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
+	quota_root = fs_info->quota_root;
+	if (!quota_root) {
+		ret = -EINVAL;
+		goto out;
+	}
 
+	qgroup = find_qgroup_rb(fs_info, qgroupid);
+	if (!qgroup) {
+		ret = -ENOENT;
+		goto out;
+	}
 	ret = update_qgroup_limit_item(trans, quota_root, qgroupid,
 				       limit->flags, limit->max_rfer,
 				       limit->max_excl, limit->rsv_rfer,
@@ -1012,31 +1178,17 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
 	}
 
 	spin_lock(&fs_info->qgroup_lock);
-
-	qgroup = find_qgroup_rb(fs_info, qgroupid);
-	if (!qgroup) {
-		ret = -ENOENT;
-		goto unlock;
-	}
 	qgroup->lim_flags = limit->flags;
 	qgroup->max_rfer = limit->max_rfer;
 	qgroup->max_excl = limit->max_excl;
 	qgroup->rsv_rfer = limit->rsv_rfer;
 	qgroup->rsv_excl = limit->rsv_excl;
-
-unlock:
 	spin_unlock(&fs_info->qgroup_lock);
-
+out:
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	return ret;
 }
 
-static void qgroup_dirty(struct btrfs_fs_info *fs_info,
-			 struct btrfs_qgroup *qgroup)
-{
-	if (list_empty(&qgroup->dirty))
-		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
-}
-
 /*
  * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts
  * the modification into a list that's later used by btrfs_end_transaction to
@@ -1060,6 +1212,144 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info,
+				    struct ulist *roots, struct ulist *tmp,
+				    u64 seq)
+{
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	struct ulist_node *tmp_unode;
+	struct ulist_iterator tmp_uiter;
+	struct btrfs_qgroup *qg;
+	int ret;
+
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(roots, &uiter))) {
+		qg = find_qgroup_rb(fs_info, unode->val);
+		if (!qg)
+			continue;
+
+		ulist_reinit(tmp);
+						/* XXX id not needed */
+		ret = ulist_add(tmp, qg->qgroupid,
+				(u64)(uintptr_t)qg, GFP_ATOMIC);
+		if (ret < 0)
+			return ret;
+		ULIST_ITER_INIT(&tmp_uiter);
+		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+			struct btrfs_qgroup_list *glist;
+
+			qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
+			if (qg->refcnt < seq)
+				qg->refcnt = seq + 1;
+			else
+				++qg->refcnt;
+
+			list_for_each_entry(glist, &qg->groups, next_group) {
+				ret = ulist_add(tmp, glist->group->qgroupid,
+						(u64)(uintptr_t)glist->group,
+						GFP_ATOMIC);
+				if (ret < 0)
+					return ret;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info,
+				    struct ulist *roots, struct ulist *tmp,
+				    u64 seq, int sgn, u64 num_bytes,
+				    struct btrfs_qgroup *qgroup)
+{
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	struct btrfs_qgroup *qg;
+	struct btrfs_qgroup_list *glist;
+	int ret;
+
+	ulist_reinit(tmp);
+	ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
+	if (ret < 0)
+		return ret;
+
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(tmp, &uiter))) {
+		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+		if (qg->refcnt < seq) {
+			/* not visited by step 1 */
+			qg->rfer += sgn * num_bytes;
+			qg->rfer_cmpr += sgn * num_bytes;
+			if (roots->nnodes == 0) {
+				qg->excl += sgn * num_bytes;
+				qg->excl_cmpr += sgn * num_bytes;
+			}
+			qgroup_dirty(fs_info, qg);
+		}
+		WARN_ON(qg->tag >= seq);
+		qg->tag = seq;
+
+		list_for_each_entry(glist, &qg->groups, next_group) {
+			ret = ulist_add(tmp, glist->group->qgroupid,
+					(uintptr_t)glist->group, GFP_ATOMIC);
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info,
+				    struct ulist *roots, struct ulist *tmp,
+				    u64 seq, int sgn, u64 num_bytes)
+{
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	struct btrfs_qgroup *qg;
+	struct ulist_node *tmp_unode;
+	struct ulist_iterator tmp_uiter;
+	int ret;
+
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(roots, &uiter))) {
+		qg = find_qgroup_rb(fs_info, unode->val);
+		if (!qg)
+			continue;
+
+		ulist_reinit(tmp);
+		ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
+		if (ret < 0)
+			return ret;
+
+		ULIST_ITER_INIT(&tmp_uiter);
+		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+			struct btrfs_qgroup_list *glist;
+
+			qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
+			if (qg->tag == seq)
+				continue;
+
+			if (qg->refcnt - seq == roots->nnodes) {
+				qg->excl -= sgn * num_bytes;
+				qg->excl_cmpr -= sgn * num_bytes;
+				qgroup_dirty(fs_info, qg);
+			}
+
+			list_for_each_entry(glist, &qg->groups, next_group) {
+				ret = ulist_add(tmp, glist->group->qgroupid,
+						(uintptr_t)glist->group,
+						GFP_ATOMIC);
+				if (ret < 0)
+					return ret;
+			}
+		}
+	}
+
+	return 0;
+}
+
 /*
  * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
  * from the fs. First, all roots referencing the extent are searched, and
@@ -1075,10 +1365,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_root *quota_root;
 	u64 ref_root;
 	struct btrfs_qgroup *qgroup;
-	struct ulist_node *unode;
 	struct ulist *roots = NULL;
-	struct ulist *tmp = NULL;
-	struct ulist_iterator uiter;
 	u64 seq;
 	int ret = 0;
 	int sgn;
@@ -1117,9 +1404,11 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 	case BTRFS_ADD_DELAYED_REF:
 	case BTRFS_ADD_DELAYED_EXTENT:
 		sgn = 1;
+		seq = btrfs_tree_mod_seq_prev(node->seq);
 		break;
 	case BTRFS_DROP_DELAYED_REF:
 		sgn = -1;
+		seq = node->seq;
 		break;
 	case BTRFS_UPDATE_DELAYED_HEAD:
 		return 0;
@@ -1127,20 +1416,30 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 		BUG();
 	}
 
+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+		if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
+			mutex_unlock(&fs_info->qgroup_rescan_lock);
+			return 0;
+		}
+	}
+	mutex_unlock(&fs_info->qgroup_rescan_lock);
+
 	/*
 	 * the delayed ref sequence number we pass depends on the direction of
-	 * the operation. for add operations, we pass (node->seq - 1) to skip
+	 * the operation. for add operations, we pass
+	 * tree_mod_log_prev_seq(node->seq) to skip
 	 * the delayed ref's current sequence number, because we need the state
 	 * of the tree before the add operation. for delete operations, we pass
 	 * (node->seq) to include the delayed ref's current sequence number,
 	 * because we need the state of the tree after the delete operation.
 	 */
-	ret = btrfs_find_all_roots(trans, fs_info, node->bytenr,
-				   sgn > 0 ? node->seq - 1 : node->seq, &roots);
+	ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
 	if (ret < 0)
-		goto out;
+		return ret;
 
 	spin_lock(&fs_info->qgroup_lock);
+
 	quota_root = fs_info->quota_root;
 	if (!quota_root)
 		goto unlock;
@@ -1152,117 +1451,34 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 	/*
 	 * step 1: for each old ref, visit all nodes once and inc refcnt
 	 */
-	tmp = ulist_alloc(GFP_ATOMIC);
-	if (!tmp) {
-		ret = -ENOMEM;
-		goto unlock;
-	}
+	ulist_reinit(fs_info->qgroup_ulist);
 	seq = fs_info->qgroup_seq;
 	fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
 
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(roots, &uiter))) {
-		struct ulist_node *tmp_unode;
-		struct ulist_iterator tmp_uiter;
-		struct btrfs_qgroup *qg;
-
-		qg = find_qgroup_rb(fs_info, unode->val);
-		if (!qg)
-			continue;
-
-		ulist_reinit(tmp);
-						/* XXX id not needed */
-		ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC);
-		ULIST_ITER_INIT(&tmp_uiter);
-		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
-			struct btrfs_qgroup_list *glist;
-
-			qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
-			if (qg->refcnt < seq)
-				qg->refcnt = seq + 1;
-			else
-				++qg->refcnt;
-
-			list_for_each_entry(glist, &qg->groups, next_group) {
-				ulist_add(tmp, glist->group->qgroupid,
-					  (u64)(uintptr_t)glist->group,
-					  GFP_ATOMIC);
-			}
-		}
-	}
+	ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
+				       seq);
+	if (ret)
+		goto unlock;
 
 	/*
 	 * step 2: walk from the new root
 	 */
-	ulist_reinit(tmp);
-	ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(tmp, &uiter))) {
-		struct btrfs_qgroup *qg;
-		struct btrfs_qgroup_list *glist;
-
-		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
-		if (qg->refcnt < seq) {
-			/* not visited by step 1 */
-			qg->rfer += sgn * node->num_bytes;
-			qg->rfer_cmpr += sgn * node->num_bytes;
-			if (roots->nnodes == 0) {
-				qg->excl += sgn * node->num_bytes;
-				qg->excl_cmpr += sgn * node->num_bytes;
-			}
-			qgroup_dirty(fs_info, qg);
-		}
-		WARN_ON(qg->tag >= seq);
-		qg->tag = seq;
-
-		list_for_each_entry(glist, &qg->groups, next_group) {
-			ulist_add(tmp, glist->group->qgroupid,
-				  (uintptr_t)glist->group, GFP_ATOMIC);
-		}
-	}
+	ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
+				       seq, sgn, node->num_bytes, qgroup);
+	if (ret)
+		goto unlock;
 
 	/*
 	 * step 3: walk again from old refs
 	 */
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(roots, &uiter))) {
-		struct btrfs_qgroup *qg;
-		struct ulist_node *tmp_unode;
-		struct ulist_iterator tmp_uiter;
-
-		qg = find_qgroup_rb(fs_info, unode->val);
-		if (!qg)
-			continue;
-
-		ulist_reinit(tmp);
-		ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
-		ULIST_ITER_INIT(&tmp_uiter);
-		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
-			struct btrfs_qgroup_list *glist;
-
-			qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
-			if (qg->tag == seq)
-				continue;
-
-			if (qg->refcnt - seq == roots->nnodes) {
-				qg->excl -= sgn * node->num_bytes;
-				qg->excl_cmpr -= sgn * node->num_bytes;
-				qgroup_dirty(fs_info, qg);
-			}
+	ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
+				       seq, sgn, node->num_bytes);
+	if (ret)
+		goto unlock;
 
-			list_for_each_entry(glist, &qg->groups, next_group) {
-				ulist_add(tmp, glist->group->qgroupid,
-					  (uintptr_t)glist->group,
-					  GFP_ATOMIC);
-			}
-		}
-	}
-	ret = 0;
 unlock:
 	spin_unlock(&fs_info->qgroup_lock);
-out:
 	ulist_free(roots);
-	ulist_free(tmp);
 
 	return ret;
 }
@@ -1275,10 +1491,14 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_root *quota_root = fs_info->quota_root;
 	int ret = 0;
+	int start_rescan_worker = 0;
 
 	if (!quota_root)
 		goto out;
 
+	if (!fs_info->quota_enabled && fs_info->pending_quota_state)
+		start_rescan_worker = 1;
+
 	fs_info->quota_enabled = fs_info->pending_quota_state;
 
 	spin_lock(&fs_info->qgroup_lock);
@@ -1304,6 +1524,16 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 	if (ret)
 		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
 
+	if (!ret && start_rescan_worker) {
+		ret = qgroup_rescan_init(fs_info, 0, 1);
+		if (!ret) {
+			qgroup_rescan_zero_tracking(fs_info);
+			btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+					   &fs_info->qgroup_rescan_work);
+		}
+		ret = 0;
+	}
+
 out:
 
 	return ret;
@@ -1324,12 +1554,30 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 	struct btrfs_qgroup *srcgroup;
 	struct btrfs_qgroup *dstgroup;
 	u32 level_size = 0;
+	u64 nums;
 
+	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (!fs_info->quota_enabled)
-		return 0;
+		goto out;
 
-	if (!quota_root)
-		return -EINVAL;
+	if (!quota_root) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (inherit) {
+		i_qgroups = (u64 *)(inherit + 1);
+		nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
+		       2 * inherit->num_excl_copies;
+		for (i = 0; i < nums; ++i) {
+			srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
+			if (!srcgroup) {
+				ret = -EINVAL;
+				goto out;
+			}
+			++i_qgroups;
+		}
+	}
 
 	/*
 	 * create a tracking group for the subvol itself
@@ -1456,6 +1704,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 unlock:
 	spin_unlock(&fs_info->qgroup_lock);
 out:
+	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	return ret;
 }
 
@@ -1472,7 +1721,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 ref_root = root->root_key.objectid;
 	int ret = 0;
-	struct ulist *ulist = NULL;
 	struct ulist_node *unode;
 	struct ulist_iterator uiter;
 
@@ -1495,42 +1743,46 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 	 * in a first step, we check all affected qgroups if any limits would
 	 * be exceeded
 	 */
-	ulist = ulist_alloc(GFP_ATOMIC);
-	if (!ulist) {
-		ret = -ENOMEM;
+	ulist_reinit(fs_info->qgroup_ulist);
+	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+			(uintptr_t)qgroup, GFP_ATOMIC);
+	if (ret < 0)
 		goto out;
-	}
-	ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(ulist, &uiter))) {
+	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
 		struct btrfs_qgroup *qg;
 		struct btrfs_qgroup_list *glist;
 
 		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
 
 		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
-		    qg->reserved + qg->rfer + num_bytes >
-		    qg->max_rfer)
+		    qg->reserved + (s64)qg->rfer + num_bytes >
+		    qg->max_rfer) {
 			ret = -EDQUOT;
+			goto out;
+		}
 
 		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
-		    qg->reserved + qg->excl + num_bytes >
-		    qg->max_excl)
+		    qg->reserved + (s64)qg->excl + num_bytes >
+		    qg->max_excl) {
 			ret = -EDQUOT;
+			goto out;
+		}
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
-			ulist_add(ulist, glist->group->qgroupid,
-				  (uintptr_t)glist->group, GFP_ATOMIC);
+			ret = ulist_add(fs_info->qgroup_ulist,
+					glist->group->qgroupid,
+					(uintptr_t)glist->group, GFP_ATOMIC);
+			if (ret < 0)
+				goto out;
 		}
 	}
-	if (ret)
-		goto out;
-
+	ret = 0;
 	/*
 	 * no limits exceeded, now record the reservation into all qgroups
 	 */
 	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(ulist, &uiter))) {
+	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
 		struct btrfs_qgroup *qg;
 
 		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
@@ -1540,8 +1792,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 
 out:
 	spin_unlock(&fs_info->qgroup_lock);
-	ulist_free(ulist);
-
 	return ret;
 }
 
@@ -1550,10 +1800,10 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
 	struct btrfs_root *quota_root;
 	struct btrfs_qgroup *qgroup;
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct ulist *ulist = NULL;
 	struct ulist_node *unode;
 	struct ulist_iterator uiter;
 	u64 ref_root = root->root_key.objectid;
+	int ret = 0;
 
 	if (!is_fstree(ref_root))
 		return;
@@ -1571,14 +1821,13 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
 	if (!qgroup)
 		goto out;
 
-	ulist = ulist_alloc(GFP_ATOMIC);
-	if (!ulist) {
-		btrfs_std_error(fs_info, -ENOMEM);
+	ulist_reinit(fs_info->qgroup_ulist);
+	ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+			(uintptr_t)qgroup, GFP_ATOMIC);
+	if (ret < 0)
 		goto out;
-	}
-	ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(ulist, &uiter))) {
+	while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
 		struct btrfs_qgroup *qg;
 		struct btrfs_qgroup_list *glist;
 
@@ -1587,22 +1836,370 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
 		qg->reserved -= num_bytes;
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
-			ulist_add(ulist, glist->group->qgroupid,
-				  (uintptr_t)glist->group, GFP_ATOMIC);
+			ret = ulist_add(fs_info->qgroup_ulist,
+					glist->group->qgroupid,
+					(uintptr_t)glist->group, GFP_ATOMIC);
+			if (ret < 0)
+				goto out;
 		}
 	}
 
 out:
 	spin_unlock(&fs_info->qgroup_lock);
-	ulist_free(ulist);
 }
 
 void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
 {
 	if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
 		return;
-	printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n",
+	pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n",
 		trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
-		trans->delayed_ref_elem.seq);
+		(u32)(trans->delayed_ref_elem.seq >> 32),
+		(u32)trans->delayed_ref_elem.seq);
 	BUG();
 }
+
+/*
+ * returns < 0 on error, 0 when more leafs are to be scanned.
+ * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
+ */
+static int
+qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+		   struct btrfs_trans_handle *trans, struct ulist *tmp,
+		   struct extent_buffer *scratch_leaf)
+{
+	struct btrfs_key found;
+	struct ulist *roots = NULL;
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	struct seq_list tree_mod_seq_elem = {};
+	u64 seq;
+	int slot;
+	int ret;
+
+	path->leave_spinning = 1;
+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	ret = btrfs_search_slot_for_read(fs_info->extent_root,
+					 &fs_info->qgroup_rescan_progress,
+					 path, 1, 0);
+
+	pr_debug("current progress key (%llu %u %llu), search_slot ret %d\n",
+		 (unsigned long long)fs_info->qgroup_rescan_progress.objectid,
+		 fs_info->qgroup_rescan_progress.type,
+		 (unsigned long long)fs_info->qgroup_rescan_progress.offset,
+		 ret);
+
+	if (ret) {
+		/*
+		 * The rescan is about to end, we will not be scanning any
+		 * further blocks. We cannot unset the RESCAN flag here, because
+		 * we want to commit the transaction if everything went well.
+		 * To make the live accounting work in this phase, we set our
+		 * scan progress pointer such that every real extent objectid
+		 * will be smaller.
+		 */
+		fs_info->qgroup_rescan_progress.objectid = (u64)-1;
+		btrfs_release_path(path);
+		mutex_unlock(&fs_info->qgroup_rescan_lock);
+		return ret;
+	}
+
+	btrfs_item_key_to_cpu(path->nodes[0], &found,
+			      btrfs_header_nritems(path->nodes[0]) - 1);
+	fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
+
+	btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+	memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf));
+	slot = path->slots[0];
+	btrfs_release_path(path);
+	mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+	for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
+		btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
+		if (found.type != BTRFS_EXTENT_ITEM_KEY)
+			continue;
+		ret = btrfs_find_all_roots(trans, fs_info, found.objectid,
+					   tree_mod_seq_elem.seq, &roots);
+		if (ret < 0)
+			goto out;
+		spin_lock(&fs_info->qgroup_lock);
+		seq = fs_info->qgroup_seq;
+		fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
+
+		ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq);
+		if (ret) {
+			spin_unlock(&fs_info->qgroup_lock);
+			ulist_free(roots);
+			goto out;
+		}
+
+		/*
+		 * step2 of btrfs_qgroup_account_ref works from a single root,
+		 * we're doing all at once here.
+		 */
+		ulist_reinit(tmp);
+		ULIST_ITER_INIT(&uiter);
+		while ((unode = ulist_next(roots, &uiter))) {
+			struct btrfs_qgroup *qg;
+
+			qg = find_qgroup_rb(fs_info, unode->val);
+			if (!qg)
+				continue;
+
+			ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg,
+					GFP_ATOMIC);
+			if (ret < 0) {
+				spin_unlock(&fs_info->qgroup_lock);
+				ulist_free(roots);
+				goto out;
+			}
+		}
+
+		/* this loop is similar to step 2 of btrfs_qgroup_account_ref */
+		ULIST_ITER_INIT(&uiter);
+		while ((unode = ulist_next(tmp, &uiter))) {
+			struct btrfs_qgroup *qg;
+			struct btrfs_qgroup_list *glist;
+
+			qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux;
+			qg->rfer += found.offset;
+			qg->rfer_cmpr += found.offset;
+			WARN_ON(qg->tag >= seq);
+			if (qg->refcnt - seq == roots->nnodes) {
+				qg->excl += found.offset;
+				qg->excl_cmpr += found.offset;
+			}
+			qgroup_dirty(fs_info, qg);
+
+			list_for_each_entry(glist, &qg->groups, next_group) {
+				ret = ulist_add(tmp, glist->group->qgroupid,
+						(uintptr_t)glist->group,
+						GFP_ATOMIC);
+				if (ret < 0) {
+					spin_unlock(&fs_info->qgroup_lock);
+					ulist_free(roots);
+					goto out;
+				}
+			}
+		}
+
+		spin_unlock(&fs_info->qgroup_lock);
+		ulist_free(roots);
+		ret = 0;
+	}
+
+out:
+	btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+
+	return ret;
+}
+
+static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
+{
+	struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
+						     qgroup_rescan_work);
+	struct btrfs_path *path;
+	struct btrfs_trans_handle *trans = NULL;
+	struct ulist *tmp = NULL;
+	struct extent_buffer *scratch_leaf = NULL;
+	int err = -ENOMEM;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		goto out;
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		goto out;
+	scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
+	if (!scratch_leaf)
+		goto out;
+
+	err = 0;
+	while (!err) {
+		trans = btrfs_start_transaction(fs_info->fs_root, 0);
+		if (IS_ERR(trans)) {
+			err = PTR_ERR(trans);
+			break;
+		}
+		if (!fs_info->quota_enabled) {
+			err = -EINTR;
+		} else {
+			err = qgroup_rescan_leaf(fs_info, path, trans,
+						 tmp, scratch_leaf);
+		}
+		if (err > 0)
+			btrfs_commit_transaction(trans, fs_info->fs_root);
+		else
+			btrfs_end_transaction(trans, fs_info->fs_root);
+	}
+
+out:
+	kfree(scratch_leaf);
+	ulist_free(tmp);
+	btrfs_free_path(path);
+
+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+
+	if (err == 2 &&
+	    fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
+		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	} else if (err < 0) {
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+	}
+	mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+	if (err >= 0) {
+		pr_info("btrfs: qgroup scan completed%s\n",
+			err == 2 ? " (inconsistency flag cleared)" : "");
+	} else {
+		pr_err("btrfs: qgroup scan failed with %d\n", err);
+	}
+
+	complete_all(&fs_info->qgroup_rescan_completion);
+}
+
+/*
+ * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
+ * memory required for the rescan context.
+ */
+static int
+qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
+		   int init_flags)
+{
+	int ret = 0;
+
+	if (!init_flags &&
+	    (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
+	     !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	spin_lock(&fs_info->qgroup_lock);
+
+	if (init_flags) {
+		if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+			ret = -EINPROGRESS;
+		else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+			ret = -EINVAL;
+
+		if (ret) {
+			spin_unlock(&fs_info->qgroup_lock);
+			mutex_unlock(&fs_info->qgroup_rescan_lock);
+			goto err;
+		}
+
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+	}
+
+	memset(&fs_info->qgroup_rescan_progress, 0,
+		sizeof(fs_info->qgroup_rescan_progress));
+	fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+
+	spin_unlock(&fs_info->qgroup_lock);
+	mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+	init_completion(&fs_info->qgroup_rescan_completion);
+
+	memset(&fs_info->qgroup_rescan_work, 0,
+	       sizeof(fs_info->qgroup_rescan_work));
+	fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
+
+	if (ret) {
+err:
+		pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void
+qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
+{
+	struct rb_node *n;
+	struct btrfs_qgroup *qgroup;
+
+	spin_lock(&fs_info->qgroup_lock);
+	/* clear all current qgroup tracking information */
+	for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
+		qgroup = rb_entry(n, struct btrfs_qgroup, node);
+		qgroup->rfer = 0;
+		qgroup->rfer_cmpr = 0;
+		qgroup->excl = 0;
+		qgroup->excl_cmpr = 0;
+	}
+	spin_unlock(&fs_info->qgroup_lock);
+}
+
+int
+btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
+{
+	int ret = 0;
+	struct btrfs_trans_handle *trans;
+
+	ret = qgroup_rescan_init(fs_info, 0, 1);
+	if (ret)
+		return ret;
+
+	/*
+	 * We have set the rescan_progress to 0, which means no more
+	 * delayed refs will be accounted by btrfs_qgroup_account_ref.
+	 * However, btrfs_qgroup_account_ref may be right after its call
+	 * to btrfs_find_all_roots, in which case it would still do the
+	 * accounting.
+	 * To solve this, we're committing the transaction, which will
+	 * ensure we run all delayed refs and only after that, we are
+	 * going to clear all tracking information for a clean start.
+	 */
+
+	trans = btrfs_join_transaction(fs_info->fs_root);
+	if (IS_ERR(trans)) {
+		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+		return PTR_ERR(trans);
+	}
+	ret = btrfs_commit_transaction(trans, fs_info->fs_root);
+	if (ret) {
+		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+		return ret;
+	}
+
+	qgroup_rescan_zero_tracking(fs_info);
+
+	btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+			   &fs_info->qgroup_rescan_work);
+
+	return 0;
+}
+
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
+{
+	int running;
+	int ret = 0;
+
+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	spin_lock(&fs_info->qgroup_lock);
+	running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+	spin_unlock(&fs_info->qgroup_lock);
+	mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+	if (running)
+		ret = wait_for_completion_interruptible(
+					&fs_info->qgroup_rescan_completion);
+
+	return ret;
+}
+
+/*
+ * this is only called from open_ctree where we're still single threaded, thus
+ * locking is omitted here.
+ */
+void
+btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
+{
+	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+		btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
+				   &fs_info->qgroup_rescan_work);
+}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
new file mode 100644
index 00000000000..0525e1389f5
--- /dev/null
+++ b/fs/btrfs/raid56.c
@@ -0,0 +1,2100 @@
+/*
+ * Copyright (C) 2012 Fusion-io  All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <linux/hash.h>
+#include <linux/list_sort.h>
+#include <linux/raid/xor.h>
+#include <linux/vmalloc.h>
+#include <asm/div64.h>
+#include "compat.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+
+/* set when additional merges to this rbio are not allowed */
+#define RBIO_RMW_LOCKED_BIT	1
+
+/*
+ * set when this rbio is sitting in the hash, but it is just a cache
+ * of past RMW
+ */
+#define RBIO_CACHE_BIT		2
+
+/*
+ * set when it is safe to trust the stripe_pages for caching
+ */
+#define RBIO_CACHE_READY_BIT	3
+
+
+#define RBIO_CACHE_SIZE 1024
+
+struct btrfs_raid_bio {
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_bio *bbio;
+
+	/*
+	 * logical block numbers for the start of each stripe
+	 * The last one or two are p/q.  These are sorted,
+	 * so raid_map[0] is the start of our full stripe
+	 */
+	u64 *raid_map;
+
+	/* while we're doing rmw on a stripe
+	 * we put it into a hash table so we can
+	 * lock the stripe and merge more rbios
+	 * into it.
+	 */
+	struct list_head hash_list;
+
+	/*
+	 * LRU list for the stripe cache
+	 */
+	struct list_head stripe_cache;
+
+	/*
+	 * for scheduling work in the helper threads
+	 */
+	struct btrfs_work work;
+
+	/*
+	 * bio list and bio_list_lock are used
+	 * to add more bios into the stripe
+	 * in hopes of avoiding the full rmw
+	 */
+	struct bio_list bio_list;
+	spinlock_t bio_list_lock;
+
+	/* also protected by the bio_list_lock, the
+	 * plug list is used by the plugging code
+	 * to collect partial bios while plugged.  The
+	 * stripe locking code also uses it to hand off
+	 * the stripe lock to the next pending IO
+	 */
+	struct list_head plug_list;
+
+	/*
+	 * flags that tell us if it is safe to
+	 * merge with this bio
+	 */
+	unsigned long flags;
+
+	/* size of each individual stripe on disk */
+	int stripe_len;
+
+	/* number of data stripes (no p/q) */
+	int nr_data;
+
+	/*
+	 * set if we're doing a parity rebuild
+	 * for a read from higher up, which is handled
+	 * differently from a parity rebuild as part of
+	 * rmw
+	 */
+	int read_rebuild;
+
+	/* first bad stripe */
+	int faila;
+
+	/* second bad stripe (for raid6 use) */
+	int failb;
+
+	/*
+	 * number of pages needed to represent the full
+	 * stripe
+	 */
+	int nr_pages;
+
+	/*
+	 * size of all the bios in the bio_list.  This
+	 * helps us decide if the rbio maps to a full
+	 * stripe or not
+	 */
+	int bio_list_bytes;
+
+	atomic_t refs;
+
+	/*
+	 * these are two arrays of pointers.  We allocate the
+	 * rbio big enough to hold them both and setup their
+	 * locations when the rbio is allocated
+	 */
+
+	/* pointers to pages that we allocated for
+	 * reading/writing stripes directly from the disk (including P/Q)
+	 */
+	struct page **stripe_pages;
+
+	/*
+	 * pointers to the pages in the bio_list.  Stored
+	 * here for faster lookup
+	 */
+	struct page **bio_pages;
+};
+
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
+static void rmw_work(struct btrfs_work *work);
+static void read_rebuild_work(struct btrfs_work *work);
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
+static void async_read_rebuild(struct btrfs_raid_bio *rbio);
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
+static void __free_raid_bio(struct btrfs_raid_bio *rbio);
+static void index_rbio_pages(struct btrfs_raid_bio *rbio);
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
+
+/*
+ * the stripe hash table is used for locking, and to collect
+ * bios in hopes of making a full stripe
+ */
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
+{
+	struct btrfs_stripe_hash_table *table;
+	struct btrfs_stripe_hash_table *x;
+	struct btrfs_stripe_hash *cur;
+	struct btrfs_stripe_hash *h;
+	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
+	int i;
+	int table_size;
+
+	if (info->stripe_hash_table)
+		return 0;
+
+	/*
+	 * The table is large, starting with order 4 and can go as high as
+	 * order 7 in case lock debugging is turned on.
+	 *
+	 * Try harder to allocate and fallback to vmalloc to lower the chance
+	 * of a failing mount.
+	 */
+	table_size = sizeof(*table) + sizeof(*h) * num_entries;
+	table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+	if (!table) {
+		table = vzalloc(table_size);
+		if (!table)
+			return -ENOMEM;
+	}
+
+	spin_lock_init(&table->cache_lock);
+	INIT_LIST_HEAD(&table->stripe_cache);
+
+	h = table->table;
+
+	for (i = 0; i < num_entries; i++) {
+		cur = h + i;
+		INIT_LIST_HEAD(&cur->hash_list);
+		spin_lock_init(&cur->lock);
+		init_waitqueue_head(&cur->wait);
+	}
+
+	x = cmpxchg(&info->stripe_hash_table, NULL, table);
+	if (x) {
+		if (is_vmalloc_addr(x))
+			vfree(x);
+		else
+			kfree(x);
+	}
+	return 0;
+}
+
+/*
+ * caching an rbio means to copy anything from the
+ * bio_pages array into the stripe_pages array.  We
+ * use the page uptodate bit in the stripe cache array
+ * to indicate if it has valid data
+ *
+ * once the caching is done, we set the cache ready
+ * bit.
+ */
+static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	char *s;
+	char *d;
+	int ret;
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		return;
+
+	for (i = 0; i < rbio->nr_pages; i++) {
+		if (!rbio->bio_pages[i])
+			continue;
+
+		s = kmap(rbio->bio_pages[i]);
+		d = kmap(rbio->stripe_pages[i]);
+
+		memcpy(d, s, PAGE_CACHE_SIZE);
+
+		kunmap(rbio->bio_pages[i]);
+		kunmap(rbio->stripe_pages[i]);
+		SetPageUptodate(rbio->stripe_pages[i]);
+	}
+	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+}
+
+/*
+ * we hash on the first logical address of the stripe
+ */
+static int rbio_bucket(struct btrfs_raid_bio *rbio)
+{
+	u64 num = rbio->raid_map[0];
+
+	/*
+	 * we shift down quite a bit.  We're using byte
+	 * addressing, and most of the lower bits are zeros.
+	 * This tends to upset hash_64, and it consistently
+	 * returns just one or two different values.
+	 *
+	 * shifting off the lower bits fixes things.
+	 */
+	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
+}
+
+/*
+ * stealing an rbio means taking all the uptodate pages from the stripe
+ * array in the source rbio and putting them into the destination rbio
+ */
+static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
+{
+	int i;
+	struct page *s;
+	struct page *d;
+
+	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
+		return;
+
+	for (i = 0; i < dest->nr_pages; i++) {
+		s = src->stripe_pages[i];
+		if (!s || !PageUptodate(s)) {
+			continue;
+		}
+
+		d = dest->stripe_pages[i];
+		if (d)
+			__free_page(d);
+
+		dest->stripe_pages[i] = s;
+		src->stripe_pages[i] = NULL;
+	}
+}
+
+/*
+ * merging means we take the bio_list from the victim and
+ * splice it into the destination.  The victim should
+ * be discarded afterwards.
+ *
+ * must be called with dest->rbio_list_lock held
+ */
+static void merge_rbio(struct btrfs_raid_bio *dest,
+		       struct btrfs_raid_bio *victim)
+{
+	bio_list_merge(&dest->bio_list, &victim->bio_list);
+	dest->bio_list_bytes += victim->bio_list_bytes;
+	bio_list_init(&victim->bio_list);
+}
+
+/*
+ * used to prune items that are in the cache.  The caller
+ * must hold the hash table lock.
+ */
+static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+	int bucket = rbio_bucket(rbio);
+	struct btrfs_stripe_hash_table *table;
+	struct btrfs_stripe_hash *h;
+	int freeit = 0;
+
+	/*
+	 * check the bit again under the hash table lock.
+	 */
+	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+	h = table->table + bucket;
+
+	/* hold the lock for the bucket because we may be
+	 * removing it from the hash table
+	 */
+	spin_lock(&h->lock);
+
+	/*
+	 * hold the lock for the bio list because we need
+	 * to make sure the bio list is empty
+	 */
+	spin_lock(&rbio->bio_list_lock);
+
+	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+		list_del_init(&rbio->stripe_cache);
+		table->cache_size -= 1;
+		freeit = 1;
+
+		/* if the bio list isn't empty, this rbio is
+		 * still involved in an IO.  We take it out
+		 * of the cache list, and drop the ref that
+		 * was held for the list.
+		 *
+		 * If the bio_list was empty, we also remove
+		 * the rbio from the hash_table, and drop
+		 * the corresponding ref
+		 */
+		if (bio_list_empty(&rbio->bio_list)) {
+			if (!list_empty(&rbio->hash_list)) {
+				list_del_init(&rbio->hash_list);
+				atomic_dec(&rbio->refs);
+				BUG_ON(!list_empty(&rbio->plug_list));
+			}
+		}
+	}
+
+	spin_unlock(&rbio->bio_list_lock);
+	spin_unlock(&h->lock);
+
+	if (freeit)
+		__free_raid_bio(rbio);
+}
+
+/*
+ * prune a given rbio from the cache
+ */
+static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+
+	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	__remove_rbio_from_cache(rbio);
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove everything in the cache
+ */
+static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+	struct btrfs_raid_bio *rbio;
+
+	table = info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	while (!list_empty(&table->stripe_cache)) {
+		rbio = list_entry(table->stripe_cache.next,
+				  struct btrfs_raid_bio,
+				  stripe_cache);
+		__remove_rbio_from_cache(rbio);
+	}
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+}
+
+/*
+ * remove all cached entries and free the hash table
+ * used by unmount
+ */
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
+{
+	if (!info->stripe_hash_table)
+		return;
+	btrfs_clear_rbio_cache(info);
+	if (is_vmalloc_addr(info->stripe_hash_table))
+		vfree(info->stripe_hash_table);
+	else
+		kfree(info->stripe_hash_table);
+	info->stripe_hash_table = NULL;
+}
+
+/*
+ * insert an rbio into the stripe cache.  It
+ * must have already been prepared by calling
+ * cache_rbio_pages
+ *
+ * If this rbio was already cached, it gets
+ * moved to the front of the lru.
+ *
+ * If the size of the rbio cache is too big, we
+ * prune an item.
+ */
+static void cache_rbio(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_stripe_hash_table *table;
+	unsigned long flags;
+
+	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
+		return;
+
+	table = rbio->fs_info->stripe_hash_table;
+
+	spin_lock_irqsave(&table->cache_lock, flags);
+	spin_lock(&rbio->bio_list_lock);
+
+	/* bump our ref if we were not in the list before */
+	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
+		atomic_inc(&rbio->refs);
+
+	if (!list_empty(&rbio->stripe_cache)){
+		list_move(&rbio->stripe_cache, &table->stripe_cache);
+	} else {
+		list_add(&rbio->stripe_cache, &table->stripe_cache);
+		table->cache_size += 1;
+	}
+
+	spin_unlock(&rbio->bio_list_lock);
+
+	if (table->cache_size > RBIO_CACHE_SIZE) {
+		struct btrfs_raid_bio *found;
+
+		found = list_entry(table->stripe_cache.prev,
+				  struct btrfs_raid_bio,
+				  stripe_cache);
+
+		if (found != rbio)
+			__remove_rbio_from_cache(found);
+	}
+
+	spin_unlock_irqrestore(&table->cache_lock, flags);
+	return;
+}
+
+/*
+ * helper function to run the xor_blocks api.  It is only
+ * able to do MAX_XOR_BLOCKS at a time, so we need to
+ * loop through.
+ */
+static void run_xor(void **pages, int src_cnt, ssize_t len)
+{
+	int src_off = 0;
+	int xor_src_cnt = 0;
+	void *dest = pages[src_cnt];
+
+	while(src_cnt > 0) {
+		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
+		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
+
+		src_cnt -= xor_src_cnt;
+		src_off += xor_src_cnt;
+	}
+}
+
+/*
+ * returns true if the bio list inside this rbio
+ * covers an entire stripe (no rmw required).
+ * Must be called with the bio list lock held, or
+ * at a time when you know it is impossible to add
+ * new bios into the list
+ */
+static int __rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+	unsigned long size = rbio->bio_list_bytes;
+	int ret = 1;
+
+	if (size != rbio->nr_data * rbio->stripe_len)
+		ret = 0;
+
+	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
+	return ret;
+}
+
+static int rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&rbio->bio_list_lock, flags);
+	ret = __rbio_is_full(rbio);
+	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+	return ret;
+}
+
+/*
+ * returns 1 if it is safe to merge two rbios together.
+ * The merging is safe if the two rbios correspond to
+ * the same stripe and if they are both going in the same
+ * direction (read vs write), and if neither one is
+ * locked for final IO
+ *
+ * The caller is responsible for locking such that
+ * rmw_locked is safe to test
+ */
+static int rbio_can_merge(struct btrfs_raid_bio *last,
+			  struct btrfs_raid_bio *cur)
+{
+	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
+	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
+		return 0;
+
+	/*
+	 * we can't merge with cached rbios, since the
+	 * idea is that when we merge the destination
+	 * rbio is going to run our IO for us.  We can
+	 * steal from cached rbio's though, other functions
+	 * handle that.
+	 */
+	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
+	    test_bit(RBIO_CACHE_BIT, &cur->flags))
+		return 0;
+
+	if (last->raid_map[0] !=
+	    cur->raid_map[0])
+		return 0;
+
+	/* reads can't merge with writes */
+	if (last->read_rebuild !=
+	    cur->read_rebuild) {
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * helper to index into the pstripe
+ */
+static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+	return rbio->stripe_pages[index];
+}
+
+/*
+ * helper to index into the qstripe, returns null
+ * if there is no qstripe
+ */
+static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
+		return NULL;
+
+	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
+		PAGE_CACHE_SHIFT;
+	return rbio->stripe_pages[index];
+}
+
+/*
+ * The first stripe in the table for a logical address
+ * has the lock.  rbios are added in one of three ways:
+ *
+ * 1) Nobody has the stripe locked yet.  The rbio is given
+ * the lock and 0 is returned.  The caller must start the IO
+ * themselves.
+ *
+ * 2) Someone has the stripe locked, but we're able to merge
+ * with the lock owner.  The rbio is freed and the IO will
+ * start automatically along with the existing rbio.  1 is returned.
+ *
+ * 3) Someone has the stripe locked, but we're not able to merge.
+ * The rbio is added to the lock owner's plug list, or merged into
+ * an rbio already on the plug list.  When the lock owner unlocks,
+ * the next rbio on the list is run and the IO is started automatically.
+ * 1 is returned
+ *
+ * If we return 0, the caller still owns the rbio and must continue with
+ * IO submission.  If we return 1, the caller must assume the rbio has
+ * already been freed.
+ */
+static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
+{
+	int bucket = rbio_bucket(rbio);
+	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
+	struct btrfs_raid_bio *cur;
+	struct btrfs_raid_bio *pending;
+	unsigned long flags;
+	DEFINE_WAIT(wait);
+	struct btrfs_raid_bio *freeit = NULL;
+	struct btrfs_raid_bio *cache_drop = NULL;
+	int ret = 0;
+	int walk = 0;
+
+	spin_lock_irqsave(&h->lock, flags);
+	list_for_each_entry(cur, &h->hash_list, hash_list) {
+		walk++;
+		if (cur->raid_map[0] == rbio->raid_map[0]) {
+			spin_lock(&cur->bio_list_lock);
+
+			/* can we steal this cached rbio's pages? */
+			if (bio_list_empty(&cur->bio_list) &&
+			    list_empty(&cur->plug_list) &&
+			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+				list_del_init(&cur->hash_list);
+				atomic_dec(&cur->refs);
+
+				steal_rbio(cur, rbio);
+				cache_drop = cur;
+				spin_unlock(&cur->bio_list_lock);
+
+				goto lockit;
+			}
+
+			/* can we merge into the lock owner? */
+			if (rbio_can_merge(cur, rbio)) {
+				merge_rbio(cur, rbio);
+				spin_unlock(&cur->bio_list_lock);
+				freeit = rbio;
+				ret = 1;
+				goto out;
+			}
+
+
+			/*
+			 * we couldn't merge with the running
+			 * rbio, see if we can merge with the
+			 * pending ones.  We don't have to
+			 * check for rmw_locked because there
+			 * is no way they are inside finish_rmw
+			 * right now
+			 */
+			list_for_each_entry(pending, &cur->plug_list,
+					    plug_list) {
+				if (rbio_can_merge(pending, rbio)) {
+					merge_rbio(pending, rbio);
+					spin_unlock(&cur->bio_list_lock);
+					freeit = rbio;
+					ret = 1;
+					goto out;
+				}
+			}
+
+			/* no merging, put us on the tail of the plug list,
+			 * our rbio will be started with the currently
+			 * running rbio unlocks
+			 */
+			list_add_tail(&rbio->plug_list, &cur->plug_list);
+			spin_unlock(&cur->bio_list_lock);
+			ret = 1;
+			goto out;
+		}
+	}
+lockit:
+	atomic_inc(&rbio->refs);
+	list_add(&rbio->hash_list, &h->hash_list);
+out:
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (cache_drop)
+		remove_rbio_from_cache(cache_drop);
+	if (freeit)
+		__free_raid_bio(freeit);
+	return ret;
+}
+
+/*
+ * called as rmw or parity rebuild is completed.  If the plug list has more
+ * rbios waiting for this stripe, the next one on the list will be started
+ */
+static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
+{
+	int bucket;
+	struct btrfs_stripe_hash *h;
+	unsigned long flags;
+	int keep_cache = 0;
+
+	bucket = rbio_bucket(rbio);
+	h = rbio->fs_info->stripe_hash_table->table + bucket;
+
+	if (list_empty(&rbio->plug_list))
+		cache_rbio(rbio);
+
+	spin_lock_irqsave(&h->lock, flags);
+	spin_lock(&rbio->bio_list_lock);
+
+	if (!list_empty(&rbio->hash_list)) {
+		/*
+		 * if we're still cached and there is no other IO
+		 * to perform, just leave this rbio here for others
+		 * to steal from later
+		 */
+		if (list_empty(&rbio->plug_list) &&
+		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
+			keep_cache = 1;
+			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+			BUG_ON(!bio_list_empty(&rbio->bio_list));
+			goto done;
+		}
+
+		list_del_init(&rbio->hash_list);
+		atomic_dec(&rbio->refs);
+
+		/*
+		 * we use the plug list to hold all the rbios
+		 * waiting for the chance to lock this stripe.
+		 * hand the lock over to one of them.
+		 */
+		if (!list_empty(&rbio->plug_list)) {
+			struct btrfs_raid_bio *next;
+			struct list_head *head = rbio->plug_list.next;
+
+			next = list_entry(head, struct btrfs_raid_bio,
+					  plug_list);
+
+			list_del_init(&rbio->plug_list);
+
+			list_add(&next->hash_list, &h->hash_list);
+			atomic_inc(&next->refs);
+			spin_unlock(&rbio->bio_list_lock);
+			spin_unlock_irqrestore(&h->lock, flags);
+
+			if (next->read_rebuild)
+				async_read_rebuild(next);
+			else {
+				steal_rbio(rbio, next);
+				async_rmw_stripe(next);
+			}
+
+			goto done_nolock;
+		} else  if (waitqueue_active(&h->wait)) {
+			spin_unlock(&rbio->bio_list_lock);
+			spin_unlock_irqrestore(&h->lock, flags);
+			wake_up(&h->wait);
+			goto done_nolock;
+		}
+	}
+done:
+	spin_unlock(&rbio->bio_list_lock);
+	spin_unlock_irqrestore(&h->lock, flags);
+
+done_nolock:
+	if (!keep_cache)
+		remove_rbio_from_cache(rbio);
+}
+
+static void __free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+	int i;
+
+	WARN_ON(atomic_read(&rbio->refs) < 0);
+	if (!atomic_dec_and_test(&rbio->refs))
+		return;
+
+	WARN_ON(!list_empty(&rbio->stripe_cache));
+	WARN_ON(!list_empty(&rbio->hash_list));
+	WARN_ON(!bio_list_empty(&rbio->bio_list));
+
+	for (i = 0; i < rbio->nr_pages; i++) {
+		if (rbio->stripe_pages[i]) {
+			__free_page(rbio->stripe_pages[i]);
+			rbio->stripe_pages[i] = NULL;
+		}
+	}
+	kfree(rbio->raid_map);
+	kfree(rbio->bbio);
+	kfree(rbio);
+}
+
+static void free_raid_bio(struct btrfs_raid_bio *rbio)
+{
+	unlock_stripe(rbio);
+	__free_raid_bio(rbio);
+}
+
+/*
+ * this frees the rbio and runs through all the bios in the
+ * bio_list and calls end_io on them
+ */
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
+{
+	struct bio *cur = bio_list_get(&rbio->bio_list);
+	struct bio *next;
+	free_raid_bio(rbio);
+
+	while (cur) {
+		next = cur->bi_next;
+		cur->bi_next = NULL;
+		if (uptodate)
+			set_bit(BIO_UPTODATE, &cur->bi_flags);
+		bio_endio(cur, err);
+		cur = next;
+	}
+}
+
+/*
+ * end io function used by finish_rmw.  When we finally
+ * get here, we've written a full stripe
+ */
+static void raid_write_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	if (err)
+		fail_bio_stripe(rbio, bio);
+
+	bio_put(bio);
+
+	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+		return;
+
+	err = 0;
+
+	/* OK, we have read all the stripes we need to. */
+	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+		err = -EIO;
+
+	rbio_orig_end_io(rbio, err, 0);
+	return;
+}
+
+/*
+ * the read/modify/write code wants to use the original bio for
+ * any pages it included, and then use the rbio for everything
+ * else.  This function decides if a given index (stripe number)
+ * and page number in that stripe fall inside the original bio
+ * or the rbio.
+ *
+ * if you set bio_list_only, you'll get a NULL back for any ranges
+ * that are outside the bio_list
+ *
+ * This doesn't take any refs on anything, you get a bare page pointer
+ * and the caller must bump refs as required.
+ *
+ * You must call index_rbio_pages once before you can trust
+ * the answers from this function.
+ */
+static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
+				 int index, int pagenr, int bio_list_only)
+{
+	int chunk_page;
+	struct page *p = NULL;
+
+	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
+
+	spin_lock_irq(&rbio->bio_list_lock);
+	p = rbio->bio_pages[chunk_page];
+	spin_unlock_irq(&rbio->bio_list_lock);
+
+	if (p || bio_list_only)
+		return p;
+
+	return rbio->stripe_pages[chunk_page];
+}
+
+/*
+ * number of pages we need for the entire stripe across all the
+ * drives
+ */
+static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
+{
+	unsigned long nr = stripe_len * nr_stripes;
+	return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+/*
+ * allocation and initial setup for the btrfs_raid_bio.  Not
+ * this does not allocate any pages for rbio->pages.
+ */
+static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
+			  struct btrfs_bio *bbio, u64 *raid_map,
+			  u64 stripe_len)
+{
+	struct btrfs_raid_bio *rbio;
+	int nr_data = 0;
+	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
+	void *p;
+
+	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
+			GFP_NOFS);
+	if (!rbio) {
+		kfree(raid_map);
+		kfree(bbio);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	bio_list_init(&rbio->bio_list);
+	INIT_LIST_HEAD(&rbio->plug_list);
+	spin_lock_init(&rbio->bio_list_lock);
+	INIT_LIST_HEAD(&rbio->stripe_cache);
+	INIT_LIST_HEAD(&rbio->hash_list);
+	rbio->bbio = bbio;
+	rbio->raid_map = raid_map;
+	rbio->fs_info = root->fs_info;
+	rbio->stripe_len = stripe_len;
+	rbio->nr_pages = num_pages;
+	rbio->faila = -1;
+	rbio->failb = -1;
+	atomic_set(&rbio->refs, 1);
+
+	/*
+	 * the stripe_pages and bio_pages array point to the extra
+	 * memory we allocated past the end of the rbio
+	 */
+	p = rbio + 1;
+	rbio->stripe_pages = p;
+	rbio->bio_pages = p + sizeof(struct page *) * num_pages;
+
+	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
+		nr_data = bbio->num_stripes - 2;
+	else
+		nr_data = bbio->num_stripes - 1;
+
+	rbio->nr_data = nr_data;
+	return rbio;
+}
+
+/* allocate pages for all the stripes in the bio, including parity */
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	struct page *page;
+
+	for (i = 0; i < rbio->nr_pages; i++) {
+		if (rbio->stripe_pages[i])
+			continue;
+		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!page)
+			return -ENOMEM;
+		rbio->stripe_pages[i] = page;
+		ClearPageUptodate(page);
+	}
+	return 0;
+}
+
+/* allocate pages for just the p/q stripes */
+static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
+{
+	int i;
+	struct page *page;
+
+	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+
+	for (; i < rbio->nr_pages; i++) {
+		if (rbio->stripe_pages[i])
+			continue;
+		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!page)
+			return -ENOMEM;
+		rbio->stripe_pages[i] = page;
+	}
+	return 0;
+}
+
+/*
+ * add a single page from a specific stripe into our list of bios for IO
+ * this will try to merge into existing bios if possible, and returns
+ * zero if all went well.
+ */
+static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
+			    struct bio_list *bio_list,
+			    struct page *page,
+			    int stripe_nr,
+			    unsigned long page_index,
+			    unsigned long bio_max_len)
+{
+	struct bio *last = bio_list->tail;
+	u64 last_end = 0;
+	int ret;
+	struct bio *bio;
+	struct btrfs_bio_stripe *stripe;
+	u64 disk_start;
+
+	stripe = &rbio->bbio->stripes[stripe_nr];
+	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
+
+	/* if the device is missing, just fail this stripe */
+	if (!stripe->dev->bdev)
+		return fail_rbio_index(rbio, stripe_nr);
+
+	/* see if we can add this page onto our existing bio */
+	if (last) {
+		last_end = (u64)last->bi_sector << 9;
+		last_end += last->bi_size;
+
+		/*
+		 * we can't merge these if they are from different
+		 * devices or if they are not contiguous
+		 */
+		if (last_end == disk_start && stripe->dev->bdev &&
+		    test_bit(BIO_UPTODATE, &last->bi_flags) &&
+		    last->bi_bdev == stripe->dev->bdev) {
+			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
+			if (ret == PAGE_CACHE_SIZE)
+				return 0;
+		}
+	}
+
+	/* put a new bio on the list */
+	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_size = 0;
+	bio->bi_bdev = stripe->dev->bdev;
+	bio->bi_sector = disk_start >> 9;
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+	bio_list_add(bio_list, bio);
+	return 0;
+}
+
+/*
+ * while we're doing the read/modify/write cycle, we could
+ * have errors in reading pages off the disk.  This checks
+ * for errors and if we're not able to read the page it'll
+ * trigger parity reconstruction.  The rmw will be finished
+ * after we've reconstructed the failed stripes
+ */
+static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
+{
+	if (rbio->faila >= 0 || rbio->failb >= 0) {
+		BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
+		__raid56_parity_recover(rbio);
+	} else {
+		finish_rmw(rbio);
+	}
+}
+
+/*
+ * these are just the pages from the rbio array, not from anything
+ * the FS sent down to us
+ */
+static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
+{
+	int index;
+	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
+	index += page;
+	return rbio->stripe_pages[index];
+}
+
+/*
+ * helper function to walk our bio list and populate the bio_pages array with
+ * the result.  This seems expensive, but it is faster than constantly
+ * searching through the bio list as we setup the IO in finish_rmw or stripe
+ * reconstruction.
+ *
+ * This must be called before you trust the answers from page_in_rbio
+ */
+static void index_rbio_pages(struct btrfs_raid_bio *rbio)
+{
+	struct bio *bio;
+	u64 start;
+	unsigned long stripe_offset;
+	unsigned long page_index;
+	struct page *p;
+	int i;
+
+	spin_lock_irq(&rbio->bio_list_lock);
+	bio_list_for_each(bio, &rbio->bio_list) {
+		start = (u64)bio->bi_sector << 9;
+		stripe_offset = start - rbio->raid_map[0];
+		page_index = stripe_offset >> PAGE_CACHE_SHIFT;
+
+		for (i = 0; i < bio->bi_vcnt; i++) {
+			p = bio->bi_io_vec[i].bv_page;
+			rbio->bio_pages[page_index + i] = p;
+		}
+	}
+	spin_unlock_irq(&rbio->bio_list_lock);
+}
+
+/*
+ * this is called from one of two situations.  We either
+ * have a full stripe from the higher layers, or we've read all
+ * the missing bits off disk.
+ *
+ * This will calculate the parity and then send down any
+ * changed blocks.
+ */
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_bio *bbio = rbio->bbio;
+	void *pointers[bbio->num_stripes];
+	int stripe_len = rbio->stripe_len;
+	int nr_data = rbio->nr_data;
+	int stripe;
+	int pagenr;
+	int p_stripe = -1;
+	int q_stripe = -1;
+	struct bio_list bio_list;
+	struct bio *bio;
+	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
+	int ret;
+
+	bio_list_init(&bio_list);
+
+	if (bbio->num_stripes - rbio->nr_data == 1) {
+		p_stripe = bbio->num_stripes - 1;
+	} else if (bbio->num_stripes - rbio->nr_data == 2) {
+		p_stripe = bbio->num_stripes - 2;
+		q_stripe = bbio->num_stripes - 1;
+	} else {
+		BUG();
+	}
+
+	/* at this point we either have a full stripe,
+	 * or we've read the full stripe from the drive.
+	 * recalculate the parity and write the new results.
+	 *
+	 * We're not allowed to add any new bios to the
+	 * bio list here, anyone else that wants to
+	 * change this stripe needs to do their own rmw.
+	 */
+	spin_lock_irq(&rbio->bio_list_lock);
+	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+	spin_unlock_irq(&rbio->bio_list_lock);
+
+	atomic_set(&rbio->bbio->error, 0);
+
+	/*
+	 * now that we've set rmw_locked, run through the
+	 * bio list one last time and map the page pointers
+	 *
+	 * We don't cache full rbios because we're assuming
+	 * the higher layers are unlikely to use this area of
+	 * the disk again soon.  If they do use it again,
+	 * hopefully they will send another full bio.
+	 */
+	index_rbio_pages(rbio);
+	if (!rbio_is_full(rbio))
+		cache_rbio_pages(rbio);
+	else
+		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+		struct page *p;
+		/* first collect one page from each data stripe */
+		for (stripe = 0; stripe < nr_data; stripe++) {
+			p = page_in_rbio(rbio, stripe, pagenr, 0);
+			pointers[stripe] = kmap(p);
+		}
+
+		/* then add the parity stripe */
+		p = rbio_pstripe_page(rbio, pagenr);
+		SetPageUptodate(p);
+		pointers[stripe++] = kmap(p);
+
+		if (q_stripe != -1) {
+
+			/*
+			 * raid6, add the qstripe and call the
+			 * library function to fill in our p/q
+			 */
+			p = rbio_qstripe_page(rbio, pagenr);
+			SetPageUptodate(p);
+			pointers[stripe++] = kmap(p);
+
+			raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
+						pointers);
+		} else {
+			/* raid5 */
+			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
+			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+		}
+
+
+		for (stripe = 0; stripe < bbio->num_stripes; stripe++)
+			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+	}
+
+	/*
+	 * time to start writing.  Make bios for everything from the
+	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
+	 * everything else.
+	 */
+	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+			struct page *page;
+			if (stripe < rbio->nr_data) {
+				page = page_in_rbio(rbio, stripe, pagenr, 1);
+				if (!page)
+					continue;
+			} else {
+			       page = rbio_stripe_page(rbio, stripe, pagenr);
+			}
+
+			ret = rbio_add_io_page(rbio, &bio_list,
+				       page, stripe, pagenr, rbio->stripe_len);
+			if (ret)
+				goto cleanup;
+		}
+	}
+
+	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
+	BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
+
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_write_end_io;
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(WRITE, bio);
+	}
+	return;
+
+cleanup:
+	rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+/*
+ * helper to find the stripe number for a given bio.  Used to figure out which
+ * stripe has failed.  This expects the bio to correspond to a physical disk,
+ * so it looks up based on physical sector numbers.
+ */
+static int find_bio_stripe(struct btrfs_raid_bio *rbio,
+			   struct bio *bio)
+{
+	u64 physical = bio->bi_sector;
+	u64 stripe_start;
+	int i;
+	struct btrfs_bio_stripe *stripe;
+
+	physical <<= 9;
+
+	for (i = 0; i < rbio->bbio->num_stripes; i++) {
+		stripe = &rbio->bbio->stripes[i];
+		stripe_start = stripe->physical;
+		if (physical >= stripe_start &&
+		    physical < stripe_start + rbio->stripe_len) {
+			return i;
+		}
+	}
+	return -1;
+}
+
+/*
+ * helper to find the stripe number for a given
+ * bio (before mapping).  Used to figure out which stripe has
+ * failed.  This looks up based on logical block numbers.
+ */
+static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
+				   struct bio *bio)
+{
+	u64 logical = bio->bi_sector;
+	u64 stripe_start;
+	int i;
+
+	logical <<= 9;
+
+	for (i = 0; i < rbio->nr_data; i++) {
+		stripe_start = rbio->raid_map[i];
+		if (logical >= stripe_start &&
+		    logical < stripe_start + rbio->stripe_len) {
+			return i;
+		}
+	}
+	return -1;
+}
+
+/*
+ * returns -EIO if we had too many failures
+ */
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&rbio->bio_list_lock, flags);
+
+	/* we already know this stripe is bad, move on */
+	if (rbio->faila == failed || rbio->failb == failed)
+		goto out;
+
+	if (rbio->faila == -1) {
+		/* first failure on this rbio */
+		rbio->faila = failed;
+		atomic_inc(&rbio->bbio->error);
+	} else if (rbio->failb == -1) {
+		/* second failure on this rbio */
+		rbio->failb = failed;
+		atomic_inc(&rbio->bbio->error);
+	} else {
+		ret = -EIO;
+	}
+out:
+	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+
+	return ret;
+}
+
+/*
+ * helper to fail a stripe based on a physical disk
+ * bio.
+ */
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
+			   struct bio *bio)
+{
+	int failed = find_bio_stripe(rbio, bio);
+
+	if (failed < 0)
+		return -EIO;
+
+	return fail_rbio_index(rbio, failed);
+}
+
+/*
+ * this sets each page in the bio uptodate.  It should only be used on private
+ * rbio pages, nothing that comes in from the higher layers
+ */
+static void set_bio_pages_uptodate(struct bio *bio)
+{
+	int i;
+	struct page *p;
+
+	for (i = 0; i < bio->bi_vcnt; i++) {
+		p = bio->bi_io_vec[i].bv_page;
+		SetPageUptodate(p);
+	}
+}
+
+/*
+ * end io for the read phase of the rmw cycle.  All the bios here are physical
+ * stripe bios we've read from the disk so we can recalculate the parity of the
+ * stripe.
+ *
+ * This will usually kick off finish_rmw once all the bios are read in, but it
+ * may trigger parity reconstruction if we had any errors along the way
+ */
+static void raid_rmw_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	if (err)
+		fail_bio_stripe(rbio, bio);
+	else
+		set_bio_pages_uptodate(bio);
+
+	bio_put(bio);
+
+	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+		return;
+
+	err = 0;
+	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+		goto cleanup;
+
+	/*
+	 * this will normally call finish_rmw to start our write
+	 * but if there are any failed stripes we'll reconstruct
+	 * from parity first
+	 */
+	validate_rbio_for_rmw(rbio);
+	return;
+
+cleanup:
+
+	rbio_orig_end_io(rbio, -EIO, 0);
+}
+
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+	rbio->work.flags = 0;
+	rbio->work.func = rmw_work;
+
+	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
+			   &rbio->work);
+}
+
+static void async_read_rebuild(struct btrfs_raid_bio *rbio)
+{
+	rbio->work.flags = 0;
+	rbio->work.func = read_rebuild_work;
+
+	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
+			   &rbio->work);
+}
+
+/*
+ * the stripe must be locked by the caller.  It will
+ * unlock after all the writes are done
+ */
+static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+	int bios_to_read = 0;
+	struct btrfs_bio *bbio = rbio->bbio;
+	struct bio_list bio_list;
+	int ret;
+	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	int pagenr;
+	int stripe;
+	struct bio *bio;
+
+	bio_list_init(&bio_list);
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		goto cleanup;
+
+	index_rbio_pages(rbio);
+
+	atomic_set(&rbio->bbio->error, 0);
+	/*
+	 * build a list of bios to read all the missing parts of this
+	 * stripe
+	 */
+	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+			struct page *page;
+			/*
+			 * we want to find all the pages missing from
+			 * the rbio and read them from the disk.  If
+			 * page_in_rbio finds a page in the bio list
+			 * we don't need to read it off the stripe.
+			 */
+			page = page_in_rbio(rbio, stripe, pagenr, 1);
+			if (page)
+				continue;
+
+			page = rbio_stripe_page(rbio, stripe, pagenr);
+			/*
+			 * the bio cache may have handed us an uptodate
+			 * page.  If so, be happy and use it
+			 */
+			if (PageUptodate(page))
+				continue;
+
+			ret = rbio_add_io_page(rbio, &bio_list, page,
+				       stripe, pagenr, rbio->stripe_len);
+			if (ret)
+				goto cleanup;
+		}
+	}
+
+	bios_to_read = bio_list_size(&bio_list);
+	if (!bios_to_read) {
+		/*
+		 * this can happen if others have merged with
+		 * us, it means there is nothing left to read.
+		 * But if there are missing devices it may not be
+		 * safe to do the full stripe write yet.
+		 */
+		goto finish;
+	}
+
+	/*
+	 * the bbio may be freed once we submit the last bio.  Make sure
+	 * not to touch it after that
+	 */
+	atomic_set(&bbio->stripes_pending, bios_to_read);
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_rmw_end_io;
+
+		btrfs_bio_wq_end_io(rbio->fs_info, bio,
+				    BTRFS_WQ_ENDIO_RAID56);
+
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(READ, bio);
+	}
+	/* the actual write will happen once the reads are done */
+	return 0;
+
+cleanup:
+	rbio_orig_end_io(rbio, -EIO, 0);
+	return -EIO;
+
+finish:
+	validate_rbio_for_rmw(rbio);
+	return 0;
+}
+
+/*
+ * if the upper layers pass in a full stripe, we thank them by only allocating
+ * enough pages to hold the parity, and sending it all down quickly.
+ */
+static int full_stripe_write(struct btrfs_raid_bio *rbio)
+{
+	int ret;
+
+	ret = alloc_rbio_parity_pages(rbio);
+	if (ret)
+		return ret;
+
+	ret = lock_stripe_add(rbio);
+	if (ret == 0)
+		finish_rmw(rbio);
+	return 0;
+}
+
+/*
+ * partial stripe writes get handed over to async helpers.
+ * We're really hoping to merge a few more writes into this
+ * rbio before calculating new parity
+ */
+static int partial_stripe_write(struct btrfs_raid_bio *rbio)
+{
+	int ret;
+
+	ret = lock_stripe_add(rbio);
+	if (ret == 0)
+		async_rmw_stripe(rbio);
+	return 0;
+}
+
+/*
+ * sometimes while we were reading from the drive to
+ * recalculate parity, enough new bios come into create
+ * a full stripe.  So we do a check here to see if we can
+ * go directly to finish_rmw
+ */
+static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
+{
+	/* head off into rmw land if we don't have a full stripe */
+	if (!rbio_is_full(rbio))
+		return partial_stripe_write(rbio);
+	return full_stripe_write(rbio);
+}
+
+/*
+ * We use plugging call backs to collect full stripes.
+ * Any time we get a partial stripe write while plugged
+ * we collect it into a list.  When the unplug comes down,
+ * we sort the list by logical block number and merge
+ * everything we can into the same rbios
+ */
+struct btrfs_plug_cb {
+	struct blk_plug_cb cb;
+	struct btrfs_fs_info *info;
+	struct list_head rbio_list;
+	struct btrfs_work work;
+};
+
+/*
+ * rbios on the plug list are sorted for easier merging.
+ */
+static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
+						 plug_list);
+	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
+						 plug_list);
+	u64 a_sector = ra->bio_list.head->bi_sector;
+	u64 b_sector = rb->bio_list.head->bi_sector;
+
+	if (a_sector < b_sector)
+		return -1;
+	if (a_sector > b_sector)
+		return 1;
+	return 0;
+}
+
+static void run_plug(struct btrfs_plug_cb *plug)
+{
+	struct btrfs_raid_bio *cur;
+	struct btrfs_raid_bio *last = NULL;
+
+	/*
+	 * sort our plug list then try to merge
+	 * everything we can in hopes of creating full
+	 * stripes.
+	 */
+	list_sort(NULL, &plug->rbio_list, plug_cmp);
+	while (!list_empty(&plug->rbio_list)) {
+		cur = list_entry(plug->rbio_list.next,
+				 struct btrfs_raid_bio, plug_list);
+		list_del_init(&cur->plug_list);
+
+		if (rbio_is_full(cur)) {
+			/* we have a full stripe, send it down */
+			full_stripe_write(cur);
+			continue;
+		}
+		if (last) {
+			if (rbio_can_merge(last, cur)) {
+				merge_rbio(last, cur);
+				__free_raid_bio(cur);
+				continue;
+
+			}
+			__raid56_parity_write(last);
+		}
+		last = cur;
+	}
+	if (last) {
+		__raid56_parity_write(last);
+	}
+	kfree(plug);
+}
+
+/*
+ * if the unplug comes from schedule, we have to push the
+ * work off to a helper thread
+ */
+static void unplug_work(struct btrfs_work *work)
+{
+	struct btrfs_plug_cb *plug;
+	plug = container_of(work, struct btrfs_plug_cb, work);
+	run_plug(plug);
+}
+
+static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+	struct btrfs_plug_cb *plug;
+	plug = container_of(cb, struct btrfs_plug_cb, cb);
+
+	if (from_schedule) {
+		plug->work.flags = 0;
+		plug->work.func = unplug_work;
+		btrfs_queue_worker(&plug->info->rmw_workers,
+				   &plug->work);
+		return;
+	}
+	run_plug(plug);
+}
+
+/*
+ * our main entry point for writes from the rest of the FS.
+ */
+int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+			struct btrfs_bio *bbio, u64 *raid_map,
+			u64 stripe_len)
+{
+	struct btrfs_raid_bio *rbio;
+	struct btrfs_plug_cb *plug = NULL;
+	struct blk_plug_cb *cb;
+
+	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+	if (IS_ERR(rbio)) {
+		kfree(raid_map);
+		kfree(bbio);
+		return PTR_ERR(rbio);
+	}
+	bio_list_add(&rbio->bio_list, bio);
+	rbio->bio_list_bytes = bio->bi_size;
+
+	/*
+	 * don't plug on full rbios, just get them out the door
+	 * as quickly as we can
+	 */
+	if (rbio_is_full(rbio))
+		return full_stripe_write(rbio);
+
+	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
+			       sizeof(*plug));
+	if (cb) {
+		plug = container_of(cb, struct btrfs_plug_cb, cb);
+		if (!plug->info) {
+			plug->info = root->fs_info;
+			INIT_LIST_HEAD(&plug->rbio_list);
+		}
+		list_add_tail(&rbio->plug_list, &plug->rbio_list);
+	} else {
+		return __raid56_parity_write(rbio);
+	}
+	return 0;
+}
+
+/*
+ * all parity reconstruction happens here.  We've read in everything
+ * we can find from the drives and this does the heavy lifting of
+ * sorting the good from the bad.
+ */
+static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
+{
+	int pagenr, stripe;
+	void **pointers;
+	int faila = -1, failb = -1;
+	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	struct page *page;
+	int err;
+	int i;
+
+	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
+			   GFP_NOFS);
+	if (!pointers) {
+		err = -ENOMEM;
+		goto cleanup_io;
+	}
+
+	faila = rbio->faila;
+	failb = rbio->failb;
+
+	if (rbio->read_rebuild) {
+		spin_lock_irq(&rbio->bio_list_lock);
+		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
+		spin_unlock_irq(&rbio->bio_list_lock);
+	}
+
+	index_rbio_pages(rbio);
+
+	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+		/* setup our array of pointers with pages
+		 * from each stripe
+		 */
+		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+			/*
+			 * if we're rebuilding a read, we have to use
+			 * pages from the bio list
+			 */
+			if (rbio->read_rebuild &&
+			    (stripe == faila || stripe == failb)) {
+				page = page_in_rbio(rbio, stripe, pagenr, 0);
+			} else {
+				page = rbio_stripe_page(rbio, stripe, pagenr);
+			}
+			pointers[stripe] = kmap(page);
+		}
+
+		/* all raid6 handling here */
+		if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
+		    RAID6_Q_STRIPE) {
+
+			/*
+			 * single failure, rebuild from parity raid5
+			 * style
+			 */
+			if (failb < 0) {
+				if (faila == rbio->nr_data) {
+					/*
+					 * Just the P stripe has failed, without
+					 * a bad data or Q stripe.
+					 * TODO, we should redo the xor here.
+					 */
+					err = -EIO;
+					goto cleanup;
+				}
+				/*
+				 * a single failure in raid6 is rebuilt
+				 * in the pstripe code below
+				 */
+				goto pstripe;
+			}
+
+			/* make sure our ps and qs are in order */
+			if (faila > failb) {
+				int tmp = failb;
+				failb = faila;
+				faila = tmp;
+			}
+
+			/* if the q stripe is failed, do a pstripe reconstruction
+			 * from the xors.
+			 * If both the q stripe and the P stripe are failed, we're
+			 * here due to a crc mismatch and we can't give them the
+			 * data they want
+			 */
+			if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
+				if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
+					err = -EIO;
+					goto cleanup;
+				}
+				/*
+				 * otherwise we have one bad data stripe and
+				 * a good P stripe.  raid5!
+				 */
+				goto pstripe;
+			}
+
+			if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
+				raid6_datap_recov(rbio->bbio->num_stripes,
+						  PAGE_SIZE, faila, pointers);
+			} else {
+				raid6_2data_recov(rbio->bbio->num_stripes,
+						  PAGE_SIZE, faila, failb,
+						  pointers);
+			}
+		} else {
+			void *p;
+
+			/* rebuild from P stripe here (raid5 or raid6) */
+			BUG_ON(failb != -1);
+pstripe:
+			/* Copy parity block into failed block to start with */
+			memcpy(pointers[faila],
+			       pointers[rbio->nr_data],
+			       PAGE_CACHE_SIZE);
+
+			/* rearrange the pointer array */
+			p = pointers[faila];
+			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
+				pointers[stripe] = pointers[stripe + 1];
+			pointers[rbio->nr_data - 1] = p;
+
+			/* xor in the rest */
+			run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
+		}
+		/* if we're doing this rebuild as part of an rmw, go through
+		 * and set all of our private rbio pages in the
+		 * failed stripes as uptodate.  This way finish_rmw will
+		 * know they can be trusted.  If this was a read reconstruction,
+		 * other endio functions will fiddle the uptodate bits
+		 */
+		if (!rbio->read_rebuild) {
+			for (i = 0;  i < nr_pages; i++) {
+				if (faila != -1) {
+					page = rbio_stripe_page(rbio, faila, i);
+					SetPageUptodate(page);
+				}
+				if (failb != -1) {
+					page = rbio_stripe_page(rbio, failb, i);
+					SetPageUptodate(page);
+				}
+			}
+		}
+		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
+			/*
+			 * if we're rebuilding a read, we have to use
+			 * pages from the bio list
+			 */
+			if (rbio->read_rebuild &&
+			    (stripe == faila || stripe == failb)) {
+				page = page_in_rbio(rbio, stripe, pagenr, 0);
+			} else {
+				page = rbio_stripe_page(rbio, stripe, pagenr);
+			}
+			kunmap(page);
+		}
+	}
+
+	err = 0;
+cleanup:
+	kfree(pointers);
+
+cleanup_io:
+
+	if (rbio->read_rebuild) {
+		if (err == 0)
+			cache_rbio_pages(rbio);
+		else
+			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
+
+		rbio_orig_end_io(rbio, err, err == 0);
+	} else if (err == 0) {
+		rbio->faila = -1;
+		rbio->failb = -1;
+		finish_rmw(rbio);
+	} else {
+		rbio_orig_end_io(rbio, err, 0);
+	}
+}
+
+/*
+ * This is called only for stripes we've read from disk to
+ * reconstruct the parity.
+ */
+static void raid_recover_end_io(struct bio *bio, int err)
+{
+	struct btrfs_raid_bio *rbio = bio->bi_private;
+
+	/*
+	 * we only read stripe pages off the disk, set them
+	 * up to date if there were no errors
+	 */
+	if (err)
+		fail_bio_stripe(rbio, bio);
+	else
+		set_bio_pages_uptodate(bio);
+	bio_put(bio);
+
+	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
+		return;
+
+	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
+		rbio_orig_end_io(rbio, -EIO, 0);
+	else
+		__raid_recover_end_io(rbio);
+}
+
+/*
+ * reads everything we need off the disk to reconstruct
+ * the parity. endio handlers trigger final reconstruction
+ * when the IO is done.
+ *
+ * This is used both for reads from the higher layers and for
+ * parity construction required to finish a rmw cycle.
+ */
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
+{
+	int bios_to_read = 0;
+	struct btrfs_bio *bbio = rbio->bbio;
+	struct bio_list bio_list;
+	int ret;
+	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	int pagenr;
+	int stripe;
+	struct bio *bio;
+
+	bio_list_init(&bio_list);
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		goto cleanup;
+
+	atomic_set(&rbio->bbio->error, 0);
+
+	/*
+	 * read everything that hasn't failed.  Thanks to the
+	 * stripe cache, it is possible that some or all of these
+	 * pages are going to be uptodate.
+	 */
+	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
+		if (rbio->faila == stripe ||
+		    rbio->failb == stripe)
+			continue;
+
+		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+			struct page *p;
+
+			/*
+			 * the rmw code may have already read this
+			 * page in
+			 */
+			p = rbio_stripe_page(rbio, stripe, pagenr);
+			if (PageUptodate(p))
+				continue;
+
+			ret = rbio_add_io_page(rbio, &bio_list,
+				       rbio_stripe_page(rbio, stripe, pagenr),
+				       stripe, pagenr, rbio->stripe_len);
+			if (ret < 0)
+				goto cleanup;
+		}
+	}
+
+	bios_to_read = bio_list_size(&bio_list);
+	if (!bios_to_read) {
+		/*
+		 * we might have no bios to read just because the pages
+		 * were up to date, or we might have no bios to read because
+		 * the devices were gone.
+		 */
+		if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
+			__raid_recover_end_io(rbio);
+			goto out;
+		} else {
+			goto cleanup;
+		}
+	}
+
+	/*
+	 * the bbio may be freed once we submit the last bio.  Make sure
+	 * not to touch it after that
+	 */
+	atomic_set(&bbio->stripes_pending, bios_to_read);
+	while (1) {
+		bio = bio_list_pop(&bio_list);
+		if (!bio)
+			break;
+
+		bio->bi_private = rbio;
+		bio->bi_end_io = raid_recover_end_io;
+
+		btrfs_bio_wq_end_io(rbio->fs_info, bio,
+				    BTRFS_WQ_ENDIO_RAID56);
+
+		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+		submit_bio(READ, bio);
+	}
+out:
+	return 0;
+
+cleanup:
+	if (rbio->read_rebuild)
+		rbio_orig_end_io(rbio, -EIO, 0);
+	return -EIO;
+}
+
+/*
+ * the main entry point for reads from the higher layers.  This
+ * is really only called when the normal read path had a failure,
+ * so we assume the bio they send down corresponds to a failed part
+ * of the drive.
+ */
+int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+			  struct btrfs_bio *bbio, u64 *raid_map,
+			  u64 stripe_len, int mirror_num)
+{
+	struct btrfs_raid_bio *rbio;
+	int ret;
+
+	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+	if (IS_ERR(rbio)) {
+		return PTR_ERR(rbio);
+	}
+
+	rbio->read_rebuild = 1;
+	bio_list_add(&rbio->bio_list, bio);
+	rbio->bio_list_bytes = bio->bi_size;
+
+	rbio->faila = find_logical_bio_stripe(rbio, bio);
+	if (rbio->faila == -1) {
+		BUG();
+		kfree(rbio);
+		return -EIO;
+	}
+
+	/*
+	 * reconstruct from the q stripe if they are
+	 * asking for mirror 3
+	 */
+	if (mirror_num == 3)
+		rbio->failb = bbio->num_stripes - 2;
+
+	ret = lock_stripe_add(rbio);
+
+	/*
+	 * __raid56_parity_recover will end the bio with
+	 * any errors it hits.  We don't want to return
+	 * its error value up the stack because our caller
+	 * will end up calling bio_endio with any nonzero
+	 * return
+	 */
+	if (ret == 0)
+		__raid56_parity_recover(rbio);
+	/*
+	 * our rbio has been added to the list of
+	 * rbios that will be handled after the
+	 * currently lock owner is done
+	 */
+	return 0;
+
+}
+
+static void rmw_work(struct btrfs_work *work)
+{
+	struct btrfs_raid_bio *rbio;
+
+	rbio = container_of(work, struct btrfs_raid_bio, work);
+	raid56_rmw_stripe(rbio);
+}
+
+static void read_rebuild_work(struct btrfs_work *work)
+{
+	struct btrfs_raid_bio *rbio;
+
+	rbio = container_of(work, struct btrfs_raid_bio, work);
+	__raid56_parity_recover(rbio);
+}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
new file mode 100644
index 00000000000..ea5d73bfdfb
--- /dev/null
+++ b/fs/btrfs/raid56.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2012 Fusion-io  All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_RAID56__
+#define __BTRFS_RAID56__
+static inline int nr_parity_stripes(struct map_lookup *map)
+{
+	if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+		return 1;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+		return 2;
+	else
+		return 0;
+}
+
+static inline int nr_data_stripes(struct map_lookup *map)
+{
+	return map->num_stripes - nr_parity_stripes(map);
+}
+#define RAID5_P_STRIPE ((u64)-2)
+#define RAID6_Q_STRIPE ((u64)-1)
+
+#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) ||		\
+			     ((x) == RAID6_Q_STRIPE))
+
+int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+				 struct btrfs_bio *bbio, u64 *raid_map,
+				 u64 stripe_len, int mirror_num);
+int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+			       struct btrfs_bio *bbio, u64 *raid_map,
+			       u64 stripe_len);
+
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
+#endif
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 96b93daa0bb..1031b69252c 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -955,10 +955,11 @@ int btrfs_reada_wait(void *handle)
 	while (atomic_read(&rc->elems)) {
 		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
 				   5 * HZ);
-		dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
+		dump_devs(rc->root->fs_info,
+			  atomic_read(&rc->elems) < 10 ? 1 : 0);
 	}
 
-	dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
+	dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
 
 	kref_put(&rc->refcnt, reada_control_release);
 
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 300e09ac365..12096496cc9 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -326,8 +326,7 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 	return NULL;
 }
 
-void backref_tree_panic(struct rb_node *rb_node, int errno,
-					  u64 bytenr)
+static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
 {
 
 	struct btrfs_fs_info *fs_info = NULL;
@@ -619,10 +618,13 @@ static noinline_for_stack
 int find_inline_backref(struct extent_buffer *leaf, int slot,
 			unsigned long *ptr, unsigned long *end)
 {
+	struct btrfs_key key;
 	struct btrfs_extent_item *ei;
 	struct btrfs_tree_block_info *bi;
 	u32 item_size;
 
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+
 	item_size = btrfs_item_size_nr(leaf, slot);
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 	if (item_size < sizeof(*ei)) {
@@ -634,13 +636,18 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
 	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
 		  BTRFS_EXTENT_FLAG_TREE_BLOCK));
 
-	if (item_size <= sizeof(*ei) + sizeof(*bi)) {
+	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+	    item_size <= sizeof(*ei) + sizeof(*bi)) {
 		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
 		return 1;
 	}
 
-	bi = (struct btrfs_tree_block_info *)(ei + 1);
-	*ptr = (unsigned long)(bi + 1);
+	if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+		bi = (struct btrfs_tree_block_info *)(ei + 1);
+		*ptr = (unsigned long)(bi + 1);
+	} else {
+		*ptr = (unsigned long)(ei + 1);
+	}
 	*end = (unsigned long)ei + item_size;
 	return 0;
 }
@@ -708,7 +715,7 @@ again:
 	end = 0;
 	ptr = 0;
 	key.objectid = cur->bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.type = BTRFS_METADATA_ITEM_KEY;
 	key.offset = (u64)-1;
 
 	path1->search_commit_root = 1;
@@ -766,7 +773,8 @@ again:
 				break;
 			}
 
-			if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+			if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+			    key.type == BTRFS_METADATA_ITEM_KEY) {
 				ret = find_inline_backref(eb, path1->slots[0],
 							  &ptr, &end);
 				if (ret)
@@ -1269,6 +1277,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
 	}
 	spin_unlock(&rc->reloc_root_tree.lock);
 
+	if (!node)
+		return 0;
 	BUG_ON((struct btrfs_root *)node->data != root);
 
 	if (!del) {
@@ -1295,6 +1305,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
 	struct extent_buffer *eb;
 	struct btrfs_root_item *root_item;
 	struct btrfs_key root_key;
+	u64 last_snap = 0;
 	int ret;
 
 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
@@ -1310,6 +1321,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
 				      BTRFS_TREE_RELOC_OBJECTID);
 		BUG_ON(ret);
 
+		last_snap = btrfs_root_last_snapshot(&root->root_item);
 		btrfs_set_root_last_snapshot(&root->root_item,
 					     trans->transid - 1);
 	} else {
@@ -1335,6 +1347,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
 		memset(&root_item->drop_progress, 0,
 		       sizeof(struct btrfs_disk_key));
 		root_item->drop_level = 0;
+		/*
+		 * abuse rtransid, it is safe because it is impossible to
+		 * receive data into a relocation tree.
+		 */
+		btrfs_set_root_rtransid(root_item, last_snap);
+		btrfs_set_root_otransid(root_item, trans->transid);
 	}
 
 	btrfs_tree_unlock(eb);
@@ -1345,8 +1363,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	kfree(root_item);
 
-	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
-						 &root_key);
+	reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
 	BUG_ON(IS_ERR(reloc_root));
 	reloc_root->last_trans = trans->transid;
 	return reloc_root;
@@ -1760,7 +1777,11 @@ again:
 
 			eb = read_tree_block(dest, old_bytenr, blocksize,
 					     old_ptr_gen);
-			BUG_ON(!eb);
+			if (!eb || !extent_buffer_uptodate(eb)) {
+				ret = (!eb) ? -ENOMEM : -EIO;
+				free_extent_buffer(eb);
+				break;
+			}
 			btrfs_tree_lock(eb);
 			if (cow) {
 				ret = btrfs_cow_block(trans, dest, eb, parent,
@@ -1913,6 +1934,10 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
 		blocksize = btrfs_level_size(root, i - 1);
 		eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
+		if (!eb || !extent_buffer_uptodate(eb)) {
+			free_extent_buffer(eb);
+			return -EIO;
+		}
 		BUG_ON(btrfs_header_level(eb) != i - 1);
 		path->nodes[i - 1] = eb;
 		path->slots[i - 1] = 0;
@@ -2238,13 +2263,32 @@ again:
 }
 
 static noinline_for_stack
+void free_reloc_roots(struct list_head *list)
+{
+	struct btrfs_root *reloc_root;
+
+	while (!list_empty(list)) {
+		reloc_root = list_entry(list->next, struct btrfs_root,
+					root_list);
+		__update_reloc_root(reloc_root, 1);
+		free_extent_buffer(reloc_root->node);
+		free_extent_buffer(reloc_root->commit_root);
+		kfree(reloc_root);
+	}
+}
+
+static noinline_for_stack
 int merge_reloc_roots(struct reloc_control *rc)
 {
+	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root;
 	struct btrfs_root *reloc_root;
+	u64 last_snap;
+	u64 otransid;
+	u64 objectid;
 	LIST_HEAD(reloc_roots);
 	int found = 0;
-	int ret;
+	int ret = 0;
 again:
 	root = rc->extent_root;
 
@@ -2270,20 +2314,65 @@ again:
 			BUG_ON(root->reloc_root != reloc_root);
 
 			ret = merge_reloc_root(rc, root);
-			BUG_ON(ret);
+			if (ret)
+				goto out;
 		} else {
 			list_del_init(&reloc_root->root_list);
 		}
+
+		/*
+		 * we keep the old last snapshod transid in rtranid when we
+		 * created the relocation tree.
+		 */
+		last_snap = btrfs_root_rtransid(&reloc_root->root_item);
+		otransid = btrfs_root_otransid(&reloc_root->root_item);
+		objectid = reloc_root->root_key.offset;
+
 		ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
-		BUG_ON(ret < 0);
+		if (ret < 0) {
+			if (list_empty(&reloc_root->root_list))
+				list_add_tail(&reloc_root->root_list,
+					      &reloc_roots);
+			goto out;
+		} else if (!ret) {
+			/*
+			 * recover the last snapshot tranid to avoid
+			 * the space balance break NOCOW.
+			 */
+			root = read_fs_root(rc->extent_root->fs_info,
+					    objectid);
+			if (IS_ERR(root))
+				continue;
+
+			if (btrfs_root_refs(&root->root_item) == 0)
+				continue;
+
+			trans = btrfs_join_transaction(root);
+			BUG_ON(IS_ERR(trans));
+
+			/* Check if the fs/file tree was snapshoted or not. */
+			if (btrfs_root_last_snapshot(&root->root_item) ==
+			    otransid - 1)
+				btrfs_set_root_last_snapshot(&root->root_item,
+							     last_snap);
+				
+			btrfs_end_transaction(trans, root);
+		}
 	}
 
 	if (found) {
 		found = 0;
 		goto again;
 	}
+out:
+	if (ret) {
+		btrfs_std_error(root->fs_info, ret);
+		if (!list_empty(&reloc_roots))
+			free_reloc_roots(&reloc_roots);
+	}
+
 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
-	return 0;
+	return ret;
 }
 
 static void free_block_list(struct rb_root *blocks)
@@ -2562,7 +2651,8 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 		blocksize = btrfs_level_size(root, node->level);
 		generation = btrfs_node_ptr_generation(upper->eb, slot);
 		eb = read_tree_block(root, bytenr, blocksize, generation);
-		if (!eb) {
+		if (!eb || !extent_buffer_uptodate(eb)) {
+			free_extent_buffer(eb);
 			err = -EIO;
 			goto next;
 		}
@@ -2723,7 +2813,10 @@ static int get_tree_block_key(struct reloc_control *rc,
 	BUG_ON(block->key_ready);
 	eb = read_tree_block(rc->extent_root, block->bytenr,
 			     block->key.objectid, block->key.offset);
-	BUG_ON(!eb);
+	if (!eb || !extent_buffer_uptodate(eb)) {
+		free_extent_buffer(eb);
+		return -EIO;
+	}
 	WARN_ON(btrfs_header_level(eb) != block->level);
 	if (block->level == 0)
 		btrfs_item_key_to_cpu(eb, &block->key, 0);
@@ -2738,8 +2831,13 @@ static int reada_tree_block(struct reloc_control *rc,
 			    struct tree_block *block)
 {
 	BUG_ON(block->key_ready);
-	readahead_tree_block(rc->extent_root, block->bytenr,
-			     block->key.objectid, block->key.offset);
+	if (block->key.type == BTRFS_METADATA_ITEM_KEY)
+		readahead_tree_block(rc->extent_root, block->bytenr,
+				     block->key.objectid,
+				     rc->extent_root->leafsize);
+	else
+		readahead_tree_block(rc->extent_root, block->bytenr,
+				     block->key.objectid, block->key.offset);
 	return 0;
 }
 
@@ -2818,8 +2916,10 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 	int err = 0;
 
 	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	if (!path) {
+		err = -ENOMEM;
+		goto out_free_blocks;
+	}
 
 	rb_node = rb_first(blocks);
 	while (rb_node) {
@@ -2832,8 +2932,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 	rb_node = rb_first(blocks);
 	while (rb_node) {
 		block = rb_entry(rb_node, struct tree_block, rb_node);
-		if (!block->key_ready)
-			get_tree_block_key(rc, block);
+		if (!block->key_ready) {
+			err = get_tree_block_key(rc, block);
+			if (err)
+				goto out_free_path;
+		}
 		rb_node = rb_next(rb_node);
 	}
 
@@ -2858,10 +2961,12 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
 		rb_node = rb_next(rb_node);
 	}
 out:
-	free_block_list(blocks);
 	err = finish_pending_nodes(trans, rc, path, err);
 
+out_free_path:
 	btrfs_free_path(path);
+out_free_blocks:
+	free_block_list(blocks);
 	return err;
 }
 
@@ -2932,7 +3037,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
 	lock_extent(&BTRFS_I(inode)->io_tree, start, end);
 	while (1) {
 		write_lock(&em_tree->lock);
-		ret = add_extent_mapping(em_tree, em);
+		ret = add_extent_mapping(em_tree, em, 0);
 		write_unlock(&em_tree->lock);
 		if (ret != -EEXIST) {
 			free_extent_map(em);
@@ -3017,7 +3122,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
 			}
 		}
 
-		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+		page_start = page_offset(page);
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 
 		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
@@ -3143,12 +3248,17 @@ static int add_tree_block(struct reloc_control *rc,
 	eb =  path->nodes[0];
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
 
-	if (item_size >= sizeof(*ei) + sizeof(*bi)) {
+	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
+	    item_size >= sizeof(*ei) + sizeof(*bi)) {
 		ei = btrfs_item_ptr(eb, path->slots[0],
 				struct btrfs_extent_item);
-		bi = (struct btrfs_tree_block_info *)(ei + 1);
+		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
+			bi = (struct btrfs_tree_block_info *)(ei + 1);
+			level = btrfs_tree_block_level(eb, bi);
+		} else {
+			level = (int)extent_key->offset;
+		}
 		generation = btrfs_extent_generation(eb, ei);
-		level = btrfs_tree_block_level(eb, bi);
 	} else {
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 		u64 ref_owner;
@@ -3177,7 +3287,7 @@ static int add_tree_block(struct reloc_control *rc,
 		return -ENOMEM;
 
 	block->bytenr = extent_key->objectid;
-	block->key.objectid = extent_key->offset;
+	block->key.objectid = rc->extent_root->leafsize;
 	block->key.offset = generation;
 	block->level = level;
 	block->key_ready = 0;
@@ -3199,6 +3309,8 @@ static int __add_tree_block(struct reloc_control *rc,
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	int ret;
+	bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
+					SKINNY_METADATA);
 
 	if (tree_block_processed(bytenr, blocksize, rc))
 		return 0;
@@ -3209,19 +3321,42 @@ static int __add_tree_block(struct reloc_control *rc,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-
+again:
 	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = blocksize;
+	if (skinny) {
+		key.type = BTRFS_METADATA_ITEM_KEY;
+		key.offset = (u64)-1;
+	} else {
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = blocksize;
+	}
 
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
+
+	if (ret > 0 && skinny) {
+		if (path->slots[0]) {
+			path->slots[0]--;
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0]);
+			if (key.objectid == bytenr &&
+			    (key.type == BTRFS_METADATA_ITEM_KEY ||
+			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
+			      key.offset == blocksize)))
+				ret = 0;
+		}
+
+		if (ret) {
+			skinny = false;
+			btrfs_release_path(path);
+			goto again;
+		}
+	}
 	BUG_ON(ret);
 
-	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 	ret = add_tree_block(rc, &key, path, blocks);
 out:
 	btrfs_free_path(path);
@@ -3242,7 +3377,8 @@ static int block_use_full_backref(struct reloc_control *rc,
 		return 1;
 
 	ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
-				       eb->start, eb->len, NULL, &flags);
+				       eb->start, btrfs_header_level(eb), 1,
+				       NULL, &flags);
 	BUG_ON(ret);
 
 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
@@ -3276,6 +3412,11 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
 	}
 
 truncate:
+	ret = btrfs_check_trunc_cache_free_space(root,
+						 &fs_info->global_block_rsv);
+	if (ret)
+		goto out;
+
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
@@ -3472,7 +3613,7 @@ out:
 }
 
 /*
- * hepler to find all tree blocks that reference a given data extent
+ * helper to find all tree blocks that reference a given data extent
  */
 static noinline_for_stack
 int add_data_references(struct reloc_control *rc,
@@ -3566,7 +3707,7 @@ int add_data_references(struct reloc_control *rc,
 }
 
 /*
- * hepler to find next unprocessed extent
+ * helper to find next unprocessed extent
  */
 static noinline_for_stack
 int find_next_extent(struct btrfs_trans_handle *trans,
@@ -3611,12 +3752,25 @@ next:
 			break;
 		}
 
-		if (key.type != BTRFS_EXTENT_ITEM_KEY ||
+		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+		    key.type != BTRFS_METADATA_ITEM_KEY) {
+			path->slots[0]++;
+			goto next;
+		}
+
+		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
 		    key.objectid + key.offset <= rc->search_start) {
 			path->slots[0]++;
 			goto next;
 		}
 
+		if (key.type == BTRFS_METADATA_ITEM_KEY &&
+		    key.objectid + rc->extent_root->leafsize <=
+		    rc->search_start) {
+			path->slots[0]++;
+			goto next;
+		}
+
 		ret = find_first_extent_bit(&rc->processed_blocks,
 					    key.objectid, &start, &end,
 					    EXTENT_DIRTY, NULL);
@@ -3625,7 +3779,11 @@ next:
 			btrfs_release_path(path);
 			rc->search_start = end + 1;
 		} else {
-			rc->search_start = key.objectid + key.offset;
+			if (key.type == BTRFS_EXTENT_ITEM_KEY)
+				rc->search_start = key.objectid + key.offset;
+			else
+				rc->search_start = key.objectid +
+					rc->extent_root->leafsize;
 			memcpy(extent_key, &key, sizeof(key));
 			return 0;
 		}
@@ -3698,7 +3856,15 @@ int prepare_to_relocate(struct reloc_control *rc)
 	set_reloc_control(rc);
 
 	trans = btrfs_join_transaction(rc->extent_root);
-	BUG_ON(IS_ERR(trans));
+	if (IS_ERR(trans)) {
+		unset_reloc_control(rc);
+		/*
+		 * extent tree is not a ref_cow tree and has no reloc_root to
+		 * cleanup.  And callers are responsible to free the above
+		 * block rsv.
+		 */
+		return PTR_ERR(trans);
+	}
 	btrfs_commit_transaction(trans, rc->extent_root);
 	return 0;
 }
@@ -3730,7 +3896,11 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 	while (1) {
 		progress++;
 		trans = btrfs_start_transaction(rc->extent_root, 0);
-		BUG_ON(IS_ERR(trans));
+		if (IS_ERR(trans)) {
+			err = PTR_ERR(trans);
+			trans = NULL;
+			break;
+		}
 restart:
 		if (update_backref_cache(trans, &rc->backref_cache)) {
 			btrfs_end_transaction(trans, rc->extent_root);
@@ -3974,7 +4144,7 @@ out:
 	return inode;
 }
 
-static struct reloc_control *alloc_reloc_control(void)
+static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
 {
 	struct reloc_control *rc;
 
@@ -3985,7 +4155,8 @@ static struct reloc_control *alloc_reloc_control(void)
 	INIT_LIST_HEAD(&rc->reloc_roots);
 	backref_cache_init(&rc->backref_cache);
 	mapping_tree_init(&rc->reloc_root_tree);
-	extent_io_tree_init(&rc->processed_blocks, NULL);
+	extent_io_tree_init(&rc->processed_blocks,
+			    fs_info->btree_inode->i_mapping);
 	return rc;
 }
 
@@ -4002,7 +4173,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	int rw = 0;
 	int err = 0;
 
-	rc = alloc_reloc_control();
+	rc = alloc_reloc_control(fs_info);
 	if (!rc)
 		return -ENOMEM;
 
@@ -4051,19 +4222,16 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	       (unsigned long long)rc->block_group->key.objectid,
 	       (unsigned long long)rc->block_group->flags);
 
-	ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+	ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
 	if (ret < 0) {
 		err = ret;
 		goto out;
 	}
-	btrfs_wait_ordered_extents(fs_info->tree_root, 0);
+	btrfs_wait_all_ordered_extents(fs_info, 0);
 
 	while (1) {
 		mutex_lock(&fs_info->cleaner_mutex);
-
-		btrfs_clean_old_snapshots(fs_info->tree_root);
 		ret = relocate_block_group(rc);
-
 		mutex_unlock(&fs_info->cleaner_mutex);
 		if (ret < 0) {
 			err = ret;
@@ -4171,7 +4339,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 		    key.type != BTRFS_ROOT_ITEM_KEY)
 			break;
 
-		reloc_root = btrfs_read_fs_root_no_radix(root, &key);
+		reloc_root = btrfs_read_fs_root(root, &key);
 		if (IS_ERR(reloc_root)) {
 			err = PTR_ERR(reloc_root);
 			goto out;
@@ -4206,7 +4374,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 	if (list_empty(&reloc_roots))
 		goto out;
 
-	rc = alloc_reloc_control();
+	rc = alloc_reloc_control(root->fs_info);
 	if (!rc) {
 		err = -ENOMEM;
 		goto out;
@@ -4264,14 +4432,9 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 out_free:
 	kfree(rc);
 out:
-	while (!list_empty(&reloc_roots)) {
-		reloc_root = list_entry(reloc_roots.next,
-					struct btrfs_root, root_list);
-		list_del(&reloc_root->root_list);
-		free_extent_buffer(reloc_root->node);
-		free_extent_buffer(reloc_root->commit_root);
-		kfree(reloc_root);
-	}
+	if (!list_empty(&reloc_roots))
+		free_reloc_roots(&reloc_roots);
+
 	btrfs_free_path(path);
 
 	if (err == 0) {
@@ -4295,10 +4458,8 @@ out:
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 {
 	struct btrfs_ordered_sum *sums;
-	struct btrfs_sector_sum *sector_sum;
 	struct btrfs_ordered_extent *ordered;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	size_t offset;
 	int ret;
 	u64 disk_bytenr;
 	LIST_HEAD(list);
@@ -4312,19 +4473,13 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 	if (ret)
 		goto out;
 
+	disk_bytenr = ordered->start;
 	while (!list_empty(&list)) {
 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
 		list_del_init(&sums->list);
 
-		sector_sum = sums->sums;
-		sums->bytenr = ordered->start;
-
-		offset = 0;
-		while (offset < sums->len) {
-			sector_sum->bytenr += ordered->start - disk_bytenr;
-			sector_sum++;
-			offset += root->sectorsize;
-		}
+		sums->bytenr = disk_bytenr;
+		disk_bytenr += sums->len;
 
 		btrfs_add_ordered_sum(inode, ordered, sums);
 	}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 668af537a3e..ffb1036ef10 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -29,9 +29,8 @@
  * generation numbers as then we know the root was once mounted with an older
  * kernel that was not aware of the root item structure change.
  */
-void btrfs_read_root_item(struct btrfs_root *root,
-			 struct extent_buffer *eb, int slot,
-			 struct btrfs_root_item *item)
+void btrfs_read_root_item(struct extent_buffer *eb, int slot,
+			  struct btrfs_root_item *item)
 {
 	uuid_le uuid;
 	int len;
@@ -65,52 +64,59 @@ void btrfs_read_root_item(struct btrfs_root *root,
 }
 
 /*
- * lookup the root with the highest offset for a given objectid.  The key we do
- * find is copied into 'key'.  If we find something return 0, otherwise 1, < 0
- * on error.
+ * btrfs_find_root - lookup the root by the key.
+ * root: the root of the root tree
+ * search_key: the key to search
+ * path: the path we search
+ * root_item: the root item of the tree we look for
+ * root_key: the reak key of the tree we look for
+ *
+ * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset
+ * of the search key, just lookup the root with the highest offset for a
+ * given objectid.
+ *
+ * If we find something return 0, otherwise > 0, < 0 on error.
  */
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
-			struct btrfs_root_item *item, struct btrfs_key *key)
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+		    struct btrfs_path *path, struct btrfs_root_item *root_item,
+		    struct btrfs_key *root_key)
 {
-	struct btrfs_path *path;
-	struct btrfs_key search_key;
 	struct btrfs_key found_key;
 	struct extent_buffer *l;
 	int ret;
 	int slot;
 
-	search_key.objectid = objectid;
-	search_key.type = BTRFS_ROOT_ITEM_KEY;
-	search_key.offset = (u64)-1;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
 	if (ret < 0)
-		goto out;
+		return ret;
 
-	BUG_ON(ret == 0);
-	if (path->slots[0] == 0) {
-		ret = 1;
-		goto out;
+	if (search_key->offset != -1ULL) {	/* the search key is exact */
+		if (ret > 0)
+			goto out;
+	} else {
+		BUG_ON(ret == 0);		/* Logical error */
+		if (path->slots[0] == 0)
+			goto out;
+		path->slots[0]--;
+		ret = 0;
 	}
+
 	l = path->nodes[0];
-	slot = path->slots[0] - 1;
+	slot = path->slots[0];
+
 	btrfs_item_key_to_cpu(l, &found_key, slot);
-	if (found_key.objectid != objectid ||
+	if (found_key.objectid != search_key->objectid ||
 	    found_key.type != BTRFS_ROOT_ITEM_KEY) {
 		ret = 1;
 		goto out;
 	}
-	if (item)
-		btrfs_read_root_item(root, l, slot, item);
-	if (key)
-		memcpy(key, &found_key, sizeof(found_key));
 
-	ret = 0;
+	if (root_item)
+		btrfs_read_root_item(l, slot, root_item);
+	if (root_key)
+		memcpy(root_key, &found_key, sizeof(found_key));
 out:
-	btrfs_free_path(path);
+	btrfs_release_path(path);
 	return ret;
 }
 
@@ -213,86 +219,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return btrfs_insert_item(trans, root, key, item, sizeof(*item));
 }
 
-/*
- * at mount time we want to find all the old transaction snapshots that were in
- * the process of being deleted if we crashed.  This is any root item with an
- * offset lower than the latest root.  They need to be queued for deletion to
- * finish what was happening when we crashed.
- */
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
-{
-	struct btrfs_root *dead_root;
-	struct btrfs_root_item *ri;
-	struct btrfs_key key;
-	struct btrfs_key found_key;
-	struct btrfs_path *path;
-	int ret;
-	u32 nritems;
-	struct extent_buffer *leaf;
-	int slot;
-
-	key.objectid = objectid;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-	key.offset = 0;
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-again:
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	if (ret < 0)
-		goto err;
-	while (1) {
-		leaf = path->nodes[0];
-		nritems = btrfs_header_nritems(leaf);
-		slot = path->slots[0];
-		if (slot >= nritems) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret)
-				break;
-			leaf = path->nodes[0];
-			nritems = btrfs_header_nritems(leaf);
-			slot = path->slots[0];
-		}
-		btrfs_item_key_to_cpu(leaf, &key, slot);
-		if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
-			goto next;
-
-		if (key.objectid < objectid)
-			goto next;
-
-		if (key.objectid > objectid)
-			break;
-
-		ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
-		if (btrfs_disk_root_refs(leaf, ri) != 0)
-			goto next;
-
-		memcpy(&found_key, &key, sizeof(key));
-		key.offset++;
-		btrfs_release_path(path);
-		dead_root =
-			btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
-						    &found_key);
-		if (IS_ERR(dead_root)) {
-			ret = PTR_ERR(dead_root);
-			goto err;
-		}
-
-		ret = btrfs_add_dead_root(dead_root);
-		if (ret)
-			goto err;
-		goto again;
-next:
-		slot++;
-		path->slots[0]++;
-	}
-	ret = 0;
-err:
-	btrfs_free_path(path);
-	return ret;
-}
-
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
 {
 	struct extent_buffer *leaf;
@@ -302,6 +228,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
 	struct btrfs_root *root;
 	int err = 0;
 	int ret;
+	bool can_recover = true;
+
+	if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
+		can_recover = false;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -341,20 +271,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
 		root_key.objectid = key.offset;
 		key.offset++;
 
-		root = btrfs_read_fs_root_no_name(tree_root->fs_info,
-						  &root_key);
-		if (!IS_ERR(root))
+		root = btrfs_read_fs_root(tree_root, &root_key);
+		err = PTR_RET(root);
+		if (err && err != -ENOENT) {
+			break;
+		} else if (err == -ENOENT) {
+			struct btrfs_trans_handle *trans;
+
+			btrfs_release_path(path);
+
+			trans = btrfs_join_transaction(tree_root);
+			if (IS_ERR(trans)) {
+				err = PTR_ERR(trans);
+				btrfs_error(tree_root->fs_info, err,
+					    "Failed to start trans to delete "
+					    "orphan item");
+				break;
+			}
+			err = btrfs_del_orphan_item(trans, tree_root,
+						    root_key.objectid);
+			btrfs_end_transaction(trans, tree_root);
+			if (err) {
+				btrfs_error(tree_root->fs_info, err,
+					    "Failed to delete root orphan "
+					    "item");
+				break;
+			}
 			continue;
+		}
 
-		ret = PTR_ERR(root);
-		if (ret != -ENOENT) {
-			err = ret;
+		if (btrfs_root_refs(&root->root_item) == 0) {
+			btrfs_add_dead_root(root);
+			continue;
+		}
+
+		err = btrfs_init_fs_root(root);
+		if (err) {
+			btrfs_free_fs_root(root);
 			break;
 		}
 
-		ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
-		if (ret) {
-			err = ret;
+		root->orphan_item_inserted = 1;
+
+		err = btrfs_insert_fs_root(root->fs_info, root);
+		if (err) {
+			BUG_ON(err == -EEXIST);
+			btrfs_free_fs_root(root);
 			break;
 		}
 	}
@@ -369,8 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
 	struct btrfs_path *path;
 	int ret;
-	struct btrfs_root_item *ri;
-	struct extent_buffer *leaf;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -380,8 +340,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		goto out;
 
 	BUG_ON(ret != 0);
-	leaf = path->nodes[0];
-	ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
 
 	ret = btrfs_del_item(trans, root, path);
 out:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 67783e03d12..4ba2a69a60a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -28,6 +28,7 @@
 #include "dev-replace.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
+#include "raid56.h"
 
 /*
  * This is only the first step towards a full-features scrub. It reads all
@@ -541,7 +542,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 	eb = path->nodes[0];
 	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
-	btrfs_release_path(path);
 
 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		do {
@@ -557,7 +557,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 				ret < 0 ? -1 : ref_level,
 				ret < 0 ? -1 : ref_root);
 		} while (ret != 1);
+		btrfs_release_path(path);
 	} else {
+		btrfs_release_path(path);
 		swarn.path = path;
 		swarn.dev = dev;
 		iterate_extent_inodes(fs_info, found_key.objectid,
@@ -1294,7 +1296,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 		}
 
 		WARN_ON(!page->page);
-		bio = bio_alloc(GFP_NOFS, 1);
+		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 		if (!bio) {
 			page->io_error = 1;
 			sblock->no_io_error_seen = 0;
@@ -1334,7 +1336,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 	int page_num;
 	u8 calculated_csum[BTRFS_CSUM_SIZE];
 	u32 crc = ~(u32)0;
-	struct btrfs_root *root = fs_info->extent_root;
 	void *mapped_buffer;
 
 	WARN_ON(!sblock->pagev[0]->page);
@@ -1363,12 +1364,11 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 
 	for (page_num = 0;;) {
 		if (page_num == 0 && is_metadata)
-			crc = btrfs_csum_data(root,
+			crc = btrfs_csum_data(
 				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
 				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
 		else
-			crc = btrfs_csum_data(root, mapped_buffer, crc,
-					      PAGE_SIZE);
+			crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
 
 		kunmap_atomic(mapped_buffer);
 		page_num++;
@@ -1431,7 +1431,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 			return -EIO;
 		}
 
-		bio = bio_alloc(GFP_NOFS, 1);
+		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 		if (!bio)
 			return -EIO;
 		bio->bi_bdev = page_bad->dev->bdev;
@@ -1522,7 +1522,7 @@ again:
 		sbio->dev = wr_ctx->tgtdev;
 		bio = sbio->bio;
 		if (!bio) {
-			bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
+			bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
 			if (!bio) {
 				mutex_unlock(&wr_ctx->wr_lock);
 				return -ENOMEM;
@@ -1655,7 +1655,6 @@ static int scrub_checksum_data(struct scrub_block *sblock)
 	void *buffer;
 	u32 crc = ~(u32)0;
 	int fail = 0;
-	struct btrfs_root *root = sctx->dev_root;
 	u64 len;
 	int index;
 
@@ -1672,7 +1671,7 @@ static int scrub_checksum_data(struct scrub_block *sblock)
 	for (;;) {
 		u64 l = min_t(u64, len, PAGE_SIZE);
 
-		crc = btrfs_csum_data(root, buffer, crc, l);
+		crc = btrfs_csum_data(buffer, crc, l);
 		kunmap_atomic(buffer);
 		len -= l;
 		if (len == 0)
@@ -1742,7 +1741,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 	for (;;) {
 		u64 l = min_t(u64, len, mapped_size);
 
-		crc = btrfs_csum_data(root, p, crc, l);
+		crc = btrfs_csum_data(p, crc, l);
 		kunmap_atomic(mapped_buffer);
 		len -= l;
 		if (len == 0)
@@ -1803,7 +1802,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 	for (;;) {
 		u64 l = min_t(u64, len, mapped_size);
 
-		crc = btrfs_csum_data(root, p, crc, l);
+		crc = btrfs_csum_data(p, crc, l);
 		kunmap_atomic(mapped_buffer);
 		len -= l;
 		if (len == 0)
@@ -1931,7 +1930,7 @@ again:
 		sbio->dev = spage->dev;
 		bio = sbio->bio;
 		if (!bio) {
-			bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
+			bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
 			if (!bio)
 				return -ENOMEM;
 			sbio->bio = bio;
@@ -2127,8 +2126,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
 			   u8 *csum)
 {
 	struct btrfs_ordered_sum *sum = NULL;
-	int ret = 0;
-	unsigned long i;
+	unsigned long index;
 	unsigned long num_sectors;
 
 	while (!list_empty(&sctx->csum_list)) {
@@ -2147,19 +2145,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
 	if (!sum)
 		return 0;
 
+	index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
 	num_sectors = sum->len / sctx->sectorsize;
-	for (i = 0; i < num_sectors; ++i) {
-		if (sum->sums[i].bytenr == logical) {
-			memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
-			ret = 1;
-			break;
-		}
-	}
-	if (ret && i == num_sectors - 1) {
+	memcpy(csum, sum->sums + index, sctx->csum_size);
+	if (index == num_sectors - 1) {
 		list_del(&sum->list);
 		kfree(sum);
 	}
-	return ret;
+	return 1;
 }
 
 /* scrub extent tries to collect up to 64 kB for each bio */
@@ -2234,12 +2227,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	u64 flags;
 	int ret;
 	int slot;
-	int i;
 	u64 nstripes;
 	struct extent_buffer *l;
 	struct btrfs_key key;
 	u64 physical;
 	u64 logical;
+	u64 logic_end;
 	u64 generation;
 	int mirror_num;
 	struct reada_control *reada1;
@@ -2253,6 +2246,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	u64 extent_len;
 	struct btrfs_device *extent_dev;
 	int extent_mirror_num;
+	int stop_loop;
+
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6)) {
+		if (num >= nr_data_stripes(map)) {
+			return 0;
+		}
+	}
 
 	nstripes = length;
 	offset = 0;
@@ -2306,8 +2307,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	key_start.type = BTRFS_EXTENT_ITEM_KEY;
 	key_start.offset = (u64)0;
 	key_end.objectid = base + offset + nstripes * increment;
-	key_end.type = BTRFS_EXTENT_ITEM_KEY;
-	key_end.offset = (u64)0;
+	key_end.type = BTRFS_METADATA_ITEM_KEY;
+	key_end.offset = (u64)-1;
 	reada1 = btrfs_reada_add(root, &key_start, &key_end);
 
 	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -2345,8 +2346,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	 */
 	logical = base + offset;
 	physical = map->stripes[num].physical;
+	logic_end = logical + increment * nstripes;
 	ret = 0;
-	for (i = 0; i < nstripes; ++i) {
+	while (logical < logic_end) {
 		/*
 		 * canceled?
 		 */
@@ -2382,19 +2384,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			wake_up(&fs_info->scrub_pause_wait);
 		}
 
-		ret = btrfs_lookup_csums_range(csum_root, logical,
-					       logical + map->stripe_len - 1,
-					       &sctx->csum_list, 1);
-		if (ret)
-			goto out;
-
 		key.objectid = logical;
 		key.type = BTRFS_EXTENT_ITEM_KEY;
-		key.offset = (u64)0;
+		key.offset = (u64)-1;
 
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
+
 		if (ret > 0) {
 			ret = btrfs_previous_item(root, path, 0,
 						  BTRFS_EXTENT_ITEM_KEY);
@@ -2411,7 +2408,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 			}
 		}
 
+		stop_loop = 0;
 		while (1) {
+			u64 bytes;
+
 			l = path->nodes[0];
 			slot = path->slots[0];
 			if (slot >= btrfs_header_nritems(l)) {
@@ -2421,19 +2421,30 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 				if (ret < 0)
 					goto out;
 
+				stop_loop = 1;
 				break;
 			}
 			btrfs_item_key_to_cpu(l, &key, slot);
 
-			if (key.objectid + key.offset <= logical)
-				goto next;
+			if (key.type == BTRFS_METADATA_ITEM_KEY)
+				bytes = root->leafsize;
+			else
+				bytes = key.offset;
 
-			if (key.objectid >= logical + map->stripe_len)
-				break;
+			if (key.objectid + bytes <= logical)
+				goto next;
 
-			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
+			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+			    key.type != BTRFS_METADATA_ITEM_KEY)
 				goto next;
 
+			if (key.objectid >= logical + map->stripe_len) {
+				/* out of this device extent */
+				if (key.objectid >= logic_end)
+					stop_loop = 1;
+				break;
+			}
+
 			extent = btrfs_item_ptr(l, slot,
 						struct btrfs_extent_item);
 			flags = btrfs_extent_flags(l, extent);
@@ -2449,22 +2460,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 				goto next;
 			}
 
+again:
+			extent_logical = key.objectid;
+			extent_len = bytes;
+
 			/*
 			 * trim extent to this stripe
 			 */
-			if (key.objectid < logical) {
-				key.offset -= logical - key.objectid;
-				key.objectid = logical;
+			if (extent_logical < logical) {
+				extent_len -= logical - extent_logical;
+				extent_logical = logical;
 			}
-			if (key.objectid + key.offset >
+			if (extent_logical + extent_len >
 			    logical + map->stripe_len) {
-				key.offset = logical + map->stripe_len -
-					     key.objectid;
+				extent_len = logical + map->stripe_len -
+					     extent_logical;
 			}
 
-			extent_logical = key.objectid;
-			extent_physical = key.objectid - logical + physical;
-			extent_len = key.offset;
+			extent_physical = extent_logical - logical + physical;
 			extent_dev = scrub_dev;
 			extent_mirror_num = mirror_num;
 			if (is_dev_replace)
@@ -2472,13 +2485,36 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 						   extent_len, &extent_physical,
 						   &extent_dev,
 						   &extent_mirror_num);
+
+			ret = btrfs_lookup_csums_range(csum_root, logical,
+						logical + map->stripe_len - 1,
+						&sctx->csum_list, 1);
+			if (ret)
+				goto out;
+
 			ret = scrub_extent(sctx, extent_logical, extent_len,
 					   extent_physical, extent_dev, flags,
 					   generation, extent_mirror_num,
-					   key.objectid - logical + physical);
+					   extent_physical);
 			if (ret)
 				goto out;
 
+			scrub_free_csums(sctx);
+			if (extent_logical + extent_len <
+			    key.objectid + bytes) {
+				logical += increment;
+				physical += map->stripe_len;
+
+				if (logical < key.objectid + bytes) {
+					cond_resched();
+					goto again;
+				}
+
+				if (logical >= logic_end) {
+					stop_loop = 1;
+					break;
+				}
+			}
 next:
 			path->slots[0]++;
 		}
@@ -2486,8 +2522,14 @@ next:
 		logical += increment;
 		physical += map->stripe_len;
 		spin_lock(&sctx->stat_lock);
-		sctx->stat.last_physical = physical;
+		if (stop_loop)
+			sctx->stat.last_physical = map->stripes[num].physical +
+						   length;
+		else
+			sctx->stat.last_physical = physical;
 		spin_unlock(&sctx->stat_lock);
+		if (stop_loop)
+			break;
 	}
 out:
 	/* push queued extents */
@@ -2708,7 +2750,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
 	int	ret;
 	struct btrfs_root *root = sctx->dev_root;
 
-	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
 		return -EIO;
 
 	gen = root->fs_info->last_trans_committed;
@@ -2996,28 +3038,6 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
-int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_device *dev;
-	int ret;
-
-	/*
-	 * we have to hold the device_list_mutex here so the device
-	 * does not go away in cancel_dev. FIXME: find a better solution
-	 */
-	mutex_lock(&fs_info->fs_devices->device_list_mutex);
-	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
-	if (!dev) {
-		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		return -ENODEV;
-	}
-	ret = btrfs_scrub_cancel_dev(fs_info, dev);
-	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-
-	return ret;
-}
-
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 			 struct btrfs_scrub_progress *progress)
 {
@@ -3179,16 +3199,18 @@ out:
 
 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
 {
-	unsigned long index;
 	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
-	int ret = 0;
+	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
 	struct btrfs_key key;
-	struct inode *inode = NULL;
+	struct inode *inode;
+	struct page *page;
 	struct btrfs_root *local_root;
 	u64 physical_for_dev_replace;
 	u64 len;
-	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+	unsigned long index;
 	int srcu_index;
+	int ret;
+	int err;
 
 	key.objectid = root;
 	key.type = BTRFS_ROOT_ITEM_KEY;
@@ -3202,6 +3224,11 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
 		return PTR_ERR(local_root);
 	}
 
+	if (btrfs_root_refs(&local_root->root_item) == 0) {
+		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+		return -ENOENT;
+	}
+
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.objectid = inum;
 	key.offset = 0;
@@ -3210,19 +3237,21 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
+	/* Avoid truncate/dio/punch hole.. */
+	mutex_lock(&inode->i_mutex);
+	inode_dio_wait(inode);
+
+	ret = 0;
 	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
 	len = nocow_ctx->len;
 	while (len >= PAGE_CACHE_SIZE) {
-		struct page *page = NULL;
-		int ret_sub;
-
 		index = offset >> PAGE_CACHE_SHIFT;
-
+again:
 		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 		if (!page) {
 			pr_err("find_or_create_page() failed\n");
 			ret = -ENOMEM;
-			goto next_page;
+			goto out;
 		}
 
 		if (PageUptodate(page)) {
@@ -3230,39 +3259,49 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
 				goto next_page;
 		} else {
 			ClearPageError(page);
-			ret_sub = extent_read_full_page(&BTRFS_I(inode)->
+			err = extent_read_full_page(&BTRFS_I(inode)->
 							 io_tree,
 							page, btrfs_get_extent,
 							nocow_ctx->mirror_num);
-			if (ret_sub) {
-				ret = ret_sub;
+			if (err) {
+				ret = err;
 				goto next_page;
 			}
-			wait_on_page_locked(page);
+
+			lock_page(page);
+			/*
+			 * If the page has been remove from the page cache,
+			 * the data on it is meaningless, because it may be
+			 * old one, the new data may be written into the new
+			 * page in the page cache.
+			 */
+			if (page->mapping != inode->i_mapping) {
+				page_cache_release(page);
+				goto again;
+			}
 			if (!PageUptodate(page)) {
 				ret = -EIO;
 				goto next_page;
 			}
 		}
-		ret_sub = write_page_nocow(nocow_ctx->sctx,
-					   physical_for_dev_replace, page);
-		if (ret_sub) {
-			ret = ret_sub;
-			goto next_page;
-		}
-
+		err = write_page_nocow(nocow_ctx->sctx,
+				       physical_for_dev_replace, page);
+		if (err)
+			ret = err;
 next_page:
-		if (page) {
-			unlock_page(page);
-			put_page(page);
-		}
+		unlock_page(page);
+		page_cache_release(page);
+
+		if (ret)
+			break;
+
 		offset += PAGE_CACHE_SIZE;
 		physical_for_dev_replace += PAGE_CACHE_SIZE;
 		len -= PAGE_CACHE_SIZE;
 	}
-
-	if (inode)
-		iput(inode);
+out:
+	mutex_unlock(&inode->i_mutex);
+	iput(inode);
 	return ret;
 }
 
@@ -3282,7 +3321,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
 			"btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
 		return -EIO;
 	}
-	bio = bio_alloc(GFP_NOFS, 1);
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 	if (!bio) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.malloc_errors++;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 321b7fb4e44..d3f3b43cae0 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -85,6 +85,7 @@ struct send_ctx {
 	u32 send_max_size;
 	u64 total_send_size;
 	u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
+	u64 flags;	/* 'flags' member of btrfs_ioctl_send_args is u64 */
 
 	struct vfsmount *mnt;
 
@@ -157,7 +158,7 @@ static void fs_path_reset(struct fs_path *p)
 	}
 }
 
-static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
+static struct fs_path *fs_path_alloc(void)
 {
 	struct fs_path *p;
 
@@ -172,11 +173,11 @@ static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
 	return p;
 }
 
-static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
+static struct fs_path *fs_path_alloc_reversed(void)
 {
 	struct fs_path *p;
 
-	p = fs_path_alloc(sctx);
+	p = fs_path_alloc();
 	if (!p)
 		return NULL;
 	p->reversed = 1;
@@ -184,7 +185,7 @@ static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
 	return p;
 }
 
-static void fs_path_free(struct send_ctx *sctx, struct fs_path *p)
+static void fs_path_free(struct fs_path *p)
 {
 	if (!p)
 		return;
@@ -386,7 +387,7 @@ static struct btrfs_path *alloc_path_for_send(void)
 	return path;
 }
 
-int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
+static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
 {
 	int ret;
 	mm_segment_t old_fs;
@@ -752,8 +753,7 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
  *
  * path must point to the INODE_REF or INODE_EXTREF when called.
  */
-static int iterate_inode_ref(struct send_ctx *sctx,
-			     struct btrfs_root *root, struct btrfs_path *path,
+static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
 			     struct btrfs_key *found_key, int resolve,
 			     iterate_inode_ref_t iterate, void *ctx)
 {
@@ -776,13 +776,13 @@ static int iterate_inode_ref(struct send_ctx *sctx,
 	unsigned long elem_size;
 	unsigned long ptr;
 
-	p = fs_path_alloc_reversed(sctx);
+	p = fs_path_alloc_reversed();
 	if (!p)
 		return -ENOMEM;
 
 	tmp_path = alloc_path_for_send();
 	if (!tmp_path) {
-		fs_path_free(sctx, p);
+		fs_path_free(p);
 		return -ENOMEM;
 	}
 
@@ -857,7 +857,7 @@ static int iterate_inode_ref(struct send_ctx *sctx,
 
 out:
 	btrfs_free_path(tmp_path);
-	fs_path_free(sctx, p);
+	fs_path_free(p);
 	return ret;
 }
 
@@ -873,8 +873,7 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
  *
  * path must point to the dir item when called.
  */
-static int iterate_dir_item(struct send_ctx *sctx,
-			    struct btrfs_root *root, struct btrfs_path *path,
+static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 			    struct btrfs_key *found_key,
 			    iterate_dir_item_t iterate, void *ctx)
 {
@@ -989,7 +988,7 @@ static int __copy_first_ref(int num, u64 dir, int index,
  * Retrieve the first path of an inode. If an inode has more then one
  * ref/hardlink, this is ignored.
  */
-static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
+static int get_inode_path(struct btrfs_root *root,
 			  u64 ino, struct fs_path *path)
 {
 	int ret;
@@ -1021,8 +1020,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
 		goto out;
 	}
 
-	ret = iterate_inode_ref(sctx, root, p, &found_key, 1,
-			__copy_first_ref, path);
+	ret = iterate_inode_ref(root, p, &found_key, 1,
+				__copy_first_ref, path);
 	if (ret < 0)
 		goto out;
 	ret = 0;
@@ -1313,8 +1312,7 @@ out:
 	return ret;
 }
 
-static int read_symlink(struct send_ctx *sctx,
-			struct btrfs_root *root,
+static int read_symlink(struct btrfs_root *root,
 			u64 ino,
 			struct fs_path *dest)
 {
@@ -1561,8 +1559,7 @@ out:
  * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
  * generation of the parent dir and the name of the dir entry.
  */
-static int get_first_ref(struct send_ctx *sctx,
-			 struct btrfs_root *root, u64 ino,
+static int get_first_ref(struct btrfs_root *root, u64 ino,
 			 u64 *dir, u64 *dir_gen, struct fs_path *name)
 {
 	int ret;
@@ -1627,8 +1624,7 @@ out:
 	return ret;
 }
 
-static int is_first_ref(struct send_ctx *sctx,
-			struct btrfs_root *root,
+static int is_first_ref(struct btrfs_root *root,
 			u64 ino, u64 dir,
 			const char *name, int name_len)
 {
@@ -1637,11 +1633,11 @@ static int is_first_ref(struct send_ctx *sctx,
 	u64 tmp_dir;
 	u64 tmp_dir_gen;
 
-	tmp_name = fs_path_alloc(sctx);
+	tmp_name = fs_path_alloc();
 	if (!tmp_name)
 		return -ENOMEM;
 
-	ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
+	ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
 	if (ret < 0)
 		goto out;
 
@@ -1653,7 +1649,7 @@ static int is_first_ref(struct send_ctx *sctx,
 	ret = !memcmp(tmp_name->start, name, name_len);
 
 out:
-	fs_path_free(sctx, tmp_name);
+	fs_path_free(tmp_name);
 	return ret;
 }
 
@@ -1782,11 +1778,11 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
 	if (!sctx->parent_root)
 		goto out;
 
-	name = fs_path_alloc(sctx);
+	name = fs_path_alloc();
 	if (!name)
 		return -ENOMEM;
 
-	ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name);
+	ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
 	if (ret < 0)
 		goto out;
 
@@ -1794,7 +1790,7 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
 			name->start, fs_path_len(name));
 
 out:
-	fs_path_free(sctx, name);
+	fs_path_free(name);
 	return ret;
 }
 
@@ -1978,11 +1974,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	 * send_root or parent_root for ref lookup.
 	 */
 	if (ino < sctx->send_progress)
-		ret = get_first_ref(sctx, sctx->send_root, ino,
-				parent_ino, parent_gen, dest);
+		ret = get_first_ref(sctx->send_root, ino,
+				    parent_ino, parent_gen, dest);
 	else
-		ret = get_first_ref(sctx, sctx->parent_root, ino,
-				parent_ino, parent_gen, dest);
+		ret = get_first_ref(sctx->parent_root, ino,
+				    parent_ino, parent_gen, dest);
 	if (ret < 0)
 		goto out;
 
@@ -2069,7 +2065,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
 	u64 parent_gen = 0;
 	int stop = 0;
 
-	name = fs_path_alloc(sctx);
+	name = fs_path_alloc();
 	if (!name) {
 		ret = -ENOMEM;
 		goto out;
@@ -2097,7 +2093,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
 	}
 
 out:
-	fs_path_free(sctx, name);
+	fs_path_free(name);
 	if (!ret)
 		fs_path_unreverse(dest);
 	return ret;
@@ -2262,7 +2258,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
 
 verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
 
-	p = fs_path_alloc(sctx);
+	p = fs_path_alloc();
 	if (!p)
 		return -ENOMEM;
 
@@ -2280,7 +2276,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
 
 tlv_put_failure:
 out:
-	fs_path_free(sctx, p);
+	fs_path_free(p);
 	return ret;
 }
 
@@ -2291,7 +2287,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
 
 verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
 
-	p = fs_path_alloc(sctx);
+	p = fs_path_alloc();
 	if (!p)
 		return -ENOMEM;
 
@@ -2309,7 +2305,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
 
 tlv_put_failure:
 out:
-	fs_path_free(sctx, p);
+	fs_path_free(p);
 	return ret;
 }
 
@@ -2320,7 +2316,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
 
 verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
 
-	p = fs_path_alloc(sctx);
+	p = fs_path_alloc();
 	if (!p)
 		return -ENOMEM;
 
@@ -2339,7 +2335,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
 
 tlv_put_failure:
 out:
-	fs_path_free(sctx, p);
+	fs_path_free(p);
 	return ret;
 }
 
@@ -2355,7 +2351,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
 
 verbose_printk("btrfs: send_utimes %llu\n", ino);
 
-	p = fs_path_alloc(sctx);
+	p = fs_path_alloc();
 	if (!p)
 		return -ENOMEM;
 
@@ -2396,7 +2392,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
 
 tlv_put_failure:
 out:
-	fs_path_free(sctx, p);
+	fs_path_free(p);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -2417,7 +2413,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
 
 verbose_printk("btrfs: send_create_inode %llu\n", ino);
 
-	p = fs_path_alloc(sctx);
+	p = fs_path_alloc();
 	if (!p)
 		return -ENOMEM;
 
@@ -2458,7 +2454,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
 
 	if (S_ISLNK(mode)) {
 		fs_path_reset(p);
-		ret = read_symlink(sctx, sctx->send_root, ino, p);
+		ret = read_symlink(sctx->send_root, ino, p);
 		if (ret < 0)
 			goto out;
 		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
@@ -2475,7 +2471,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
 
 tlv_put_failure:
 out:
-	fs_path_free(sctx, p);
+	fs_path_free(p);
 	return ret;
 }
 
@@ -2614,13 +2610,13 @@ static int record_ref(struct list_head *head, u64 dir,
 	return 0;
 }
 
-static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
+static void __free_recorded_refs(struct list_head *head)
 {
 	struct recorded_ref *cur;
 
 	while (!list_empty(head)) {
 		cur = list_entry(head->next, struct recorded_ref, list);
-		fs_path_free(sctx, cur->full_path);
+		fs_path_free(cur->full_path);
 		list_del(&cur->list);
 		kfree(cur);
 	}
@@ -2628,8 +2624,8 @@ static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
 
 static void free_recorded_refs(struct send_ctx *sctx)
 {
-	__free_recorded_refs(sctx, &sctx->new_refs);
-	__free_recorded_refs(sctx, &sctx->deleted_refs);
+	__free_recorded_refs(&sctx->new_refs);
+	__free_recorded_refs(&sctx->deleted_refs);
 }
 
 /*
@@ -2643,7 +2639,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
 	int ret;
 	struct fs_path *orphan;
 
-	orphan = fs_path_alloc(sctx);
+	orphan = fs_path_alloc();
 	if (!orphan)
 		return -ENOMEM;
 
@@ -2654,7 +2650,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
 	ret = send_rename(sctx, path, orphan);
 
 out:
-	fs_path_free(sctx, orphan);
+	fs_path_free(orphan);
 	return ret;
 }
 
@@ -2745,7 +2741,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 	 */
 	BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
 
-	valid_path = fs_path_alloc(sctx);
+	valid_path = fs_path_alloc();
 	if (!valid_path) {
 		ret = -ENOMEM;
 		goto out;
@@ -2842,9 +2838,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		if (ret < 0)
 			goto out;
 		if (ret) {
-			ret = is_first_ref(sctx, sctx->parent_root,
-					ow_inode, cur->dir, cur->name,
-					cur->name_len);
+			ret = is_first_ref(sctx->parent_root,
+					   ow_inode, cur->dir, cur->name,
+					   cur->name_len);
 			if (ret < 0)
 				goto out;
 			if (ret) {
@@ -3023,7 +3019,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 out:
 	free_recorded_refs(sctx);
 	ulist_free(check_dirs);
-	fs_path_free(sctx, valid_path);
+	fs_path_free(valid_path);
 	return ret;
 }
 
@@ -3036,7 +3032,7 @@ static int __record_new_ref(int num, u64 dir, int index,
 	struct fs_path *p;
 	u64 gen;
 
-	p = fs_path_alloc(sctx);
+	p = fs_path_alloc();
 	if (!p)
 		return -ENOMEM;
 
@@ -3056,7 +3052,7 @@ static int __record_new_ref(int num, u64 dir, int index,
 
 out:
 	if (ret)
-		fs_path_free(sctx, p);
+		fs_path_free(p);
 	return ret;
 }
 
@@ -3069,7 +3065,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
 	struct fs_path *p;
 	u64 gen;
 
-	p = fs_path_alloc(sctx);
+	p = fs_path_alloc();
 	if (!p)
 		return -ENOMEM;
 
@@ -3089,7 +3085,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
 
 out:
 	if (ret)
-		fs_path_free(sctx, p);
+		fs_path_free(p);
 	return ret;
 }
 
@@ -3097,8 +3093,8 @@ static int record_new_ref(struct send_ctx *sctx)
 {
 	int ret;
 
-	ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
-			sctx->cmp_key, 0, __record_new_ref, sctx);
+	ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
+				sctx->cmp_key, 0, __record_new_ref, sctx);
 	if (ret < 0)
 		goto out;
 	ret = 0;
@@ -3111,8 +3107,8 @@ static int record_deleted_ref(struct send_ctx *sctx)
 {
 	int ret;
 
-	ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
-			sctx->cmp_key, 0, __record_deleted_ref, sctx);
+	ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
+				sctx->cmp_key, 0, __record_deleted_ref, sctx);
 	if (ret < 0)
 		goto out;
 	ret = 0;
@@ -3141,8 +3137,7 @@ static int __find_iref(int num, u64 dir, int index,
 	return 0;
 }
 
-static int find_iref(struct send_ctx *sctx,
-		     struct btrfs_root *root,
+static int find_iref(struct btrfs_root *root,
 		     struct btrfs_path *path,
 		     struct btrfs_key *key,
 		     u64 dir, struct fs_path *name)
@@ -3154,7 +3149,7 @@ static int find_iref(struct send_ctx *sctx,
 	ctx.name = name;
 	ctx.found_idx = -1;
 
-	ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx);
+	ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
 	if (ret < 0)
 		return ret;
 
@@ -3171,7 +3166,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index,
 	int ret;
 	struct send_ctx *sctx = ctx;
 
-	ret = find_iref(sctx, sctx->parent_root, sctx->right_path,
+	ret = find_iref(sctx->parent_root, sctx->right_path,
 			sctx->cmp_key, dir, name);
 	if (ret == -ENOENT)
 		ret = __record_new_ref(num, dir, index, name, sctx);
@@ -3188,7 +3183,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index,
 	int ret;
 	struct send_ctx *sctx = ctx;
 
-	ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
+	ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
 			dir, name);
 	if (ret == -ENOENT)
 		ret = __record_deleted_ref(num, dir, index, name, sctx);
@@ -3202,11 +3197,11 @@ static int record_changed_ref(struct send_ctx *sctx)
 {
 	int ret = 0;
 
-	ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path,
+	ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
 			sctx->cmp_key, 0, __record_changed_new_ref, sctx);
 	if (ret < 0)
 		goto out;
-	ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path,
+	ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
 			sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
 	if (ret < 0)
 		goto out;
@@ -3265,8 +3260,7 @@ static int process_all_refs(struct send_ctx *sctx,
 		     found_key.type != BTRFS_INODE_EXTREF_KEY))
 			break;
 
-		ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
-				sctx);
+		ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
 		btrfs_release_path(path);
 		if (ret < 0)
 			goto out;
@@ -3334,7 +3328,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
 	struct fs_path *p;
 	posix_acl_xattr_header dummy_acl;
 
-	p = fs_path_alloc(sctx);
+	p = fs_path_alloc();
 	if (!p)
 		return -ENOMEM;
 
@@ -3361,7 +3355,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
 	ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
 
 out:
-	fs_path_free(sctx, p);
+	fs_path_free(p);
 	return ret;
 }
 
@@ -3374,7 +3368,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
 	struct send_ctx *sctx = ctx;
 	struct fs_path *p;
 
-	p = fs_path_alloc(sctx);
+	p = fs_path_alloc();
 	if (!p)
 		return -ENOMEM;
 
@@ -3385,7 +3379,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
 	ret = send_remove_xattr(sctx, p, name, name_len);
 
 out:
-	fs_path_free(sctx, p);
+	fs_path_free(p);
 	return ret;
 }
 
@@ -3393,8 +3387,8 @@ static int process_new_xattr(struct send_ctx *sctx)
 {
 	int ret = 0;
 
-	ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
-			sctx->cmp_key, __process_new_xattr, sctx);
+	ret = iterate_dir_item(sctx->send_root, sctx->left_path,
+			       sctx->cmp_key, __process_new_xattr, sctx);
 
 	return ret;
 }
@@ -3403,8 +3397,8 @@ static int process_deleted_xattr(struct send_ctx *sctx)
 {
 	int ret;
 
-	ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
-			sctx->cmp_key, __process_deleted_xattr, sctx);
+	ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
+			       sctx->cmp_key, __process_deleted_xattr, sctx);
 
 	return ret;
 }
@@ -3428,17 +3422,15 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
 	    strncmp(name, ctx->name, name_len) == 0) {
 		ctx->found_idx = num;
 		ctx->found_data_len = data_len;
-		ctx->found_data = kmalloc(data_len, GFP_NOFS);
+		ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
 		if (!ctx->found_data)
 			return -ENOMEM;
-		memcpy(ctx->found_data, data, data_len);
 		return 1;
 	}
 	return 0;
 }
 
-static int find_xattr(struct send_ctx *sctx,
-		      struct btrfs_root *root,
+static int find_xattr(struct btrfs_root *root,
 		      struct btrfs_path *path,
 		      struct btrfs_key *key,
 		      const char *name, int name_len,
@@ -3453,7 +3445,7 @@ static int find_xattr(struct send_ctx *sctx,
 	ctx.found_data = NULL;
 	ctx.found_data_len = 0;
 
-	ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx);
+	ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
 	if (ret < 0)
 		return ret;
 
@@ -3478,11 +3470,10 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
 	struct send_ctx *sctx = ctx;
 	char *found_data = NULL;
 	int found_data_len  = 0;
-	struct fs_path *p = NULL;
 
-	ret = find_xattr(sctx, sctx->parent_root, sctx->right_path,
-			sctx->cmp_key, name, name_len, &found_data,
-			&found_data_len);
+	ret = find_xattr(sctx->parent_root, sctx->right_path,
+			 sctx->cmp_key, name, name_len, &found_data,
+			 &found_data_len);
 	if (ret == -ENOENT) {
 		ret = __process_new_xattr(num, di_key, name, name_len, data,
 				data_len, type, ctx);
@@ -3497,7 +3488,6 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
 	}
 
 	kfree(found_data);
-	fs_path_free(sctx, p);
 	return ret;
 }
 
@@ -3509,8 +3499,8 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
 	int ret;
 	struct send_ctx *sctx = ctx;
 
-	ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key,
-			name, name_len, NULL, NULL);
+	ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
+			 name, name_len, NULL, NULL);
 	if (ret == -ENOENT)
 		ret = __process_deleted_xattr(num, di_key, name, name_len, data,
 				data_len, type, ctx);
@@ -3524,11 +3514,11 @@ static int process_changed_xattr(struct send_ctx *sctx)
 {
 	int ret = 0;
 
-	ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path,
+	ret = iterate_dir_item(sctx->send_root, sctx->left_path,
 			sctx->cmp_key, __process_changed_new_xattr, sctx);
 	if (ret < 0)
 		goto out;
-	ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path,
+	ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
 			sctx->cmp_key, __process_changed_deleted_xattr, sctx);
 
 out:
@@ -3573,8 +3563,8 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
 			goto out;
 		}
 
-		ret = iterate_dir_item(sctx, root, path, &found_key,
-				__process_new_xattr, sctx);
+		ret = iterate_dir_item(root, path, &found_key,
+				       __process_new_xattr, sctx);
 		if (ret < 0)
 			goto out;
 
@@ -3599,7 +3589,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
 	int num_read = 0;
 	mm_segment_t old_fs;
 
-	p = fs_path_alloc(sctx);
+	p = fs_path_alloc();
 	if (!p)
 		return -ENOMEM;
 
@@ -3641,7 +3631,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 
 tlv_put_failure:
 out:
-	fs_path_free(sctx, p);
+	fs_path_free(p);
 	set_fs(old_fs);
 	if (ret < 0)
 		return ret;
@@ -3664,7 +3654,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
 		clone_root->root->objectid, clone_root->ino,
 		clone_root->offset);
 
-	p = fs_path_alloc(sctx);
+	p = fs_path_alloc();
 	if (!p)
 		return -ENOMEM;
 
@@ -3687,8 +3677,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
 			goto out;
 		ret = get_cur_path(sctx, clone_root->ino, gen, p);
 	} else {
-		ret = get_inode_path(sctx, clone_root->root,
-				clone_root->ino, p);
+		ret = get_inode_path(clone_root->root, clone_root->ino, p);
 	}
 	if (ret < 0)
 		goto out;
@@ -3705,7 +3694,40 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
 
 tlv_put_failure:
 out:
-	fs_path_free(sctx, p);
+	fs_path_free(p);
+	return ret;
+}
+
+/*
+ * Send an update extent command to user space.
+ */
+static int send_update_extent(struct send_ctx *sctx,
+			      u64 offset, u32 len)
+{
+	int ret = 0;
+	struct fs_path *p;
+
+	p = fs_path_alloc();
+	if (!p)
+		return -ENOMEM;
+
+	ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
+	if (ret < 0)
+		goto out;
+
+	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	if (ret < 0)
+		goto out;
+
+	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
+
+	ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+	fs_path_free(p);
 	return ret;
 }
 
@@ -3744,7 +3766,11 @@ static int send_write_or_clone(struct send_ctx *sctx,
 		goto out;
 	}
 
-	if (!clone_root) {
+	if (clone_root) {
+		ret = send_clone(sctx, offset, len, clone_root);
+	} else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
+		ret = send_update_extent(sctx, offset, len);
+	} else {
 		while (pos < len) {
 			l = len - pos;
 			if (l > BTRFS_SEND_READ_SIZE)
@@ -3757,10 +3783,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
 			pos += ret;
 		}
 		ret = 0;
-	} else {
-		ret = send_clone(sctx, offset, len, clone_root);
 	}
-
 out:
 	return ret;
 }
@@ -3910,12 +3933,10 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 		    found_key.type != key.type) {
 			key.offset += right_len;
 			break;
-		} else {
-			if (found_key.offset != key.offset + right_len) {
-				/* Should really not happen */
-				ret = -EIO;
-				goto out;
-			}
+		}
+		if (found_key.offset != key.offset + right_len) {
+			ret = 0;
+			goto out;
 		}
 		key = found_key;
 	}
@@ -4496,9 +4517,11 @@ static int send_subvol(struct send_ctx *sctx)
 {
 	int ret;
 
-	ret = send_header(sctx);
-	if (ret < 0)
-		goto out;
+	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
+		ret = send_header(sctx);
+		if (ret < 0)
+			goto out;
+	}
 
 	ret = send_subvol_begin(sctx);
 	if (ret < 0)
@@ -4536,7 +4559,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_ioctl_send_args *arg = NULL;
 	struct btrfs_key key;
-	struct file *filp = NULL;
 	struct send_ctx *sctx = NULL;
 	u32 i;
 	u64 *clone_sources_tmp = NULL;
@@ -4544,9 +4566,44 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	send_root = BTRFS_I(fdentry(mnt_file)->d_inode)->root;
+	send_root = BTRFS_I(file_inode(mnt_file))->root;
 	fs_info = send_root->fs_info;
 
+	/*
+	 * This is done when we lookup the root, it should already be complete
+	 * by the time we get here.
+	 */
+	WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
+
+	/*
+	 * If we just created this root we need to make sure that the orphan
+	 * cleanup has been done and committed since we search the commit root,
+	 * so check its commit root transid with our otransid and if they match
+	 * commit the transaction to make sure everything is updated.
+	 */
+	down_read(&send_root->fs_info->extent_commit_sem);
+	if (btrfs_header_generation(send_root->commit_root) ==
+	    btrfs_root_otransid(&send_root->root_item)) {
+		struct btrfs_trans_handle *trans;
+
+		up_read(&send_root->fs_info->extent_commit_sem);
+
+		trans = btrfs_attach_transaction_barrier(send_root);
+		if (IS_ERR(trans)) {
+			if (PTR_ERR(trans) != -ENOENT) {
+				ret = PTR_ERR(trans);
+				goto out;
+			}
+			/* ENOENT means theres no transaction */
+		} else {
+			ret = btrfs_commit_transaction(trans, send_root);
+			if (ret)
+				goto out;
+		}
+	} else {
+		up_read(&send_root->fs_info->extent_commit_sem);
+	}
+
 	arg = memdup_user(arg_, sizeof(*arg));
 	if (IS_ERR(arg)) {
 		ret = PTR_ERR(arg);
@@ -4561,6 +4618,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 		goto out;
 	}
 
+	if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
 	if (!sctx) {
 		ret = -ENOMEM;
@@ -4572,9 +4634,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 	INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
 	INIT_LIST_HEAD(&sctx->name_cache_list);
 
+	sctx->flags = arg->flags;
+
 	sctx->send_filp = fget(arg->send_fd);
-	if (IS_ERR(sctx->send_filp)) {
-		ret = PTR_ERR(sctx->send_filp);
+	if (!sctx->send_filp) {
+		ret = -EBADF;
 		goto out;
 	}
 
@@ -4624,10 +4688,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 			key.type = BTRFS_ROOT_ITEM_KEY;
 			key.offset = (u64)-1;
 			clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
-			if (!clone_root) {
-				ret = -EINVAL;
-				goto out;
-			}
 			if (IS_ERR(clone_root)) {
 				ret = PTR_ERR(clone_root);
 				goto out;
@@ -4643,8 +4703,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 		key.type = BTRFS_ROOT_ITEM_KEY;
 		key.offset = (u64)-1;
 		sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
-		if (!sctx->parent_root) {
-			ret = -EINVAL;
+		if (IS_ERR(sctx->parent_root)) {
+			ret = PTR_ERR(sctx->parent_root);
 			goto out;
 		}
 	}
@@ -4665,16 +4725,16 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 	if (ret < 0)
 		goto out;
 
-	ret = begin_cmd(sctx, BTRFS_SEND_C_END);
-	if (ret < 0)
-		goto out;
-	ret = send_cmd(sctx);
-	if (ret < 0)
-		goto out;
+	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
+		ret = begin_cmd(sctx, BTRFS_SEND_C_END);
+		if (ret < 0)
+			goto out;
+		ret = send_cmd(sctx);
+		if (ret < 0)
+			goto out;
+	}
 
 out:
-	if (filp)
-		fput(filp);
 	kfree(arg);
 	vfree(clone_sources_tmp);
 
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 1bf4f32fd4e..48d425aef05 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -86,6 +86,7 @@ enum btrfs_send_cmd {
 	BTRFS_SEND_C_UTIMES,
 
 	BTRFS_SEND_C_END,
+	BTRFS_SEND_C_UPDATE_EXTENT,
 	__BTRFS_SEND_C_MAX,
 };
 #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
@@ -130,5 +131,4 @@ enum {
 
 #ifdef __KERNEL__
 long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
-int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off);
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d8982e9601d..8eb6191d86d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,21 +41,21 @@
 #include <linux/slab.h>
 #include <linux/cleancache.h>
 #include <linux/ratelimit.h>
+#include <linux/btrfs.h>
 #include "compat.h"
 #include "delayed-inode.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ioctl.h"
 #include "print-tree.h"
 #include "xattr.h"
 #include "volumes.h"
-#include "version.h"
 #include "export.h"
 #include "compression.h"
 #include "rcu-string.h"
 #include "dev-replace.h"
+#include "free-space-cache.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/btrfs.h>
@@ -63,10 +63,9 @@
 static const struct super_operations btrfs_super_ops;
 static struct file_system_type btrfs_fs_type;
 
-static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
-				      char nbuf[16])
+static const char *btrfs_decode_error(int errno)
 {
-	char *errstr = NULL;
+	char *errstr = "unknown";
 
 	switch (errno) {
 	case -EIO:
@@ -81,29 +80,24 @@ static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
 	case -EEXIST:
 		errstr = "Object already exists";
 		break;
-	default:
-		if (nbuf) {
-			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
-				errstr = nbuf;
-		}
+	case -ENOSPC:
+		errstr = "No space left";
+		break;
+	case -ENOENT:
+		errstr = "No such entry";
 		break;
 	}
 
 	return errstr;
 }
 
-static void __save_error_info(struct btrfs_fs_info *fs_info)
+static void save_error_info(struct btrfs_fs_info *fs_info)
 {
 	/*
 	 * today we only save the error info into ram.  Long term we'll
 	 * also send it down to the disk
 	 */
-	fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
-}
-
-static void save_error_info(struct btrfs_fs_info *fs_info)
-{
-	__save_error_info(fs_info);
+	set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
 }
 
 /* btrfs handle error by forcing the filesystem readonly */
@@ -114,9 +108,9 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
 	if (sb->s_flags & MS_RDONLY)
 		return;
 
-	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
 		sb->s_flags |= MS_RDONLY;
-		printk(KERN_INFO "btrfs is forced readonly\n");
+		btrfs_info(fs_info, "forced readonly");
 		/*
 		 * Note that a running device replace operation is not
 		 * canceled here although there is no way to update
@@ -127,7 +121,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
 		 * mounted writeable again, the device replace
 		 * operation continues.
 		 */
-//		WARN_ON(1);
 	}
 }
 
@@ -140,10 +133,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 		       unsigned int line, int errno, const char *fmt, ...)
 {
 	struct super_block *sb = fs_info->sb;
-	char nbuf[16];
 	const char *errstr;
-	va_list args;
-	va_start(args, fmt);
 
 	/*
 	 * Special case: if the error is EROFS, and we're already
@@ -152,26 +142,27 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 	if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
   		return;
 
-  	errstr = btrfs_decode_error(fs_info, errno, nbuf);
+	errstr = btrfs_decode_error(errno);
 	if (fmt) {
-		struct va_format vaf = {
-			.fmt = fmt,
-			.va = &args,
-		};
+		struct va_format vaf;
+		va_list args;
+
+		va_start(args, fmt);
+		vaf.fmt = fmt;
+		vaf.va = &args;
 
-		printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n",
-			sb->s_id, function, line, errstr, &vaf);
+		printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s (%pV)\n",
+			sb->s_id, function, line, errno, errstr, &vaf);
+		va_end(args);
 	} else {
-		printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
-			sb->s_id, function, line, errstr);
+		printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s\n",
+			sb->s_id, function, line, errno, errstr);
 	}
 
 	/* Don't go through full error handling during mount */
-	if (sb->s_flags & MS_BORN) {
-		save_error_info(fs_info);
+	save_error_info(fs_info);
+	if (sb->s_flags & MS_BORN)
 		btrfs_handle_error(fs_info);
-	}
-	va_end(args);
 }
 
 static const char * const logtypes[] = {
@@ -185,7 +176,7 @@ static const char * const logtypes[] = {
 	"debug",
 };
 
-void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 {
 	struct super_block *sb = fs_info->sb;
 	char lvl[4];
@@ -209,7 +200,7 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
 	vaf.fmt = fmt;
 	vaf.va = &args;
 
-	printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf);
+	printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf);
 
 	va_end(args);
 }
@@ -253,21 +244,30 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root, const char *function,
 			       unsigned int line, int errno)
 {
-	WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n");
+	/*
+	 * Report first abort since mount
+	 */
+	if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
+				&root->fs_info->fs_state)) {
+		WARN(1, KERN_DEBUG "btrfs: Transaction aborted (error %d)\n",
+				errno);
+	}
 	trans->aborted = errno;
 	/* Nothing used. The other threads that have joined this
 	 * transaction may be able to continue. */
 	if (!trans->blocks_used) {
-		char nbuf[16];
 		const char *errstr;
 
-		errstr = btrfs_decode_error(root->fs_info, errno, nbuf);
-		btrfs_printk(root->fs_info,
-			     "%s:%d: Aborting unused transaction(%s).\n",
-			     function, line, errstr);
+		errstr = btrfs_decode_error(errno);
+		btrfs_warn(root->fs_info,
+		           "%s:%d: Aborting unused transaction(%s).",
+		           function, line, errstr);
 		return;
 	}
 	ACCESS_ONCE(trans->transaction->aborted) = errno;
+	/* Wake up anybody who may be waiting on this transaction */
+	wake_up(&root->fs_info->transaction_wait);
+	wake_up(&root->fs_info->transaction_blocked_wait);
 	__btrfs_std_error(root->fs_info, function, line, errno, NULL);
 }
 /*
@@ -277,7 +277,6 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 		   unsigned int line, int errno, const char *fmt, ...)
 {
-	char nbuf[16];
 	char *s_id = "<unknown>";
 	const char *errstr;
 	struct va_format vaf = { .fmt = fmt };
@@ -289,13 +288,13 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 	va_start(args, fmt);
 	vaf.va = &args;
 
-	errstr = btrfs_decode_error(fs_info, errno, nbuf);
-	if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)
-		panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
-			s_id, function, line, &vaf, errstr);
+	errstr = btrfs_decode_error(errno);
+	if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
+		panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
+			s_id, function, line, &vaf, errno, errstr);
 
-	printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
-	       s_id, function, line, &vaf, errstr);
+	printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
+	       s_id, function, line, &vaf, errno, errstr);
 	va_end(args);
 	/* Caller calls BUG() */
 }
@@ -438,6 +437,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_compress_force:
 		case Opt_compress_force_type:
 			compress_force = true;
+			/* Fallthrough */
 		case Opt_compress:
 		case Opt_compress_type:
 			if (token == Opt_compress ||
@@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_alloc_start:
 			num = match_strdup(&args[0]);
 			if (num) {
+				mutex_lock(&info->chunk_mutex);
 				info->alloc_start = memparse(num, NULL);
+				mutex_unlock(&info->chunk_mutex);
 				kfree(num);
 				printk(KERN_INFO
 					"btrfs: allocations start at %llu\n",
@@ -648,7 +650,7 @@ out:
  */
 static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		void *holder, char **subvol_name, u64 *subvol_objectid,
-		u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
+		struct btrfs_fs_devices **fs_devices)
 {
 	substring_t args[MAX_OPT_ARGS];
 	char *device_name, *opts, *orig, *p;
@@ -691,16 +693,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 			}
 			break;
 		case Opt_subvolrootid:
-			intarg = 0;
-			error = match_int(&args[0], &intarg);
-			if (!error) {
-				/* we want the original fs_tree */
-				if (!intarg)
-					*subvol_rootid =
-						BTRFS_FS_TREE_OBJECTID;
-				else
-					*subvol_rootid = intarg;
-			}
+			printk(KERN_WARNING
+				"btrfs: 'subvolrootid' mount option is deprecated and has no effect\n");
 			break;
 		case Opt_device:
 			device_name = match_strdup(&args[0]);
@@ -784,9 +778,6 @@ find_root:
 	if (IS_ERR(new_root))
 		return ERR_CAST(new_root);
 
-	if (btrfs_root_refs(&new_root->root_item) == 0)
-		return ERR_PTR(-ENOENT);
-
 	dir_id = btrfs_root_dirid(&new_root->root_item);
 setup_root:
 	location.objectid = dir_id;
@@ -874,9 +865,9 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 		return 0;
 	}
 
-	btrfs_wait_ordered_extents(root, 0);
+	btrfs_wait_all_ordered_extents(fs_info, 1);
 
-	trans = btrfs_attach_transaction(root);
+	trans = btrfs_attach_transaction_barrier(root);
 	if (IS_ERR(trans)) {
 		/* no transaction, don't bother */
 		if (PTR_ERR(trans) == -ENOENT)
@@ -1078,7 +1069,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	fmode_t mode = FMODE_READ;
 	char *subvol_name = NULL;
 	u64 subvol_objectid = 0;
-	u64 subvol_rootid = 0;
 	int error = 0;
 
 	if (!(flags & MS_RDONLY))
@@ -1086,7 +1076,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 
 	error = btrfs_parse_early_options(data, mode, fs_type,
 					  &subvol_name, &subvol_objectid,
-					  &subvol_rootid, &fs_devices);
+					  &fs_devices);
 	if (error) {
 		kfree(subvol_name);
 		return ERR_PTR(error);
@@ -1200,6 +1190,41 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
 			      new_pool_size);
 }
 
+static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
+{
+	set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+
+static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
+				       unsigned long old_opts, int flags)
+{
+	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+	     (flags & MS_RDONLY))) {
+		/* wait for any defraggers to finish */
+		wait_event(fs_info->transaction_wait,
+			   (atomic_read(&fs_info->defrag_running) == 0));
+		if (flags & MS_RDONLY)
+			sync_filesystem(fs_info->sb);
+	}
+}
+
+static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
+					 unsigned long old_opts)
+{
+	/*
+	 * We need cleanup all defragable inodes if the autodefragment is
+	 * close or the fs is R/O.
+	 */
+	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+	     (fs_info->sb->s_flags & MS_RDONLY))) {
+		btrfs_cleanup_defrag_inodes(fs_info);
+	}
+
+	clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1213,17 +1238,20 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 	unsigned int old_metadata_ratio = fs_info->metadata_ratio;
 	int ret;
 
+	btrfs_remount_prepare(fs_info);
+
 	ret = btrfs_parse_options(root, data);
 	if (ret) {
 		ret = -EINVAL;
 		goto restore;
 	}
 
+	btrfs_remount_begin(fs_info, old_opts, *flags);
 	btrfs_resize_thread_pool(fs_info,
 		fs_info->thread_pool_size, old_thread_pool_size);
 
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
-		return 0;
+		goto out;
 
 	if (*flags & MS_RDONLY) {
 		/*
@@ -1234,6 +1262,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 
 		btrfs_dev_replace_suspend_for_unmount(fs_info);
 		btrfs_scrub_cancel(fs_info);
+		btrfs_pause_balance(fs_info);
 
 		ret = btrfs_commit_super(root);
 		if (ret)
@@ -1278,7 +1307,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		}
 		sb->s_flags &= ~MS_RDONLY;
 	}
-
+out:
+	btrfs_remount_cleanup(fs_info, old_opts);
 	return 0;
 
 restore:
@@ -1289,10 +1319,13 @@ restore:
 	fs_info->mount_opt = old_opts;
 	fs_info->compress_type = old_compress_type;
 	fs_info->max_inline = old_max_inline;
+	mutex_lock(&fs_info->chunk_mutex);
 	fs_info->alloc_start = old_alloc_start;
+	mutex_unlock(&fs_info->chunk_mutex);
 	btrfs_resize_thread_pool(fs_info,
 		old_thread_pool_size, fs_info->thread_pool_size);
 	fs_info->metadata_ratio = old_metadata_ratio;
+	btrfs_remount_cleanup(fs_info, old_opts);
 	return ret;
 }
 
@@ -1518,6 +1551,7 @@ static struct file_system_type btrfs_fs_type = {
 	.kill_sb	= btrfs_kill_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("btrfs");
 
 /*
  * used by btrfsctl to scan devices when no FS is mounted
@@ -1559,7 +1593,7 @@ static int btrfs_freeze(struct super_block *sb)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = btrfs_sb(sb)->tree_root;
 
-	trans = btrfs_attach_transaction(root);
+	trans = btrfs_attach_transaction_barrier(root);
 	if (IS_ERR(trans)) {
 		/* no transaction, don't bother */
 		if (PTR_ERR(trans) == -ENOENT)
@@ -1650,6 +1684,18 @@ static void btrfs_interface_exit(void)
 		printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
 }
 
+static void btrfs_print_info(void)
+{
+	printk(KERN_INFO "Btrfs loaded"
+#ifdef CONFIG_BTRFS_DEBUG
+			", debug=on"
+#endif
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+			", integrity-checker=on"
+#endif
+			"\n");
+}
+
 static int __init init_btrfs_fs(void)
 {
 	int err;
@@ -1684,21 +1730,29 @@ static int __init init_btrfs_fs(void)
 	if (err)
 		goto free_delayed_inode;
 
-	err = btrfs_interface_init();
+	err = btrfs_delayed_ref_init();
 	if (err)
 		goto free_auto_defrag;
 
+	err = btrfs_interface_init();
+	if (err)
+		goto free_delayed_ref;
+
 	err = register_filesystem(&btrfs_fs_type);
 	if (err)
 		goto unregister_ioctl;
 
 	btrfs_init_lockdep();
 
-	printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
+	btrfs_print_info();
+	btrfs_test_free_space_cache();
+
 	return 0;
 
 unregister_ioctl:
 	btrfs_interface_exit();
+free_delayed_ref:
+	btrfs_delayed_ref_exit();
 free_auto_defrag:
 	btrfs_auto_defrag_exit();
 free_delayed_inode:
@@ -1720,6 +1774,7 @@ free_compress:
 static void __exit exit_btrfs_fs(void)
 {
 	btrfs_destroy_cachep();
+	btrfs_delayed_ref_exit();
 	btrfs_auto_defrag_exit();
 	btrfs_delayed_inode_exit();
 	ordered_data_exit();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index daac9ae6d73..5b326cd60a4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -21,7 +21,6 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
-#include <linux/module.h>
 #include <linux/kobject.h>
 
 #include "ctree.h"
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index fc03aa60b68..d58cce77fc6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -34,13 +34,43 @@
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
-void put_transaction(struct btrfs_transaction *transaction)
+static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+	[TRANS_STATE_RUNNING]		= 0U,
+	[TRANS_STATE_BLOCKED]		= (__TRANS_USERSPACE |
+					   __TRANS_START),
+	[TRANS_STATE_COMMIT_START]	= (__TRANS_USERSPACE |
+					   __TRANS_START |
+					   __TRANS_ATTACH),
+	[TRANS_STATE_COMMIT_DOING]	= (__TRANS_USERSPACE |
+					   __TRANS_START |
+					   __TRANS_ATTACH |
+					   __TRANS_JOIN),
+	[TRANS_STATE_UNBLOCKED]		= (__TRANS_USERSPACE |
+					   __TRANS_START |
+					   __TRANS_ATTACH |
+					   __TRANS_JOIN |
+					   __TRANS_JOIN_NOLOCK),
+	[TRANS_STATE_COMPLETED]		= (__TRANS_USERSPACE |
+					   __TRANS_START |
+					   __TRANS_ATTACH |
+					   __TRANS_JOIN |
+					   __TRANS_JOIN_NOLOCK),
+};
+
+static void put_transaction(struct btrfs_transaction *transaction)
 {
 	WARN_ON(atomic_read(&transaction->use_count) == 0);
 	if (atomic_dec_and_test(&transaction->use_count)) {
 		BUG_ON(!list_empty(&transaction->list));
 		WARN_ON(transaction->delayed_refs.root.rb_node);
-		memset(transaction, 0, sizeof(*transaction));
+		while (!list_empty(&transaction->pending_chunks)) {
+			struct extent_map *em;
+
+			em = list_first_entry(&transaction->pending_chunks,
+					      struct extent_map, list);
+			list_del_init(&em->list);
+			free_extent_map(em);
+		}
 		kmem_cache_free(btrfs_transaction_cachep, transaction);
 	}
 }
@@ -51,10 +81,35 @@ static noinline void switch_commit_root(struct btrfs_root *root)
 	root->commit_root = btrfs_root_node(root);
 }
 
+static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
+					 unsigned int type)
+{
+	if (type & TRANS_EXTWRITERS)
+		atomic_inc(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
+					 unsigned int type)
+{
+	if (type & TRANS_EXTWRITERS)
+		atomic_dec(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_init(struct btrfs_transaction *trans,
+					  unsigned int type)
+{
+	atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
+}
+
+static inline int extwriter_counter_read(struct btrfs_transaction *trans)
+{
+	return atomic_read(&trans->num_extwriters);
+}
+
 /*
  * either allocate a new transaction or hop into the existing one
  */
-static noinline int join_transaction(struct btrfs_root *root, int type)
+static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
 {
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -62,33 +117,24 @@ static noinline int join_transaction(struct btrfs_root *root, int type)
 	spin_lock(&fs_info->trans_lock);
 loop:
 	/* The file system has been taken offline. No new transactions. */
-	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
 		spin_unlock(&fs_info->trans_lock);
 		return -EROFS;
 	}
 
-	if (fs_info->trans_no_join) {
-		/* 
-		 * If we are JOIN_NOLOCK we're already committing a current
-		 * transaction, we just need a handle to deal with something
-		 * when committing the transaction, such as inode cache and
-		 * space cache. It is a special case.
-		 */
-		if (type != TRANS_JOIN_NOLOCK) {
-			spin_unlock(&fs_info->trans_lock);
-			return -EBUSY;
-		}
-	}
-
 	cur_trans = fs_info->running_transaction;
 	if (cur_trans) {
 		if (cur_trans->aborted) {
 			spin_unlock(&fs_info->trans_lock);
 			return cur_trans->aborted;
 		}
+		if (btrfs_blocked_trans_types[cur_trans->state] & type) {
+			spin_unlock(&fs_info->trans_lock);
+			return -EBUSY;
+		}
 		atomic_inc(&cur_trans->use_count);
 		atomic_inc(&cur_trans->num_writers);
-		cur_trans->num_joined++;
+		extwriter_counter_inc(cur_trans, type);
 		spin_unlock(&fs_info->trans_lock);
 		return 0;
 	}
@@ -101,6 +147,12 @@ loop:
 	if (type == TRANS_ATTACH)
 		return -ENOENT;
 
+	/*
+	 * JOIN_NOLOCK only happens during the transaction commit, so
+	 * it is impossible that ->running_transaction is NULL
+	 */
+	BUG_ON(type == TRANS_JOIN_NOLOCK);
+
 	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
 	if (!cur_trans)
 		return -ENOMEM;
@@ -109,29 +161,26 @@ loop:
 	if (fs_info->running_transaction) {
 		/*
 		 * someone started a transaction after we unlocked.  Make sure
-		 * to redo the trans_no_join checks above
+		 * to redo the checks above
 		 */
 		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
-		cur_trans = fs_info->running_transaction;
 		goto loop;
-	} else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+	} else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
 		spin_unlock(&fs_info->trans_lock);
 		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
 		return -EROFS;
 	}
 
 	atomic_set(&cur_trans->num_writers, 1);
-	cur_trans->num_joined = 0;
+	extwriter_counter_init(cur_trans, type);
 	init_waitqueue_head(&cur_trans->writer_wait);
 	init_waitqueue_head(&cur_trans->commit_wait);
-	cur_trans->in_commit = 0;
-	cur_trans->blocked = 0;
+	cur_trans->state = TRANS_STATE_RUNNING;
 	/*
 	 * One for this trans handle, one so it will live on until we
 	 * commit the transaction.
 	 */
 	atomic_set(&cur_trans->use_count, 2);
-	cur_trans->commit_done = 0;
 	cur_trans->start_time = get_seconds();
 
 	cur_trans->delayed_refs.root = RB_ROOT;
@@ -152,12 +201,16 @@ loop:
 	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
 		WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
 			"creating a fresh transaction\n");
-	atomic_set(&fs_info->tree_mod_seq, 0);
+	atomic64_set(&fs_info->tree_mod_seq, 0);
 
-	spin_lock_init(&cur_trans->commit_lock);
 	spin_lock_init(&cur_trans->delayed_refs.lock);
+	atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
+	atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
+	init_waitqueue_head(&cur_trans->delayed_refs.wait);
 
 	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+	INIT_LIST_HEAD(&cur_trans->ordered_operations);
+	INIT_LIST_HEAD(&cur_trans->pending_chunks);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(&cur_trans->dirty_pages,
 			     fs_info->btree_inode->i_mapping);
@@ -255,6 +308,13 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static inline int is_transaction_blocked(struct btrfs_transaction *trans)
+{
+	return (trans->state >= TRANS_STATE_BLOCKED &&
+		trans->state < TRANS_STATE_UNBLOCKED &&
+		!trans->aborted);
+}
+
 /* wait for commit against the current transaction to become unblocked
  * when this is done, it is safe to start a new transaction, but the current
  * transaction might not be fully on disk.
@@ -265,12 +325,13 @@ static void wait_current_trans(struct btrfs_root *root)
 
 	spin_lock(&root->fs_info->trans_lock);
 	cur_trans = root->fs_info->running_transaction;
-	if (cur_trans && cur_trans->blocked) {
+	if (cur_trans && is_transaction_blocked(cur_trans)) {
 		atomic_inc(&cur_trans->use_count);
 		spin_unlock(&root->fs_info->trans_lock);
 
 		wait_event(root->fs_info->transaction_wait,
-			   !cur_trans->blocked);
+			   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
+			   cur_trans->aborted);
 		put_transaction(cur_trans);
 	} else {
 		spin_unlock(&root->fs_info->trans_lock);
@@ -293,7 +354,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
 }
 
 static struct btrfs_trans_handle *
-start_transaction(struct btrfs_root *root, u64 num_items, int type,
+start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 		  enum btrfs_reserve_flush_enum flush)
 {
 	struct btrfs_trans_handle *h;
@@ -302,11 +363,11 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
 	int ret;
 	u64 qgroup_reserved = 0;
 
-	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
 		return ERR_PTR(-EROFS);
 
 	if (current->journal_info) {
-		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
+		WARN_ON(type & TRANS_EXTWRITERS);
 		h = current->journal_info;
 		h->use_count++;
 		WARN_ON(h->use_count > 2);
@@ -352,7 +413,7 @@ again:
 	 * If we are ATTACH, it means we just want to catch the current
 	 * transaction and commit it, so we needn't do sb_start_intwrite(). 
 	 */
-	if (type < TRANS_JOIN_NOLOCK)
+	if (type & __TRANS_FREEZABLE)
 		sb_start_intwrite(root->fs_info->sb);
 
 	if (may_wait_transaction(root, type))
@@ -360,8 +421,11 @@ again:
 
 	do {
 		ret = join_transaction(root, type);
-		if (ret == -EBUSY)
+		if (ret == -EBUSY) {
 			wait_current_trans(root);
+			if (unlikely(type == TRANS_ATTACH))
+				ret = -ENOENT;
+		}
 	} while (ret == -EBUSY);
 
 	if (ret < 0) {
@@ -383,14 +447,16 @@ again:
 	h->block_rsv = NULL;
 	h->orig_rsv = NULL;
 	h->aborted = 0;
-	h->qgroup_reserved = qgroup_reserved;
+	h->qgroup_reserved = 0;
 	h->delayed_ref_elem.seq = 0;
 	h->type = type;
+	h->allocating_chunk = false;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
 	INIT_LIST_HEAD(&h->new_bgs);
 
 	smp_mb();
-	if (cur_trans->blocked && may_wait_transaction(root, type)) {
+	if (cur_trans->state >= TRANS_STATE_BLOCKED &&
+	    may_wait_transaction(root, type)) {
 		btrfs_commit_transaction(h, root);
 		goto again;
 	}
@@ -401,6 +467,7 @@ again:
 		h->block_rsv = &root->fs_info->trans_block_rsv;
 		h->bytes_reserved = num_bytes;
 	}
+	h->qgroup_reserved = qgroup_reserved;
 
 got_it:
 	btrfs_record_root_in_trans(h, root);
@@ -410,7 +477,7 @@ got_it:
 	return h;
 
 join_fail:
-	if (type < TRANS_JOIN_NOLOCK)
+	if (type & __TRANS_FREEZABLE)
 		sb_end_intwrite(root->fs_info->sb);
 	kmem_cache_free(btrfs_trans_handle_cachep, h);
 alloc_fail:
@@ -452,16 +519,48 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
 	return start_transaction(root, 0, TRANS_USERSPACE, 0);
 }
 
+/*
+ * btrfs_attach_transaction() - catch the running transaction
+ *
+ * It is used when we want to commit the current the transaction, but
+ * don't want to start a new one.
+ *
+ * Note: If this function return -ENOENT, it just means there is no
+ * running transaction. But it is possible that the inactive transaction
+ * is still in the memory, not fully on disk. If you hope there is no
+ * inactive transaction in the fs when -ENOENT is returned, you should
+ * invoke
+ *     btrfs_attach_transaction_barrier()
+ */
 struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
 {
 	return start_transaction(root, 0, TRANS_ATTACH, 0);
 }
 
+/*
+ * btrfs_attach_transaction_barrier() - catch the running transaction
+ *
+ * It is similar to the above function, the differentia is this one
+ * will wait for all the inactive transactions until they fully
+ * complete.
+ */
+struct btrfs_trans_handle *
+btrfs_attach_transaction_barrier(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle *trans;
+
+	trans = start_transaction(root, 0, TRANS_ATTACH, 0);
+	if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
+		btrfs_wait_for_commit(root, 0);
+
+	return trans;
+}
+
 /* wait for a transaction commit to be fully complete */
 static noinline void wait_for_commit(struct btrfs_root *root,
 				    struct btrfs_transaction *commit)
 {
-	wait_event(commit->commit_wait, commit->commit_done);
+	wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
 }
 
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -497,8 +596,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
 		spin_lock(&root->fs_info->trans_lock);
 		list_for_each_entry_reverse(t, &root->fs_info->trans_list,
 					    list) {
-			if (t->in_commit) {
-				if (t->commit_done)
+			if (t->state >= TRANS_STATE_COMMIT_START) {
+				if (t->state == TRANS_STATE_COMPLETED)
 					break;
 				cur_trans = t;
 				atomic_inc(&cur_trans->use_count);
@@ -525,10 +624,11 @@ void btrfs_throttle(struct btrfs_root *root)
 static int should_end_transaction(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root)
 {
-	int ret;
+	if (root->fs_info->global_block_rsv.space_info->full &&
+	    btrfs_should_throttle_delayed_refs(trans, root))
+		return 1;
 
-	ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
-	return ret ? 1 : 0;
+	return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
 }
 
 int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
@@ -539,7 +639,8 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 	int err;
 
 	smp_mb();
-	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
+	if (cur_trans->state >= TRANS_STATE_BLOCKED ||
+	    cur_trans->delayed_refs.flushing)
 		return 1;
 
 	updates = trans->delayed_ref_updates;
@@ -558,7 +659,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_fs_info *info = root->fs_info;
-	int count = 0;
+	unsigned long cur = trans->delayed_ref_updates;
 	int lock = (trans->type != TRANS_JOIN_NOLOCK);
 	int err = 0;
 
@@ -574,32 +675,26 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
-	/*
-	 * the same root has to be passed to start_transaction and
-	 * end_transaction. Subvolume quota depends on this.
-	 */
-	WARN_ON(trans->root != root);
 
 	if (trans->qgroup_reserved) {
-		btrfs_qgroup_free(root, trans->qgroup_reserved);
+		/*
+		 * the same root has to be passed here between start_transaction
+		 * and end_transaction. Subvolume quota depends on this.
+		 */
+		btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
 		trans->qgroup_reserved = 0;
 	}
 
 	if (!list_empty(&trans->new_bgs))
 		btrfs_create_pending_block_groups(trans, root);
 
-	while (count < 2) {
-		unsigned long cur = trans->delayed_ref_updates;
+	trans->delayed_ref_updates = 0;
+	if (btrfs_should_throttle_delayed_refs(trans, root)) {
+		cur = max_t(unsigned long, cur, 1);
 		trans->delayed_ref_updates = 0;
-		if (cur &&
-		    trans->transaction->delayed_refs.num_heads_ready > 64) {
-			trans->delayed_ref_updates = 0;
-			btrfs_run_delayed_refs(trans, root, cur);
-		} else {
-			break;
-		}
-		count++;
+		btrfs_run_delayed_refs(trans, root, cur);
 	}
+
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
 
@@ -607,12 +702,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		btrfs_create_pending_block_groups(trans, root);
 
 	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
-	    should_end_transaction(trans, root)) {
-		trans->transaction->blocked = 1;
-		smp_wmb();
+	    should_end_transaction(trans, root) &&
+	    ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
+		spin_lock(&info->trans_lock);
+		if (cur_trans->state == TRANS_STATE_RUNNING)
+			cur_trans->state = TRANS_STATE_BLOCKED;
+		spin_unlock(&info->trans_lock);
 	}
 
-	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
+	if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
 		if (throttle) {
 			/*
 			 * We may race with somebody else here so end up having
@@ -626,12 +724,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	if (trans->type < TRANS_JOIN_NOLOCK)
+	if (trans->type & __TRANS_FREEZABLE)
 		sb_end_intwrite(root->fs_info->sb);
 
 	WARN_ON(cur_trans != info->running_transaction);
 	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
 	atomic_dec(&cur_trans->num_writers);
+	extwriter_counter_dec(cur_trans, trans->type);
 
 	smp_mb();
 	if (waitqueue_active(&cur_trans->writer_wait))
@@ -645,12 +744,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		btrfs_run_delayed_iputs(root);
 
 	if (trans->aborted ||
-	    root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+	    test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
 		err = -EIO;
-	}
 	assert_qgroups_uptodate(trans);
 
-	memset(trans, 0, sizeof(*trans));
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 	return err;
 }
@@ -658,23 +755,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root)
 {
-	int ret;
-
-	ret = __btrfs_end_transaction(trans, root, 0);
-	if (ret)
-		return ret;
-	return 0;
+	return __btrfs_end_transaction(trans, root, 0);
 }
 
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
-	int ret;
-
-	ret = __btrfs_end_transaction(trans, root, 1);
-	if (ret)
-		return ret;
-	return 0;
+	return __btrfs_end_transaction(trans, root, 1);
 }
 
 int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
@@ -755,8 +842,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 {
 	int ret;
 	int ret2;
+	struct blk_plug plug;
 
+	blk_start_plug(&plug);
 	ret = btrfs_write_marked_extents(root, dirty_pages, mark);
+	blk_finish_plug(&plug);
 	ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
 
 	if (ret)
@@ -896,7 +986,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 int btrfs_add_dead_root(struct btrfs_root *root)
 {
 	spin_lock(&root->fs_info->trans_lock);
-	list_add(&root->root_list, &root->fs_info->dead_roots);
+	list_add_tail(&root->root_list, &root->fs_info->dead_roots);
 	spin_unlock(&root->fs_info->trans_lock);
 	return 0;
 }
@@ -961,10 +1051,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
 }
 
 /*
- * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
- * otherwise every leaf in the btree is read and defragged.
+ * defrag a given btree.
+ * Every leaf in the btree is read and defragged.
  */
-int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
+int btrfs_defrag_root(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_trans_handle *trans;
@@ -978,7 +1068,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
 
-		ret = btrfs_defrag_leaves(trans, root, cacheonly);
+		ret = btrfs_defrag_leaves(trans, root);
 
 		btrfs_end_transaction(trans, root);
 		btrfs_btree_balance_dirty(info->tree_root);
@@ -986,6 +1076,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 
 		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
 			break;
+
+		if (btrfs_defrag_cancelled(root->fs_info)) {
+			printk(KERN_DEBUG "btrfs: defrag_root cancelled\n");
+			ret = -EAGAIN;
+			break;
+		}
 	}
 	root->defrag_running = 0;
 	return ret;
@@ -993,7 +1089,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 
 /*
  * new snapshots need to be created at a very specific time in the
- * transaction commit.  This does the actual creation
+ * transaction commit.  This does the actual creation.
+ *
+ * Note:
+ * If the error which may affect the commitment of the current transaction
+ * happens, we should return the error number. If the error which just affect
+ * the creation of the pending snapshots, just return 0.
  */
 static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				   struct btrfs_fs_info *fs_info,
@@ -1008,12 +1109,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	struct inode *parent_inode;
 	struct btrfs_path *path;
 	struct btrfs_dir_item *dir_item;
-	struct dentry *parent;
 	struct dentry *dentry;
 	struct extent_buffer *tmp;
 	struct extent_buffer *old;
 	struct timespec cur_time = CURRENT_TIME;
-	int ret;
+	int ret = 0;
 	u64 to_reserve = 0;
 	u64 index = 0;
 	u64 objectid;
@@ -1022,40 +1122,36 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		ret = pending->error = -ENOMEM;
-		goto path_alloc_fail;
+		pending->error = -ENOMEM;
+		return 0;
 	}
 
 	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
 	if (!new_root_item) {
-		ret = pending->error = -ENOMEM;
+		pending->error = -ENOMEM;
 		goto root_item_alloc_fail;
 	}
 
-	ret = btrfs_find_free_objectid(tree_root, &objectid);
-	if (ret) {
-		pending->error = ret;
+	pending->error = btrfs_find_free_objectid(tree_root, &objectid);
+	if (pending->error)
 		goto no_free_objectid;
-	}
 
 	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
 
 	if (to_reserve > 0) {
-		ret = btrfs_block_rsv_add(root, &pending->block_rsv,
-					  to_reserve,
-					  BTRFS_RESERVE_NO_FLUSH);
-		if (ret) {
-			pending->error = ret;
+		pending->error = btrfs_block_rsv_add(root,
+						     &pending->block_rsv,
+						     to_reserve,
+						     BTRFS_RESERVE_NO_FLUSH);
+		if (pending->error)
 			goto no_free_objectid;
-		}
 	}
 
-	ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
-				   objectid, pending->inherit);
-	if (ret) {
-		pending->error = ret;
+	pending->error = btrfs_qgroup_inherit(trans, fs_info,
+					      root->root_key.objectid,
+					      objectid, pending->inherit);
+	if (pending->error)
 		goto no_free_objectid;
-	}
 
 	key.objectid = objectid;
 	key.offset = (u64)-1;
@@ -1063,10 +1159,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
 	rsv = trans->block_rsv;
 	trans->block_rsv = &pending->block_rsv;
+	trans->bytes_reserved = trans->block_rsv->reserved;
 
 	dentry = pending->dentry;
-	parent = dget_parent(dentry);
-	parent_inode = parent->d_inode;
+	parent_inode = pending->dir;
 	parent_root = BTRFS_I(parent_inode)->root;
 	record_root_in_trans(trans, parent_root);
 
@@ -1083,7 +1179,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 					 dentry->d_name.len, 0);
 	if (dir_item != NULL && !IS_ERR(dir_item)) {
 		pending->error = -EEXIST;
-		goto fail;
+		goto dir_item_existed;
 	} else if (IS_ERR(dir_item)) {
 		ret = PTR_ERR(dir_item);
 		btrfs_abort_transaction(trans, root, ret);
@@ -1121,13 +1217,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
 	memcpy(new_root_item->parent_uuid, root->root_item.uuid,
 			BTRFS_UUID_SIZE);
+	if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
+		memset(new_root_item->received_uuid, 0,
+		       sizeof(new_root_item->received_uuid));
+		memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
+		memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
+		btrfs_set_root_stransid(new_root_item, 0);
+		btrfs_set_root_rtransid(new_root_item, 0);
+	}
 	new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec);
 	new_root_item->otime.nsec = cpu_to_le32(cur_time.tv_nsec);
 	btrfs_set_root_otransid(new_root_item, trans->transid);
-	memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
-	memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
-	btrfs_set_root_stransid(new_root_item, 0);
-	btrfs_set_root_rtransid(new_root_item, 0);
 
 	old = btrfs_lock_root_node(root);
 	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
@@ -1214,14 +1314,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	if (ret)
 		btrfs_abort_transaction(trans, root, ret);
 fail:
-	dput(parent);
+	pending->error = ret;
+dir_item_existed:
 	trans->block_rsv = rsv;
+	trans->bytes_reserved = 0;
 no_free_objectid:
 	kfree(new_root_item);
 root_item_alloc_fail:
 	btrfs_free_path(path);
-path_alloc_fail:
-	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
 	return ret;
 }
 
@@ -1231,12 +1331,17 @@ path_alloc_fail:
 static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
 					     struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_pending_snapshot *pending;
+	struct btrfs_pending_snapshot *pending, *next;
 	struct list_head *head = &trans->transaction->pending_snapshots;
+	int ret = 0;
 
-	list_for_each_entry(pending, head, list)
-		create_pending_snapshot(trans, fs_info, pending);
-	return 0;
+	list_for_each_entry_safe(pending, next, head, list) {
+		list_del(&pending->list);
+		ret = create_pending_snapshot(trans, fs_info, pending);
+		if (ret)
+			break;
+	}
+	return ret;
 }
 
 static void update_super_roots(struct btrfs_root *root)
@@ -1261,20 +1366,26 @@ static void update_super_roots(struct btrfs_root *root)
 
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
 {
+	struct btrfs_transaction *trans;
 	int ret = 0;
+
 	spin_lock(&info->trans_lock);
-	if (info->running_transaction)
-		ret = info->running_transaction->in_commit;
+	trans = info->running_transaction;
+	if (trans)
+		ret = (trans->state >= TRANS_STATE_COMMIT_START);
 	spin_unlock(&info->trans_lock);
 	return ret;
 }
 
 int btrfs_transaction_blocked(struct btrfs_fs_info *info)
 {
+	struct btrfs_transaction *trans;
 	int ret = 0;
+
 	spin_lock(&info->trans_lock);
-	if (info->running_transaction)
-		ret = info->running_transaction->blocked;
+	trans = info->running_transaction;
+	if (trans)
+		ret = is_transaction_blocked(trans);
 	spin_unlock(&info->trans_lock);
 	return ret;
 }
@@ -1286,7 +1397,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
 static void wait_current_trans_commit_start(struct btrfs_root *root,
 					    struct btrfs_transaction *trans)
 {
-	wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
+	wait_event(root->fs_info->transaction_blocked_wait,
+		   trans->state >= TRANS_STATE_COMMIT_START ||
+		   trans->aborted);
 }
 
 /*
@@ -1297,7 +1410,8 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
 					 struct btrfs_transaction *trans)
 {
 	wait_event(root->fs_info->transaction_wait,
-		   trans->commit_done || (trans->in_commit && !trans->blocked));
+		   trans->state >= TRANS_STATE_UNBLOCKED ||
+		   trans->aborted);
 }
 
 /*
@@ -1307,13 +1421,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
 struct btrfs_async_commit {
 	struct btrfs_trans_handle *newtrans;
 	struct btrfs_root *root;
-	struct delayed_work work;
+	struct work_struct work;
 };
 
 static void do_async_commit(struct work_struct *work)
 {
 	struct btrfs_async_commit *ac =
-		container_of(work, struct btrfs_async_commit, work.work);
+		container_of(work, struct btrfs_async_commit, work);
 
 	/*
 	 * We've got freeze protection passed with the transaction.
@@ -1341,7 +1455,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 	if (!ac)
 		return -ENOMEM;
 
-	INIT_DELAYED_WORK(&ac->work, do_async_commit);
+	INIT_WORK(&ac->work, do_async_commit);
 	ac->root = root;
 	ac->newtrans = btrfs_join_transaction(root);
 	if (IS_ERR(ac->newtrans)) {
@@ -1365,7 +1479,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 			&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
 			1, _THIS_IP_);
 
-	schedule_delayed_work(&ac->work, 0);
+	schedule_work(&ac->work);
 
 	/* wait for transaction to start and unblock */
 	if (wait_for_unblock)
@@ -1385,21 +1499,39 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, int err)
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
+	DEFINE_WAIT(wait);
 
 	WARN_ON(trans->use_count > 1);
 
 	btrfs_abort_transaction(trans, root, err);
 
 	spin_lock(&root->fs_info->trans_lock);
+
+	/*
+	 * If the transaction is removed from the list, it means this
+	 * transaction has been committed successfully, so it is impossible
+	 * to call the cleanup function.
+	 */
+	BUG_ON(list_empty(&cur_trans->list));
+
 	list_del_init(&cur_trans->list);
 	if (cur_trans == root->fs_info->running_transaction) {
-		root->fs_info->running_transaction = NULL;
-		root->fs_info->trans_no_join = 0;
+		cur_trans->state = TRANS_STATE_COMMIT_DOING;
+		spin_unlock(&root->fs_info->trans_lock);
+		wait_event(cur_trans->writer_wait,
+			   atomic_read(&cur_trans->num_writers) == 1);
+
+		spin_lock(&root->fs_info->trans_lock);
 	}
 	spin_unlock(&root->fs_info->trans_lock);
 
 	btrfs_cleanup_one_transaction(trans->transaction, root);
 
+	spin_lock(&root->fs_info->trans_lock);
+	if (cur_trans == root->fs_info->running_transaction)
+		root->fs_info->running_transaction = NULL;
+	spin_unlock(&root->fs_info->trans_lock);
+
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 
@@ -1416,22 +1548,8 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
 static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root)
 {
-	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
-	int snap_pending = 0;
 	int ret;
 
-	if (!flush_on_commit) {
-		spin_lock(&root->fs_info->trans_lock);
-		if (!list_empty(&trans->transaction->pending_snapshots))
-			snap_pending = 1;
-		spin_unlock(&root->fs_info->trans_lock);
-	}
-
-	if (flush_on_commit || snap_pending) {
-		btrfs_start_delalloc_inodes(root, 1);
-		btrfs_wait_ordered_extents(root, 1);
-	}
-
 	ret = btrfs_run_delayed_items(trans, root);
 	if (ret)
 		return ret;
@@ -1450,50 +1568,60 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
 	 * it here and no for sure that nothing new will be added
 	 * to the list
 	 */
-	btrfs_run_ordered_operations(root, 1);
+	ret = btrfs_run_ordered_operations(trans, root, 1);
 
+	return ret;
+}
+
+static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+	if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+		return btrfs_start_all_delalloc_inodes(fs_info, 1);
 	return 0;
 }
 
-/*
- * btrfs_transaction state sequence:
- *    in_commit = 0, blocked = 0  (initial)
- *    in_commit = 1, blocked = 1
- *    blocked = 0
- *    commit_done = 1
- */
+static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+	if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+		btrfs_wait_all_ordered_extents(fs_info, 1);
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
-	unsigned long joined = 0;
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_transaction *prev_trans = NULL;
-	DEFINE_WAIT(wait);
 	int ret;
-	int should_grow = 0;
-	unsigned long now = get_seconds();
 
-	ret = btrfs_run_ordered_operations(root, 0);
+	ret = btrfs_run_ordered_operations(trans, root, 0);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
-		goto cleanup_transaction;
+		btrfs_end_transaction(trans, root);
+		return ret;
 	}
 
 	/* Stop the commit early if ->aborted is set */
 	if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
 		ret = cur_trans->aborted;
-		goto cleanup_transaction;
+		btrfs_end_transaction(trans, root);
+		return ret;
 	}
 
 	/* make a pass through all the delayed refs we have so far
 	 * any runnings procs may add more while we are here
 	 */
 	ret = btrfs_run_delayed_refs(trans, root, 0);
-	if (ret)
-		goto cleanup_transaction;
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
 
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
+	if (trans->qgroup_reserved) {
+		btrfs_qgroup_free(root, trans->qgroup_reserved);
+		trans->qgroup_reserved = 0;
+	}
 
 	cur_trans = trans->transaction;
 
@@ -1502,17 +1630,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 * start sending their work down.
 	 */
 	cur_trans->delayed_refs.flushing = 1;
+	smp_wmb();
 
 	if (!list_empty(&trans->new_bgs))
 		btrfs_create_pending_block_groups(trans, root);
 
 	ret = btrfs_run_delayed_refs(trans, root, 0);
-	if (ret)
-		goto cleanup_transaction;
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
 
-	spin_lock(&cur_trans->commit_lock);
-	if (cur_trans->in_commit) {
-		spin_unlock(&cur_trans->commit_lock);
+	spin_lock(&root->fs_info->trans_lock);
+	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+		spin_unlock(&root->fs_info->trans_lock);
 		atomic_inc(&cur_trans->use_count);
 		ret = btrfs_end_transaction(trans, root);
 
@@ -1523,16 +1654,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		return ret;
 	}
 
-	trans->transaction->in_commit = 1;
-	trans->transaction->blocked = 1;
-	spin_unlock(&cur_trans->commit_lock);
+	cur_trans->state = TRANS_STATE_COMMIT_START;
 	wake_up(&root->fs_info->transaction_blocked_wait);
 
-	spin_lock(&root->fs_info->trans_lock);
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
 		prev_trans = list_entry(cur_trans->list.prev,
 					struct btrfs_transaction, list);
-		if (!prev_trans->commit_done) {
+		if (prev_trans->state != TRANS_STATE_COMPLETED) {
 			atomic_inc(&prev_trans->use_count);
 			spin_unlock(&root->fs_info->trans_lock);
 
@@ -1546,42 +1674,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		spin_unlock(&root->fs_info->trans_lock);
 	}
 
-	if (!btrfs_test_opt(root, SSD) &&
-	    (now < cur_trans->start_time || now - cur_trans->start_time < 1))
-		should_grow = 1;
-
-	do {
-		joined = cur_trans->num_joined;
-
-		WARN_ON(cur_trans != trans->transaction);
+	extwriter_counter_dec(cur_trans, trans->type);
 
-		ret = btrfs_flush_all_pending_stuffs(trans, root);
-		if (ret)
-			goto cleanup_transaction;
-
-		prepare_to_wait(&cur_trans->writer_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
+	ret = btrfs_start_delalloc_flush(root->fs_info);
+	if (ret)
+		goto cleanup_transaction;
 
-		if (atomic_read(&cur_trans->num_writers) > 1)
-			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
-		else if (should_grow)
-			schedule_timeout(1);
+	ret = btrfs_flush_all_pending_stuffs(trans, root);
+	if (ret)
+		goto cleanup_transaction;
 
-		finish_wait(&cur_trans->writer_wait, &wait);
-	} while (atomic_read(&cur_trans->num_writers) > 1 ||
-		 (should_grow && cur_trans->num_joined != joined));
+	wait_event(cur_trans->writer_wait,
+		   extwriter_counter_read(cur_trans) == 0);
 
+	/* some pending stuffs might be added after the previous flush. */
 	ret = btrfs_flush_all_pending_stuffs(trans, root);
 	if (ret)
 		goto cleanup_transaction;
 
+	btrfs_wait_delalloc_flush(root->fs_info);
 	/*
 	 * Ok now we need to make sure to block out any other joins while we
 	 * commit the transaction.  We could have started a join before setting
-	 * no_join so make sure to wait for num_writers to == 1 again.
+	 * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
 	 */
 	spin_lock(&root->fs_info->trans_lock);
-	root->fs_info->trans_no_join = 1;
+	cur_trans->state = TRANS_STATE_COMMIT_DOING;
 	spin_unlock(&root->fs_info->trans_lock);
 	wait_event(cur_trans->writer_wait,
 		   atomic_read(&cur_trans->num_writers) == 1);
@@ -1708,10 +1826,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
 	       sizeof(*root->fs_info->super_copy));
 
-	trans->transaction->blocked = 0;
 	spin_lock(&root->fs_info->trans_lock);
+	cur_trans->state = TRANS_STATE_UNBLOCKED;
 	root->fs_info->running_transaction = NULL;
-	root->fs_info->trans_no_join = 0;
 	spin_unlock(&root->fs_info->trans_lock);
 	mutex_unlock(&root->fs_info->reloc_mutex);
 
@@ -1720,7 +1837,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	if (ret) {
 		btrfs_error(root->fs_info, ret,
-			    "Error while writing out transaction.");
+			    "Error while writing out transaction");
 		mutex_unlock(&root->fs_info->tree_log_mutex);
 		goto cleanup_transaction;
 	}
@@ -1739,10 +1856,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_finish_extent_commit(trans, root);
 
-	cur_trans->commit_done = 1;
-
 	root->fs_info->last_trans_committed = cur_trans->transid;
-
+	/*
+	 * We needn't acquire the lock here because there is no other task
+	 * which can change it.
+	 */
+	cur_trans->state = TRANS_STATE_COMPLETED;
 	wake_up(&cur_trans->commit_wait);
 
 	spin_lock(&root->fs_info->trans_lock);
@@ -1752,7 +1871,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 
-	if (trans->type < TRANS_JOIN_NOLOCK)
+	if (trans->type & __TRANS_FREEZABLE)
 		sb_end_intwrite(root->fs_info->sb);
 
 	trace_btrfs_transaction_commit(root);
@@ -1772,8 +1891,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 cleanup_transaction:
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
-	btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n");
-//	WARN_ON(1);
+	if (trans->qgroup_reserved) {
+		btrfs_qgroup_free(root, trans->qgroup_reserved);
+		trans->qgroup_reserved = 0;
+	}
+	btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
 	cleanup_transaction(trans, root, ret);
@@ -1782,31 +1904,44 @@ cleanup_transaction:
 }
 
 /*
- * interface function to delete all the snapshots we have scheduled for deletion
+ * return < 0 if error
+ * 0 if there are no more dead_roots at the time of call
+ * 1 there are more to be processed, call me again
+ *
+ * The return value indicates there are certainly more snapshots to delete, but
+ * if there comes a new one during processing, it may return 0. We don't mind,
+ * because btrfs_commit_super will poke cleaner thread and it will process it a
+ * few seconds later.
  */
-int btrfs_clean_old_snapshots(struct btrfs_root *root)
+int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
 {
-	LIST_HEAD(list);
+	int ret;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	spin_lock(&fs_info->trans_lock);
-	list_splice_init(&fs_info->dead_roots, &list);
+	if (list_empty(&fs_info->dead_roots)) {
+		spin_unlock(&fs_info->trans_lock);
+		return 0;
+	}
+	root = list_first_entry(&fs_info->dead_roots,
+			struct btrfs_root, root_list);
+	list_del(&root->root_list);
 	spin_unlock(&fs_info->trans_lock);
 
-	while (!list_empty(&list)) {
-		int ret;
-
-		root = list_entry(list.next, struct btrfs_root, root_list);
-		list_del(&root->root_list);
+	pr_debug("btrfs: cleaner removing %llu\n",
+			(unsigned long long)root->objectid);
 
-		btrfs_kill_all_delayed_nodes(root);
+	btrfs_kill_all_delayed_nodes(root);
 
-		if (btrfs_header_backref_rev(root->node) <
-		    BTRFS_MIXED_BACKREF_REV)
-			ret = btrfs_drop_snapshot(root, NULL, 0, 0);
-		else
-			ret =btrfs_drop_snapshot(root, NULL, 1, 0);
-		BUG_ON(ret < 0);
-	}
-	return 0;
+	if (btrfs_header_backref_rev(root->node) <
+			BTRFS_MIXED_BACKREF_REV)
+		ret = btrfs_drop_snapshot(root, NULL, 0, 0);
+	else
+		ret = btrfs_drop_snapshot(root, NULL, 1, 0);
+	/*
+	 * If we encounter a transaction abort during snapshot cleaning, we
+	 * don't want to crash here
+	 */
+	BUG_ON(ret < 0 && ret != -EAGAIN && ret != -EROFS);
+	return 1;
 }
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0e8aa1e6c28..005b0375d18 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -22,38 +22,61 @@
 #include "delayed-ref.h"
 #include "ctree.h"
 
+enum btrfs_trans_state {
+	TRANS_STATE_RUNNING		= 0,
+	TRANS_STATE_BLOCKED		= 1,
+	TRANS_STATE_COMMIT_START	= 2,
+	TRANS_STATE_COMMIT_DOING	= 3,
+	TRANS_STATE_UNBLOCKED		= 4,
+	TRANS_STATE_COMPLETED		= 5,
+	TRANS_STATE_MAX			= 6,
+};
+
 struct btrfs_transaction {
 	u64 transid;
 	/*
+	 * total external writers(USERSPACE/START/ATTACH) in this
+	 * transaction, it must be zero before the transaction is
+	 * being committed
+	 */
+	atomic_t num_extwriters;
+	/*
 	 * total writers in this transaction, it must be zero before the
 	 * transaction can end
 	 */
 	atomic_t num_writers;
 	atomic_t use_count;
 
-	unsigned long num_joined;
-
-	spinlock_t commit_lock;
-	int in_commit;
-	int commit_done;
-	int blocked;
+	/* Be protected by fs_info->trans_lock when we want to change it. */
+	enum btrfs_trans_state state;
 	struct list_head list;
 	struct extent_io_tree dirty_pages;
 	unsigned long start_time;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 	struct list_head pending_snapshots;
+	struct list_head ordered_operations;
+	struct list_head pending_chunks;
 	struct btrfs_delayed_ref_root delayed_refs;
 	int aborted;
 };
 
-enum btrfs_trans_type {
-	TRANS_START,
-	TRANS_JOIN,
-	TRANS_USERSPACE,
-	TRANS_JOIN_NOLOCK,
-	TRANS_ATTACH,
-};
+#define __TRANS_FREEZABLE	(1U << 0)
+
+#define __TRANS_USERSPACE	(1U << 8)
+#define __TRANS_START		(1U << 9)
+#define __TRANS_ATTACH		(1U << 10)
+#define __TRANS_JOIN		(1U << 11)
+#define __TRANS_JOIN_NOLOCK	(1U << 12)
+
+#define TRANS_USERSPACE		(__TRANS_USERSPACE | __TRANS_FREEZABLE)
+#define TRANS_START		(__TRANS_START | __TRANS_FREEZABLE)
+#define TRANS_ATTACH		(__TRANS_ATTACH)
+#define TRANS_JOIN		(__TRANS_JOIN | __TRANS_FREEZABLE)
+#define TRANS_JOIN_NOLOCK	(__TRANS_JOIN_NOLOCK)
+
+#define TRANS_EXTWRITERS	(__TRANS_USERSPACE | __TRANS_START |	\
+				 __TRANS_ATTACH)
 
 struct btrfs_trans_handle {
 	u64 transid;
@@ -68,7 +91,8 @@ struct btrfs_trans_handle {
 	struct btrfs_block_rsv *orig_rsv;
 	short aborted;
 	short adding_csums;
-	enum btrfs_trans_type type;
+	bool allocating_chunk;
+	unsigned int type;
 	/*
 	 * this root is only needed to validate that the root passed to
 	 * start_transaction is the same as the one passed to end_transaction.
@@ -82,11 +106,13 @@ struct btrfs_trans_handle {
 
 struct btrfs_pending_snapshot {
 	struct dentry *dentry;
+	struct inode *dir;
 	struct btrfs_root *root;
 	struct btrfs_root *snap;
 	struct btrfs_qgroup_inherit *inherit;
 	/* block reservation for the operation */
 	struct btrfs_block_rsv block_rsv;
+	u64 qgroup_reserved;
 	/* extra metadata reseration for relocation */
 	int error;
 	bool readonly;
@@ -110,14 +136,16 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush(
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
+					struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root);
 
 int btrfs_add_dead_root(struct btrfs_root *root);
-int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
-int btrfs_clean_old_snapshots(struct btrfs_root *root);
+int btrfs_defrag_root(struct btrfs_root *root);
+int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
 int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
@@ -140,5 +168,4 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 				struct extent_io_tree *dirty_pages, int mark);
 int btrfs_transaction_blocked(struct btrfs_fs_info *info);
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
-void put_transaction(struct btrfs_transaction *transaction);
 #endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3b580ee8ab1..94e05c1f118 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,13 +23,14 @@
 #include "transaction.h"
 #include "locking.h"
 
-/* defrag all the leaves in a given btree.  If cache_only == 1, don't read
- * things from disk, otherwise read all the leaves and try to get key order to
+/*
+ * Defrag all the leaves in a given btree.
+ * Read all the leaves and try to get key order to
  * better reflect disk order
  */
 
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root, int cache_only)
+			struct btrfs_root *root)
 {
 	struct btrfs_path *path = NULL;
 	struct btrfs_key key;
@@ -41,9 +42,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	u64 last_ret = 0;
 	u64 min_trans = 0;
 
-	if (cache_only)
-		goto out;
-
 	if (root->fs_info->extent_root == root) {
 		/*
 		 * there's recursion here right now in the tree locking,
@@ -86,11 +84,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	}
 
 	path->keep_locks = 1;
-	if (cache_only)
-		min_trans = root->defrag_trans_start;
 
-	ret = btrfs_search_forward(root, &key, NULL, path,
-				   cache_only, min_trans);
+	ret = btrfs_search_forward(root, &key, NULL, path, min_trans);
 	if (ret < 0)
 		goto out;
 	if (ret > 0) {
@@ -109,11 +104,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
-	next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
+	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
 					   min_trans);
 	ret = btrfs_realloc_node(trans, root,
 				 path->nodes[1], 0,
-				 cache_only, &last_ret,
+				 &last_ret,
 				 &root->defrag_progress);
 	if (ret) {
 		WARN_ON(ret == -EAGAIN);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9027bb1e746..2c679149363 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/blkdev.h>
 #include <linux/list_sort.h>
 #include "ctree.h"
 #include "transaction.h"
@@ -277,18 +278,31 @@ static int process_one_buffer(struct btrfs_root *log,
 			      struct extent_buffer *eb,
 			      struct walk_control *wc, u64 gen)
 {
+	int ret = 0;
+
+	/*
+	 * If this fs is mixed then we need to be able to process the leaves to
+	 * pin down any logged extents, so we have to read the block.
+	 */
+	if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
+		ret = btrfs_read_buffer(eb, gen);
+		if (ret)
+			return ret;
+	}
+
 	if (wc->pin)
-		btrfs_pin_extent_for_log_replay(wc->trans,
-						log->fs_info->extent_root,
-						eb->start, eb->len);
+		ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
+						      eb->start, eb->len);
 
-	if (btrfs_buffer_uptodate(eb, gen, 0)) {
+	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
+		if (wc->pin && btrfs_header_level(eb) == 0)
+			ret = btrfs_exclude_logged_extents(log, eb);
 		if (wc->write)
 			btrfs_write_tree_block(eb);
 		if (wc->wait)
 			btrfs_wait_tree_block_writeback(eb);
 	}
-	return 0;
+	return ret;
 }
 
 /*
@@ -318,6 +332,7 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 	unsigned long src_ptr;
 	unsigned long dst_ptr;
 	int overwrite_root = 0;
+	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 
 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 		overwrite_root = 1;
@@ -327,6 +342,9 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 
 	/* look for the key in the destination tree */
 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
 	if (ret == 0) {
 		char *src_copy;
 		char *dst_copy;
@@ -368,6 +386,30 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 			return 0;
 		}
 
+		/*
+		 * We need to load the old nbytes into the inode so when we
+		 * replay the extents we've logged we get the right nbytes.
+		 */
+		if (inode_item) {
+			struct btrfs_inode_item *item;
+			u64 nbytes;
+
+			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					      struct btrfs_inode_item);
+			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
+			item = btrfs_item_ptr(eb, slot,
+					      struct btrfs_inode_item);
+			btrfs_set_inode_nbytes(eb, item, nbytes);
+		}
+	} else if (inode_item) {
+		struct btrfs_inode_item *item;
+
+		/*
+		 * New inode, set nbytes to 0 so that the nbytes comes out
+		 * properly when we replay the extents.
+		 */
+		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
+		btrfs_set_inode_nbytes(eb, item, 0);
 	}
 insert:
 	btrfs_release_path(path);
@@ -381,9 +423,9 @@ insert:
 		found_size = btrfs_item_size_nr(path->nodes[0],
 						path->slots[0]);
 		if (found_size > item_size)
-			btrfs_truncate_item(trans, root, path, item_size, 1);
+			btrfs_truncate_item(root, path, item_size, 1);
 		else if (found_size < item_size)
-			btrfs_extend_item(trans, root, path,
+			btrfs_extend_item(root, path,
 					  item_size - found_size);
 	} else if (ret) {
 		return ret;
@@ -485,10 +527,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 				      struct btrfs_key *key)
 {
 	int found_type;
-	u64 mask = root->sectorsize - 1;
 	u64 extent_end;
 	u64 start = key->offset;
-	u64 saved_nbytes;
+	u64 nbytes = 0;
 	struct btrfs_file_extent_item *item;
 	struct inode *inode = NULL;
 	unsigned long size;
@@ -498,11 +539,20 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	found_type = btrfs_file_extent_type(eb, item);
 
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
-	    found_type == BTRFS_FILE_EXTENT_PREALLOC)
-		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
-	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+		nbytes = btrfs_file_extent_num_bytes(eb, item);
+		extent_end = start + nbytes;
+
+		/*
+		 * We don't add to the inodes nbytes if we are prealloc or a
+		 * hole.
+		 */
+		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+			nbytes = 0;
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		size = btrfs_file_extent_inline_len(eb, item);
-		extent_end = (start + size + mask) & ~mask;
+		nbytes = btrfs_file_extent_ram_bytes(eb, item);
+		extent_end = ALIGN(start + size, root->sectorsize);
 	} else {
 		ret = 0;
 		goto out;
@@ -550,10 +600,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	}
 	btrfs_release_path(path);
 
-	saved_nbytes = inode_get_bytes(inode);
 	/* drop any overlapping extents */
 	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
-	BUG_ON(ret);
+	if (ret)
+		goto out;
 
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -563,7 +613,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 
 		ret = btrfs_insert_empty_item(trans, root, path, key,
 					      sizeof(*item));
-		BUG_ON(ret);
+		if (ret)
+			goto out;
 		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 						    path->slots[0]);
 		copy_extent_buffer(path->nodes[0], eb, dest_offset,
@@ -589,7 +640,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 						ins.objectid, ins.offset,
 						0, root->root_key.objectid,
 						key->objectid, offset, 0);
-				BUG_ON(ret);
+				if (ret)
+					goto out;
 			} else {
 				/*
 				 * insert the extent pointer in the extent
@@ -598,7 +650,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 				ret = btrfs_alloc_logged_file_extent(trans,
 						root, root->root_key.objectid,
 						key->objectid, offset, &ins);
-				BUG_ON(ret);
+				if (ret)
+					goto out;
 			}
 			btrfs_release_path(path);
 
@@ -615,29 +668,33 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 			ret = btrfs_lookup_csums_range(root->log_root,
 						csum_start, csum_end - 1,
 						&ordered_sums, 0);
-			BUG_ON(ret);
+			if (ret)
+				goto out;
 			while (!list_empty(&ordered_sums)) {
 				struct btrfs_ordered_sum *sums;
 				sums = list_entry(ordered_sums.next,
 						struct btrfs_ordered_sum,
 						list);
-				ret = btrfs_csum_file_blocks(trans,
+				if (!ret)
+					ret = btrfs_csum_file_blocks(trans,
 						root->fs_info->csum_root,
 						sums);
-				BUG_ON(ret);
 				list_del(&sums->list);
 				kfree(sums);
 			}
+			if (ret)
+				goto out;
 		} else {
 			btrfs_release_path(path);
 		}
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		/* inline extents are easy, we just overwrite them */
 		ret = overwrite_item(trans, root, path, eb, slot, key);
-		BUG_ON(ret);
+		if (ret)
+			goto out;
 	}
 
-	inode_set_bytes(inode, saved_nbytes);
+	inode_add_bytes(inode, nbytes);
 	ret = btrfs_update_inode(trans, root, inode);
 out:
 	if (inode)
@@ -679,20 +736,21 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 
 	inode = read_one_inode(root, location.objectid);
 	if (!inode) {
-		kfree(name);
-		return -EIO;
+		ret = -EIO;
+		goto out;
 	}
 
 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
-	BUG_ON(ret);
+	if (ret)
+		goto out;
 
 	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
-	BUG_ON(ret);
+	if (ret)
+		goto out;
+	btrfs_run_delayed_items(trans, root);
+out:
 	kfree(name);
-
 	iput(inode);
-
-	btrfs_run_delayed_items(trans, root);
 	return ret;
 }
 
@@ -845,7 +903,8 @@ again:
 			victim_name_len = btrfs_inode_ref_name_len(leaf,
 								   victim_ref);
 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
-			BUG_ON(!victim_name);
+			if (!victim_name)
+				return -ENOMEM;
 
 			read_extent_buffer(leaf, victim_name,
 					   (unsigned long)(victim_ref + 1),
@@ -861,9 +920,10 @@ again:
 				ret = btrfs_unlink_inode(trans, root, dir,
 							 inode, victim_name,
 							 victim_name_len);
-				BUG_ON(ret);
-				btrfs_run_delayed_items(trans, root);
 				kfree(victim_name);
+				if (ret)
+					return ret;
+				btrfs_run_delayed_items(trans, root);
 				*search_done = 1;
 				goto again;
 			}
@@ -871,7 +931,6 @@ again:
 
 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
 		}
-		BUG_ON(ret);
 
 		/*
 		 * NOTE: we have searched root tree and checked the
@@ -905,6 +964,8 @@ again:
 				goto next;
 
 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
+			if (!victim_name)
+				return -ENOMEM;
 			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
 					   victim_name_len);
 
@@ -931,14 +992,16 @@ again:
 								 victim_name_len);
 					btrfs_run_delayed_items(trans, root);
 				}
-				BUG_ON(ret);
 				iput(victim_parent);
 				kfree(victim_name);
+				if (ret)
+					return ret;
 				*search_done = 1;
 				goto again;
 			}
 			kfree(victim_name);
-			BUG_ON(ret);
+			if (ret)
+				return ret;
 next:
 			cur_offset += victim_name_len + sizeof(*extref);
 		}
@@ -951,7 +1014,8 @@ next:
 					 ref_index, name, namelen, 0);
 	if (di && !IS_ERR(di)) {
 		ret = drop_one_dir_item(trans, root, path, dir, di);
-		BUG_ON(ret);
+		if (ret)
+			return ret;
 	}
 	btrfs_release_path(path);
 
@@ -960,7 +1024,8 @@ next:
 				   name, namelen, 0);
 	if (di && !IS_ERR(di)) {
 		ret = drop_one_dir_item(trans, root, path, dir, di);
-		BUG_ON(ret);
+		if (ret)
+			return ret;
 	}
 	btrfs_release_path(path);
 
@@ -1105,15 +1170,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 						      parent_objectid,
 						      ref_index, name, namelen,
 						      &search_done);
-				if (ret == 1)
+				if (ret == 1) {
+					ret = 0;
+					goto out;
+				}
+				if (ret)
 					goto out;
-				BUG_ON(ret);
 			}
 
 			/* insert our name */
 			ret = btrfs_add_link(trans, dir, inode, name, namelen,
 					     0, ref_index);
-			BUG_ON(ret);
+			if (ret)
+				goto out;
 
 			btrfs_update_inode(trans, root, inode);
 		}
@@ -1128,13 +1197,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 
 	/* finally write the back reference in the inode */
 	ret = overwrite_item(trans, root, path, eb, slot, key);
-	BUG_ON(ret);
-
 out:
 	btrfs_release_path(path);
 	iput(dir);
 	iput(inode);
-	return 0;
+	return ret;
 }
 
 static int insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -1292,10 +1359,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 		if (S_ISDIR(inode->i_mode)) {
 			ret = replay_dir_deletes(trans, root, NULL, path,
 						 ino, 1);
-			BUG_ON(ret);
+			if (ret)
+				goto out;
 		}
 		ret = insert_orphan_item(trans, root, ino);
-		BUG_ON(ret);
 	}
 
 out:
@@ -1340,9 +1407,9 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
 			return -EIO;
 
 		ret = fixup_inode_link_count(trans, root, inode);
-		BUG_ON(ret);
-
 		iput(inode);
+		if (ret)
+			goto out;
 
 		/*
 		 * fixup on a directory may create new entries,
@@ -1384,12 +1451,15 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 
 	btrfs_release_path(path);
 	if (ret == 0) {
-		btrfs_inc_nlink(inode);
+		if (!inode->i_nlink)
+			set_nlink(inode, 1);
+		else
+			btrfs_inc_nlink(inode);
 		ret = btrfs_update_inode(trans, root, inode);
 	} else if (ret == -EEXIST) {
 		ret = 0;
 	} else {
-		BUG();
+		BUG(); /* Logic Error */
 	}
 	iput(inode);
 
@@ -1458,7 +1528,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 	struct inode *dir;
 	u8 log_type;
 	int exists;
-	int ret;
+	int ret = 0;
 
 	dir = read_one_inode(root, key->objectid);
 	if (!dir)
@@ -1490,7 +1560,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 						     key->offset, name,
 						     name_len, 1);
 	} else {
-		BUG();
+		/* Corruption */
+		ret = -EINVAL;
+		goto out;
 	}
 	if (IS_ERR_OR_NULL(dst_di)) {
 		/* we need a sequence number to insert, so we only
@@ -1518,7 +1590,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 		goto out;
 
 	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
-	BUG_ON(ret);
+	if (ret)
+		goto out;
 
 	if (key->type == BTRFS_DIR_INDEX_KEY)
 		goto insert;
@@ -1526,14 +1599,15 @@ out:
 	btrfs_release_path(path);
 	kfree(name);
 	iput(dir);
-	return 0;
+	return ret;
 
 insert:
 	btrfs_release_path(path);
 	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
 			      name, name_len, log_type, &log_key);
-
-	BUG_ON(ret && ret != -ENOENT);
+	if (ret && ret != -ENOENT)
+		goto out;
+	ret = 0;
 	goto out;
 }
 
@@ -1564,7 +1638,8 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
 			return -EIO;
 		name_len = btrfs_dir_name_len(eb, di);
 		ret = replay_one_name(trans, root, path, eb, di, key);
-		BUG_ON(ret);
+		if (ret)
+			return ret;
 		ptr = (unsigned long)(di + 1);
 		ptr += name_len;
 	}
@@ -1725,16 +1800,21 @@ again:
 
 			ret = link_to_fixup_dir(trans, root,
 						path, location.objectid);
-			BUG_ON(ret);
+			if (ret) {
+				kfree(name);
+				iput(inode);
+				goto out;
+			}
+
 			btrfs_inc_nlink(inode);
 			ret = btrfs_unlink_inode(trans, root, dir, inode,
 						 name, name_len);
-			BUG_ON(ret);
-
-			btrfs_run_delayed_items(trans, root);
-
+			if (!ret)
+				btrfs_run_delayed_items(trans, root);
 			kfree(name);
 			iput(inode);
+			if (ret)
+				goto out;
 
 			/* there might still be more names under this key
 			 * check and repeat if required
@@ -1838,7 +1918,8 @@ again:
 			ret = check_item_in_log(trans, root, log, path,
 						log_path, dir,
 						&found_key);
-			BUG_ON(ret);
+			if (ret)
+				goto out;
 			if (found_key.offset == (u64)-1)
 				break;
 			dir_key.offset = found_key.offset + 1;
@@ -1915,11 +1996,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 			if (S_ISDIR(mode)) {
 				ret = replay_dir_deletes(wc->trans,
 					 root, log, path, key.objectid, 0);
-				BUG_ON(ret);
+				if (ret)
+					break;
 			}
 			ret = overwrite_item(wc->trans, root, path,
 					     eb, i, &key);
-			BUG_ON(ret);
+			if (ret)
+				break;
 
 			/* for regular files, make sure corresponding
 			 * orhpan item exist. extents past the new EOF
@@ -1928,12 +2011,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 			if (S_ISREG(mode)) {
 				ret = insert_orphan_item(wc->trans, root,
 							 key.objectid);
-				BUG_ON(ret);
+				if (ret)
+					break;
 			}
 
 			ret = link_to_fixup_dir(wc->trans, root,
 						path, key.objectid);
-			BUG_ON(ret);
+			if (ret)
+				break;
 		}
 		if (wc->stage < LOG_WALK_REPLAY_ALL)
 			continue;
@@ -1942,28 +2027,30 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
 			ret = overwrite_item(wc->trans, root, path,
 					     eb, i, &key);
-			BUG_ON(ret);
-		} else if (key.type == BTRFS_INODE_REF_KEY) {
-			ret = add_inode_ref(wc->trans, root, log, path,
-					    eb, i, &key);
-			BUG_ON(ret && ret != -ENOENT);
-		} else if (key.type == BTRFS_INODE_EXTREF_KEY) {
+			if (ret)
+				break;
+		} else if (key.type == BTRFS_INODE_REF_KEY ||
+			   key.type == BTRFS_INODE_EXTREF_KEY) {
 			ret = add_inode_ref(wc->trans, root, log, path,
 					    eb, i, &key);
-			BUG_ON(ret && ret != -ENOENT);
+			if (ret && ret != -ENOENT)
+				break;
+			ret = 0;
 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
 			ret = replay_one_extent(wc->trans, root, path,
 						eb, i, &key);
-			BUG_ON(ret);
+			if (ret)
+				break;
 		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
 			   key.type == BTRFS_DIR_INDEX_KEY) {
 			ret = replay_one_dir_item(wc->trans, root, path,
 						  eb, i, &key);
-			BUG_ON(ret);
+			if (ret)
+				break;
 		}
 	}
 	btrfs_free_path(path);
-	return 0;
+	return ret;
 }
 
 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
@@ -2008,8 +2095,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
 		if (*level == 1) {
 			ret = wc->process_func(root, next, wc, ptr_gen);
-			if (ret)
+			if (ret) {
+				free_extent_buffer(next);
 				return ret;
+			}
 
 			path->slots[*level]++;
 			if (wc->free) {
@@ -2029,7 +2118,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 					BTRFS_TREE_LOG_OBJECTID);
 				ret = btrfs_free_and_pin_reserved_extent(root,
 							 bytenr, blocksize);
-				BUG_ON(ret); /* -ENOMEM or logic errors */
+				if (ret) {
+					free_extent_buffer(next);
+					return ret;
+				}
 			}
 			free_extent_buffer(next);
 			continue;
@@ -2102,7 +2194,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				ret = btrfs_free_and_pin_reserved_extent(root,
 						path->nodes[*level]->start,
 						path->nodes[*level]->len);
-				BUG_ON(ret);
+				if (ret)
+					return ret;
 			}
 			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
@@ -2124,7 +2217,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 	int wret;
 	int level;
 	struct btrfs_path *path;
-	int i;
 	int orig_level;
 
 	path = btrfs_alloc_path();
@@ -2176,17 +2268,12 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 				BTRFS_TREE_LOG_OBJECTID);
 			ret = btrfs_free_and_pin_reserved_extent(log, next->start,
 							 next->len);
-			BUG_ON(ret); /* -ENOMEM or logic errors */
+			if (ret)
+				goto out;
 		}
 	}
 
 out:
-	for (i = 0; i <= orig_level; i++) {
-		if (path->nodes[i]) {
-			free_extent_buffer(path->nodes[i]);
-			path->nodes[i] = NULL;
-		}
-	}
 	btrfs_free_path(path);
 	return ret;
 }
@@ -2279,8 +2366,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
 	unsigned long log_transid = 0;
+	struct blk_plug plug;
 
 	mutex_lock(&root->log_mutex);
+	log_transid = root->log_transid;
 	index1 = root->log_transid % 2;
 	if (atomic_read(&root->log_commit[index1])) {
 		wait_log_commit(trans, root, root->log_transid);
@@ -2308,11 +2397,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	/* bail out if we need to do a full commit */
 	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
 		ret = -EAGAIN;
+		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&root->log_mutex);
 		goto out;
 	}
 
-	log_transid = root->log_transid;
 	if (log_transid % 2 == 0)
 		mark = EXTENT_DIRTY;
 	else
@@ -2321,9 +2410,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	/* we start IO on  all the marked extents here, but we don't actually
 	 * wait for them until later.
 	 */
+	blk_start_plug(&plug);
 	ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
 	if (ret) {
+		blk_finish_plug(&plug);
 		btrfs_abort_transaction(trans, root, ret);
+		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&root->log_mutex);
 		goto out;
 	}
@@ -2356,6 +2448,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	}
 
 	if (ret) {
+		blk_finish_plug(&plug);
 		if (ret != -ENOSPC) {
 			btrfs_abort_transaction(trans, root, ret);
 			mutex_unlock(&log_root_tree->log_mutex);
@@ -2363,6 +2456,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		}
 		root->fs_info->last_trans_log_full_commit = trans->transid;
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
 		ret = -EAGAIN;
 		goto out;
@@ -2370,9 +2464,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	index2 = log_root_tree->log_transid % 2;
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
+		blk_finish_plug(&plug);
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 		wait_log_commit(trans, log_root_tree,
 				log_root_tree->log_transid);
+		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
 		ret = 0;
 		goto out;
@@ -2391,21 +2487,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * check the full commit flag again
 	 */
 	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+		blk_finish_plug(&plug);
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
 		ret = -EAGAIN;
 		goto out_wake_log_root;
 	}
 
-	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
-				&log_root_tree->dirty_log_pages,
-				EXTENT_DIRTY | EXTENT_NEW);
+	ret = btrfs_write_marked_extents(log_root_tree,
+					 &log_root_tree->dirty_log_pages,
+					 EXTENT_DIRTY | EXTENT_NEW);
+	blk_finish_plug(&plug);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
+		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
 		goto out_wake_log_root;
 	}
 	btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+	btrfs_wait_marked_extents(log_root_tree,
+				  &log_root_tree->dirty_log_pages,
+				  EXTENT_NEW | EXTENT_DIRTY);
+	btrfs_wait_logged_extents(log, log_transid);
 
 	btrfs_set_super_log_root(root->fs_info->super_for_commit,
 				log_root_tree->node->start);
@@ -2461,8 +2565,13 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
 		.process_func = process_one_buffer
 	};
 
-	ret = walk_log_tree(trans, log, &wc);
-	BUG_ON(ret);
+	if (trans) {
+		ret = walk_log_tree(trans, log, &wc);
+
+		/* I don't think this can happen but just in case */
+		if (ret)
+			btrfs_abort_transaction(trans, log, ret);
+	}
 
 	while (1) {
 		ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -2475,6 +2584,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
 				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
 	}
 
+	/*
+	 * We may have short-circuited the log tree with the full commit logic
+	 * and left ordered extents on our list, so clear these out to keep us
+	 * from leaking inodes and memory.
+	 */
+	btrfs_free_logged_extents(log, 0);
+	btrfs_free_logged_extents(log, 1);
+
 	free_extent_buffer(log->node);
 	kfree(log);
 }
@@ -2561,7 +2678,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 	if (di) {
 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
 		bytes_del += name_len;
-		BUG_ON(ret);
+		if (ret) {
+			err = ret;
+			goto fail;
+		}
 	}
 	btrfs_release_path(path);
 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
@@ -2573,7 +2693,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 	if (di) {
 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
 		bytes_del += name_len;
-		BUG_ON(ret);
+		if (ret) {
+			err = ret;
+			goto fail;
+		}
 	}
 
 	/* update the directory size in the log to reflect the names
@@ -2724,7 +2847,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	path->keep_locks = 1;
 
 	ret = btrfs_search_forward(root, &min_key, &max_key,
-				   path, 0, trans->transid);
+				   path, trans->transid);
 
 	/*
 	 * we didn't find anything from this transaction, see if there
@@ -2912,7 +3035,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 
 	while (1) {
 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
-		BUG_ON(ret == 0);
+		BUG_ON(ret == 0); /* Logic error */
 		if (ret < 0)
 			break;
 
@@ -3115,7 +3238,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 						log->fs_info->csum_root,
 						ds + cs, ds + cs + cl - 1,
 						&ordered_sums, 0);
-				BUG_ON(ret);
+				if (ret) {
+					btrfs_release_path(dst_path);
+					kfree(ins_data);
+					return ret;
+				}
 			}
 		}
 	}
@@ -3155,115 +3282,6 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
 	return 0;
 }
 
-static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root, struct inode *inode,
-				 struct extent_map *em,
-				 struct btrfs_path *path)
-{
-	struct btrfs_file_extent_item *fi;
-	struct extent_buffer *leaf;
-	struct btrfs_key key, new_key;
-	struct btrfs_map_token token;
-	u64 extent_end;
-	u64 extent_offset = 0;
-	int extent_type;
-	int del_slot = 0;
-	int del_nr = 0;
-	int ret = 0;
-
-	while (1) {
-		btrfs_init_map_token(&token);
-		leaf = path->nodes[0];
-		path->slots[0]++;
-		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
-			if (del_nr) {
-				ret = btrfs_del_items(trans, root, path,
-						      del_slot, del_nr);
-				if (ret)
-					return ret;
-				del_nr = 0;
-			}
-
-			ret = btrfs_next_leaf_write(trans, root, path, 1);
-			if (ret < 0)
-				return ret;
-			if (ret > 0)
-				return 0;
-			leaf = path->nodes[0];
-		}
-
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		if (key.objectid != btrfs_ino(inode) ||
-		    key.type != BTRFS_EXTENT_DATA_KEY ||
-		    key.offset >= em->start + em->len)
-			break;
-
-		fi = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_file_extent_item);
-		extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
-		if (extent_type == BTRFS_FILE_EXTENT_REG ||
-		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
-			extent_offset = btrfs_token_file_extent_offset(leaf,
-								fi, &token);
-			extent_end = key.offset +
-				btrfs_token_file_extent_num_bytes(leaf, fi,
-								  &token);
-		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-			extent_end = key.offset +
-				btrfs_file_extent_inline_len(leaf, fi);
-		} else {
-			BUG();
-		}
-
-		if (extent_end <= em->len + em->start) {
-			if (!del_nr) {
-				del_slot = path->slots[0];
-			}
-			del_nr++;
-			continue;
-		}
-
-		/*
-		 * Ok so we'll ignore previous items if we log a new extent,
-		 * which can lead to overlapping extents, so if we have an
-		 * existing extent we want to adjust we _have_ to check the next
-		 * guy to make sure we even need this extent anymore, this keeps
-		 * us from panicing in set_item_key_safe.
-		 */
-		if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
-			struct btrfs_key tmp_key;
-
-			btrfs_item_key_to_cpu(leaf, &tmp_key,
-					      path->slots[0] + 1);
-			if (tmp_key.objectid == btrfs_ino(inode) &&
-			    tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
-			    tmp_key.offset <= em->start + em->len) {
-				if (!del_nr)
-					del_slot = path->slots[0];
-				del_nr++;
-				continue;
-			}
-		}
-
-		BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
-		memcpy(&new_key, &key, sizeof(new_key));
-		new_key.offset = em->start + em->len;
-		btrfs_set_item_key_safe(trans, root, path, &new_key);
-		extent_offset += em->start + em->len - key.offset;
-		btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
-						   &token);
-		btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
-						      (em->start + em->len),
-						      &token);
-		btrfs_mark_buffer_dirty(leaf);
-	}
-
-	if (del_nr)
-		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
-
-	return ret;
-}
-
 static int log_one_extent(struct btrfs_trans_handle *trans,
 			  struct inode *inode, struct btrfs_root *root,
 			  struct extent_map *em, struct btrfs_path *path)
@@ -3271,31 +3289,38 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_file_extent_item *fi;
 	struct extent_buffer *leaf;
+	struct btrfs_ordered_extent *ordered;
 	struct list_head ordered_sums;
 	struct btrfs_map_token token;
 	struct btrfs_key key;
-	u64 csum_offset = em->mod_start - em->start;
-	u64 csum_len = em->mod_len;
+	u64 mod_start = em->mod_start;
+	u64 mod_len = em->mod_len;
+	u64 csum_offset;
+	u64 csum_len;
 	u64 extent_offset = em->start - em->orig_start;
 	u64 block_len;
 	int ret;
+	int index = log->log_transid % 2;
 	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
+	ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
+				   em->start + em->len, NULL, 0);
+	if (ret)
+		return ret;
+
 	INIT_LIST_HEAD(&ordered_sums);
 	btrfs_init_map_token(&token);
 	key.objectid = btrfs_ino(inode);
 	key.type = BTRFS_EXTENT_DATA_KEY;
 	key.offset = em->start;
-	path->really_keep_locks = 1;
 
 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
-	if (ret && ret != -EEXIST) {
-		path->really_keep_locks = 0;
+	if (ret)
 		return ret;
-	}
 	leaf = path->nodes[0];
 	fi = btrfs_item_ptr(leaf, path->slots[0],
 			    struct btrfs_file_extent_item);
+
 	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
 					       &token);
 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
@@ -3334,22 +3359,14 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 					   em->start - em->orig_start,
 					   &token);
 	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
-	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token);
+	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
 	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
 						&token);
 	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
 	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
 	btrfs_mark_buffer_dirty(leaf);
 
-	/*
-	 * Have to check the extent to the right of us to make sure it doesn't
-	 * fall in our current range.  We're ok if the previous extent is in our
-	 * range since the recovery stuff will run us in key order and thus just
-	 * drop the part we overwrote.
-	 */
-	ret = drop_adjacent_extents(trans, log, inode, em, path);
 	btrfs_release_path(path);
-	path->really_keep_locks = 0;
 	if (ret) {
 		return ret;
 	}
@@ -3362,6 +3379,92 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 		csum_len = block_len;
 	}
 
+	/*
+	 * First check and see if our csums are on our outstanding ordered
+	 * extents.
+	 */
+again:
+	spin_lock_irq(&log->log_extents_lock[index]);
+	list_for_each_entry(ordered, &log->logged_list[index], log_list) {
+		struct btrfs_ordered_sum *sum;
+
+		if (!mod_len)
+			break;
+
+		if (ordered->inode != inode)
+			continue;
+
+		if (ordered->file_offset + ordered->len <= mod_start ||
+		    mod_start + mod_len <= ordered->file_offset)
+			continue;
+
+		/*
+		 * We are going to copy all the csums on this ordered extent, so
+		 * go ahead and adjust mod_start and mod_len in case this
+		 * ordered extent has already been logged.
+		 */
+		if (ordered->file_offset > mod_start) {
+			if (ordered->file_offset + ordered->len >=
+			    mod_start + mod_len)
+				mod_len = ordered->file_offset - mod_start;
+			/*
+			 * If we have this case
+			 *
+			 * |--------- logged extent ---------|
+			 *       |----- ordered extent ----|
+			 *
+			 * Just don't mess with mod_start and mod_len, we'll
+			 * just end up logging more csums than we need and it
+			 * will be ok.
+			 */
+		} else {
+			if (ordered->file_offset + ordered->len <
+			    mod_start + mod_len) {
+				mod_len = (mod_start + mod_len) -
+					(ordered->file_offset + ordered->len);
+				mod_start = ordered->file_offset +
+					ordered->len;
+			} else {
+				mod_len = 0;
+			}
+		}
+
+		/*
+		 * To keep us from looping for the above case of an ordered
+		 * extent that falls inside of the logged extent.
+		 */
+		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
+				     &ordered->flags))
+			continue;
+		atomic_inc(&ordered->refs);
+		spin_unlock_irq(&log->log_extents_lock[index]);
+		/*
+		 * we've dropped the lock, we must either break or
+		 * start over after this.
+		 */
+
+		wait_event(ordered->wait, ordered->csum_bytes_left == 0);
+
+		list_for_each_entry(sum, &ordered->list, list) {
+			ret = btrfs_csum_file_blocks(trans, log, sum);
+			if (ret) {
+				btrfs_put_ordered_extent(ordered);
+				goto unlocked;
+			}
+		}
+		btrfs_put_ordered_extent(ordered);
+		goto again;
+
+	}
+	spin_unlock_irq(&log->log_extents_lock[index]);
+unlocked:
+
+	if (!mod_len || ret)
+		return ret;
+
+	csum_offset = mod_start - em->start;
+	csum_len = mod_len;
+
 	/* block start is already adjusted for the file extent offset. */
 	ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
 				       em->block_start + csum_offset,
@@ -3393,6 +3496,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
 	u64 test_gen;
 	int ret = 0;
+	int num = 0;
 
 	INIT_LIST_HEAD(&extents);
 
@@ -3401,16 +3505,31 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 
 	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
 		list_del_init(&em->list);
+
+		/*
+		 * Just an arbitrary number, this can be really CPU intensive
+		 * once we start getting a lot of extents, and really once we
+		 * have a bunch of extents we just want to commit since it will
+		 * be faster.
+		 */
+		if (++num > 32768) {
+			list_del_init(&tree->modified_extents);
+			ret = -EFBIG;
+			goto process;
+		}
+
 		if (em->generation <= test_gen)
 			continue;
 		/* Need a ref to keep it from getting evicted from cache */
 		atomic_inc(&em->refs);
 		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
 		list_add_tail(&em->list, &extents);
+		num++;
 	}
 
 	list_sort(NULL, &extents, extent_cmp);
 
+process:
 	while (!list_empty(&extents)) {
 		em = list_entry(extents.next, struct extent_map, list);
 
@@ -3472,8 +3591,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	bool fast_search = false;
 	u64 ino = btrfs_ino(inode);
 
-	log = root->log_root;
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -3513,6 +3630,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&BTRFS_I(inode)->log_mutex);
 
+	btrfs_get_logged_extents(log, inode);
+
 	/*
 	 * a brute force approach to making sure we get the most uptodate
 	 * copies of everything.
@@ -3558,7 +3677,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	while (1) {
 		ins_nr = 0;
 		ret = btrfs_search_forward(root, &min_key, &max_key,
-					   path, 0, trans->transid);
+					   path, trans->transid);
 		if (ret != 0)
 			break;
 again:
@@ -3656,6 +3775,8 @@ log_extents:
 	BTRFS_I(inode)->logged_trans = trans->transid;
 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
 out_unlock:
+	if (err)
+		btrfs_free_logged_extents(log, log->log_transid);
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 
 	btrfs_free_path(path);
@@ -3736,9 +3857,9 @@ out:
  * only logging is done of any parent directories that are older than
  * the last committed transaction
  */
-int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
-		    struct btrfs_root *root, struct inode *inode,
-		    struct dentry *parent, int exists_only)
+static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+			    	  struct btrfs_root *root, struct inode *inode,
+			    	  struct dentry *parent, int exists_only)
 {
 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
 	struct super_block *sb;
@@ -3822,7 +3943,6 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 end_trans:
 	dput(old_parent);
 	if (ret < 0) {
-		WARN_ON(ret != -ENOSPC);
 		root->fs_info->last_trans_log_full_commit = trans->transid;
 		ret = 1;
 	}
@@ -3914,8 +4034,7 @@ again:
 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 			break;
 
-		log = btrfs_read_fs_root_no_radix(log_root_tree,
-						  &found_key);
+		log = btrfs_read_fs_root(log_root_tree, &found_key);
 		if (IS_ERR(log)) {
 			ret = PTR_ERR(log);
 			btrfs_error(fs_info, ret,
@@ -3930,6 +4049,9 @@ again:
 		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
 		if (IS_ERR(wc.replay_dest)) {
 			ret = PTR_ERR(wc.replay_dest);
+			free_extent_buffer(log->node);
+			free_extent_buffer(log->commit_root);
+			kfree(log);
 			btrfs_error(fs_info, ret, "Couldn't read target root "
 				    "for tree log recovery.");
 			goto error;
@@ -3938,12 +4060,10 @@ again:
 		wc.replay_dest->log_root = log;
 		btrfs_record_root_in_trans(trans, wc.replay_dest);
 		ret = walk_log_tree(trans, log, &wc);
-		BUG_ON(ret);
 
-		if (wc.stage == LOG_WALK_REPLAY_ALL) {
+		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
 						      path);
-			BUG_ON(ret);
 		}
 
 		key.offset = found_key.offset - 1;
@@ -3952,6 +4072,9 @@ again:
 		free_extent_buffer(log->commit_root);
 		kfree(log);
 
+		if (ret)
+			goto error;
+
 		if (found_key.offset == 0)
 			break;
 	}
@@ -3972,17 +4095,20 @@ again:
 
 	btrfs_free_path(path);
 
+	/* step 4: commit the transaction, which also unpins the blocks */
+	ret = btrfs_commit_transaction(trans, fs_info->tree_root);
+	if (ret)
+		return ret;
+
 	free_extent_buffer(log_root_tree->node);
 	log_root_tree->log_root = NULL;
 	fs_info->log_root_recovering = 0;
-
-	/* step 4: commit the transaction, which also unpins the blocks */
-	btrfs_commit_transaction(trans, fs_info->tree_root);
-
 	kfree(log_root_tree);
-	return 0;
 
+	return 0;
 error:
+	if (wc.trans)
+		btrfs_end_transaction(wc.trans, fs_info->tree_root);
 	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 862ac813f6b..1d4ae0d15a7 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -40,9 +40,6 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
 			       struct inode *inode, u64 dirid);
 void btrfs_end_log_trans(struct btrfs_root *root);
 int btrfs_pin_log_trans(struct btrfs_root *root);
-int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
-		    struct btrfs_root *root, struct inode *inode,
-		    struct dentry *parent, int exists_only);
 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
 			     struct inode *dir, struct inode *inode,
 			     int for_rename);
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 99be4c138db..b0a523b2c60 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -5,7 +5,7 @@
  */
 
 #include <linux/slab.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include "ulist.h"
 
 /*
@@ -53,6 +53,7 @@ void ulist_init(struct ulist *ulist)
 	ulist->nnodes = 0;
 	ulist->nodes = ulist->int_nodes;
 	ulist->nodes_alloced = ULIST_SIZE;
+	ulist->root = RB_ROOT;
 }
 EXPORT_SYMBOL(ulist_init);
 
@@ -72,6 +73,7 @@ void ulist_fini(struct ulist *ulist)
 	if (ulist->nodes_alloced > ULIST_SIZE)
 		kfree(ulist->nodes);
 	ulist->nodes_alloced = 0;	/* in case ulist_fini is called twice */
+	ulist->root = RB_ROOT;
 }
 EXPORT_SYMBOL(ulist_fini);
 
@@ -123,6 +125,45 @@ void ulist_free(struct ulist *ulist)
 }
 EXPORT_SYMBOL(ulist_free);
 
+static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
+{
+	struct rb_node *n = ulist->root.rb_node;
+	struct ulist_node *u = NULL;
+
+	while (n) {
+		u = rb_entry(n, struct ulist_node, rb_node);
+		if (u->val < val)
+			n = n->rb_right;
+		else if (u->val > val)
+			n = n->rb_left;
+		else
+			return u;
+	}
+	return NULL;
+}
+
+static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
+{
+	struct rb_node **p = &ulist->root.rb_node;
+	struct rb_node *parent = NULL;
+	struct ulist_node *cur = NULL;
+
+	while (*p) {
+		parent = *p;
+		cur = rb_entry(parent, struct ulist_node, rb_node);
+
+		if (cur->val < ins->val)
+			p = &(*p)->rb_right;
+		else if (cur->val > ins->val)
+			p = &(*p)->rb_left;
+		else
+			return -EEXIST;
+	}
+	rb_link_node(&ins->rb_node, parent, p);
+	rb_insert_color(&ins->rb_node, &ulist->root);
+	return 0;
+}
+
 /**
  * ulist_add - add an element to the ulist
  * @ulist:	ulist to add the element to
@@ -151,20 +192,23 @@ int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
 int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
 		    u64 *old_aux, gfp_t gfp_mask)
 {
-	int i;
-
-	for (i = 0; i < ulist->nnodes; ++i) {
-		if (ulist->nodes[i].val == val) {
-			if (old_aux)
-				*old_aux = ulist->nodes[i].aux;
-			return 0;
-		}
+	int ret = 0;
+	struct ulist_node *node = NULL;
+	node = ulist_rbtree_search(ulist, val);
+	if (node) {
+		if (old_aux)
+			*old_aux = node->aux;
+		return 0;
 	}
 
 	if (ulist->nnodes >= ulist->nodes_alloced) {
 		u64 new_alloced = ulist->nodes_alloced + 128;
 		struct ulist_node *new_nodes;
 		void *old = NULL;
+		int i;
+
+		for (i = 0; i < ulist->nnodes; i++)
+			rb_erase(&ulist->nodes[i].rb_node, &ulist->root);
 
 		/*
 		 * if nodes_alloced == ULIST_SIZE no memory has been allocated
@@ -184,9 +228,22 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
 
 		ulist->nodes = new_nodes;
 		ulist->nodes_alloced = new_alloced;
+
+		/*
+		 * krealloc actually uses memcpy, which does not copy rb_node
+		 * pointers, so we have to do it ourselves.  Otherwise we may
+		 * be bitten by crashes.
+		 */
+		for (i = 0; i < ulist->nnodes; i++) {
+			ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
+			if (ret < 0)
+				return ret;
+		}
 	}
 	ulist->nodes[ulist->nnodes].val = val;
 	ulist->nodes[ulist->nnodes].aux = aux;
+	ret = ulist_rbtree_insert(ulist, &ulist->nodes[ulist->nnodes]);
+	BUG_ON(ret);
 	++ulist->nnodes;
 
 	return 1;
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 21a1963439c..fb36731074b 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -8,6 +8,9 @@
 #ifndef __ULIST__
 #define __ULIST__
 
+#include <linux/list.h>
+#include <linux/rbtree.h>
+
 /*
  * ulist is a generic data structure to hold a collection of unique u64
  * values. The only operations it supports is adding to the list and
@@ -34,6 +37,7 @@ struct ulist_iterator {
 struct ulist_node {
 	u64 val;		/* value to store */
 	u64 aux;		/* auxiliary value saved along with the val */
+	struct rb_node rb_node;	/* used to speed up search */
 };
 
 struct ulist {
@@ -54,6 +58,8 @@ struct ulist {
 	 */
 	struct ulist_node *nodes;
 
+	struct rb_root root;
+
 	/*
 	 * inline storage space for the first ULIST_SIZE entries
 	 */
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
deleted file mode 100644
index 9bf3946d5ef..00000000000
--- a/fs/btrfs/version.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __BTRFS_VERSION_H
-#define __BTRFS_VERSION_H
-#define BTRFS_BUILD_VERSION "Btrfs"
-#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cbb7f4b167..78b871753cb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,6 +25,8 @@
 #include <linux/capability.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
@@ -32,6 +34,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "raid56.h"
 #include "async-thread.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
@@ -43,6 +46,7 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
 				struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
 static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 
 static DEFINE_MUTEX(uuid_mutex);
@@ -647,6 +651,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		new_device->writeable = 0;
 		new_device->in_fs_metadata = 0;
 		new_device->can_discard = 0;
+		spin_lock_init(&new_device->io_lock);
 		list_replace_rcu(&device->dev_list, &new_device->dev_list);
 
 		call_rcu(&device->rcu, free_device);
@@ -680,6 +685,12 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		__btrfs_close_devices(fs_devices);
 		free_fs_devices(fs_devices);
 	}
+	/*
+	 * Wait for rcu kworkers under __btrfs_close_devices
+	 * to finish all blkdev_puts so device is really
+	 * free when umount is done.
+	 */
+	rcu_barrier();
 	return ret;
 }
 
@@ -707,9 +718,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		if (!device->name)
 			continue;
 
-		ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
-					    &bdev, &bh);
-		if (ret)
+		/* Just open everything we can; ignore failures here */
+		if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+					    &bdev, &bh))
 			continue;
 
 		disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -792,26 +803,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	return ret;
 }
 
+/*
+ * Look for a btrfs signature on a device. This may be called out of the mount path
+ * and we are not allowed to call set_blocksize during the scan. The superblock
+ * is read via pagecache
+ */
 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret)
 {
 	struct btrfs_super_block *disk_super;
 	struct block_device *bdev;
-	struct buffer_head *bh;
-	int ret;
+	struct page *page;
+	void *p;
+	int ret = -EINVAL;
 	u64 devid;
 	u64 transid;
 	u64 total_devices;
+	u64 bytenr;
+	pgoff_t index;
 
+	/*
+	 * we would like to check all the supers, but that would make
+	 * a btrfs mount succeed after a mkfs from a different FS.
+	 * So, we need to add a special mount option to scan for
+	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+	 */
+	bytenr = btrfs_sb_offset(0);
 	flags |= FMODE_EXCL;
 	mutex_lock(&uuid_mutex);
-	ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
-	if (ret)
+
+	bdev = blkdev_get_by_path(path, flags, holder);
+
+	if (IS_ERR(bdev)) {
+		ret = PTR_ERR(bdev);
 		goto error;
-	disk_super = (struct btrfs_super_block *)bh->b_data;
+	}
+
+	/* make sure our super fits in the device */
+	if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
+		goto error_bdev_put;
+
+	/* make sure our super fits in the page */
+	if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
+		goto error_bdev_put;
+
+	/* make sure our super doesn't straddle pages on disk */
+	index = bytenr >> PAGE_CACHE_SHIFT;
+	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
+		goto error_bdev_put;
+
+	/* pull in the page with our super */
+	page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+				   index, GFP_NOFS);
+
+	if (IS_ERR_OR_NULL(page))
+		goto error_bdev_put;
+
+	p = kmap(page);
+
+	/* align our pointer to the offset of the super block */
+	disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
+
+	if (btrfs_super_bytenr(disk_super) != bytenr ||
+	    disk_super->magic != cpu_to_le64(BTRFS_MAGIC))
+		goto error_unmap;
+
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
 	transid = btrfs_super_generation(disk_super);
 	total_devices = btrfs_super_num_devices(disk_super);
+
 	if (disk_super->label[0]) {
 		if (disk_super->label[BTRFS_LABEL_SIZE - 1])
 			disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
@@ -819,12 +879,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	} else {
 		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
 	}
+
 	printk(KERN_CONT "devid %llu transid %llu %s\n",
 	       (unsigned long long)devid, (unsigned long long)transid, path);
+
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 	if (!ret && fs_devices_ret)
 		(*fs_devices_ret)->total_devices = total_devices;
-	brelse(bh);
+
+error_unmap:
+	kunmap(page);
+	page_cache_release(page);
+
+error_bdev_put:
 	blkdev_put(bdev, flags);
 error:
 	mutex_unlock(&uuid_mutex);
@@ -915,6 +982,35 @@ out:
 	return ret;
 }
 
+static int contains_pending_extent(struct btrfs_trans_handle *trans,
+				   struct btrfs_device *device,
+				   u64 *start, u64 len)
+{
+	struct extent_map *em;
+	int ret = 0;
+
+	list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
+		struct map_lookup *map;
+		int i;
+
+		map = (struct map_lookup *)em->bdev;
+		for (i = 0; i < map->num_stripes; i++) {
+			if (map->stripes[i].dev != device)
+				continue;
+			if (map->stripes[i].physical >= *start + len ||
+			    map->stripes[i].physical + em->orig_block_len <=
+			    *start)
+				continue;
+			*start = map->stripes[i].physical +
+				em->orig_block_len;
+			ret = 1;
+		}
+	}
+
+	return ret;
+}
+
+
 /*
  * find_free_dev_extent - find free space in the specified device
  * @device:	the device which we search the free space in
@@ -935,7 +1031,8 @@ out:
  * But if we don't find suitable free space, it is used to store the size of
  * the max free space.
  */
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+			 struct btrfs_device *device, u64 num_bytes,
 			 u64 *start, u64 *len)
 {
 	struct btrfs_key key;
@@ -959,21 +1056,22 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
 	 */
 	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
 
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+again:
 	max_hole_start = search_start;
 	max_hole_size = 0;
 	hole_size = 0;
 
 	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
 		ret = -ENOSPC;
-		goto error;
+		goto out;
 	}
 
-	path = btrfs_alloc_path();
-	if (!path) {
-		ret = -ENOMEM;
-		goto error;
-	}
 	path->reada = 2;
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
 
 	key.objectid = device->devid;
 	key.offset = search_start;
@@ -1014,6 +1112,15 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
 		if (key.offset > search_start) {
 			hole_size = key.offset - search_start;
 
+			/*
+			 * Have to check before we set max_hole_start, otherwise
+			 * we could end up sending back this offset anyway.
+			 */
+			if (contains_pending_extent(trans, device,
+						    &search_start,
+						    hole_size))
+				hole_size = 0;
+
 			if (hole_size > max_hole_size) {
 				max_hole_start = search_start;
 				max_hole_size = hole_size;
@@ -1057,6 +1164,11 @@ next:
 		max_hole_size = hole_size;
 	}
 
+	if (contains_pending_extent(trans, device, &search_start, hole_size)) {
+		btrfs_release_path(path);
+		goto again;
+	}
+
 	/* See above. */
 	if (hole_size < num_bytes)
 		ret = -ENOSPC;
@@ -1065,7 +1177,6 @@ next:
 
 out:
 	btrfs_free_path(path);
-error:
 	*start = max_hole_start;
 	if (len)
 		*len = max_hole_size;
@@ -1133,10 +1244,10 @@ out:
 	return ret;
 }
 
-int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
-			   struct btrfs_device *device,
-			   u64 chunk_tree, u64 chunk_objectid,
-			   u64 chunk_offset, u64 start, u64 num_bytes)
+static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+				  struct btrfs_device *device,
+				  u64 chunk_tree, u64 chunk_objectid,
+				  u64 chunk_offset, u64 start, u64 num_bytes)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1177,47 +1288,22 @@ out:
 	return ret;
 }
 
-static noinline int find_next_chunk(struct btrfs_root *root,
-				    u64 objectid, u64 *offset)
+static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_path *path;
-	int ret;
-	struct btrfs_key key;
-	struct btrfs_chunk *chunk;
-	struct btrfs_key found_key;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	key.objectid = objectid;
-	key.offset = (u64)-1;
-	key.type = BTRFS_CHUNK_ITEM_KEY;
-
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	if (ret < 0)
-		goto error;
-
-	BUG_ON(ret == 0); /* Corruption */
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	struct rb_node *n;
+	u64 ret = 0;
 
-	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
-	if (ret) {
-		*offset = 0;
-	} else {
-		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-				      path->slots[0]);
-		if (found_key.objectid != objectid)
-			*offset = 0;
-		else {
-			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
-					       struct btrfs_chunk);
-			*offset = found_key.offset +
-				btrfs_chunk_length(path->nodes[0], chunk);
-		}
+	em_tree = &fs_info->mapping_tree.map_tree;
+	read_lock(&em_tree->lock);
+	n = rb_last(&em_tree->map);
+	if (n) {
+		em = rb_entry(n, struct extent_map, rb_node);
+		ret = em->start + em->len;
 	}
-	ret = 0;
-error:
-	btrfs_free_path(path);
+	read_unlock(&em_tree->lock);
+
 	return ret;
 }
 
@@ -1263,9 +1349,9 @@ error:
  * the device information is stored in the chunk root
  * the btrfs_device struct should be fully filled in
  */
-int btrfs_add_device(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root,
-		     struct btrfs_device *device)
+static int btrfs_add_device(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_device *device)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1372,14 +1458,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	u64 devid;
 	u64 num_devices;
 	u8 *dev_uuid;
+	unsigned seq;
 	int ret = 0;
 	bool clear_super = false;
 
 	mutex_lock(&uuid_mutex);
 
-	all_avail = root->fs_info->avail_data_alloc_bits |
-		root->fs_info->avail_system_alloc_bits |
-		root->fs_info->avail_metadata_alloc_bits;
+	do {
+		seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+		all_avail = root->fs_info->avail_data_alloc_bits |
+			    root->fs_info->avail_system_alloc_bits |
+			    root->fs_info->avail_metadata_alloc_bits;
+	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
 
 	num_devices = root->fs_info->fs_devices->num_devices;
 	btrfs_dev_replace_lock(&root->fs_info->dev_replace);
@@ -1390,16 +1481,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
-		printk(KERN_ERR "btrfs: unable to go below four devices "
-		       "on raid10\n");
-		ret = -EINVAL;
+		ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
 		goto out;
 	}
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
-		printk(KERN_ERR "btrfs: unable to go below two "
-		       "devices on raid1\n");
-		ret = -EINVAL;
+		ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
+		goto out;
+	}
+
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
+	    root->fs_info->fs_devices->rw_devices <= 2) {
+		ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
+		goto out;
+	}
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
+	    root->fs_info->fs_devices->rw_devices <= 3) {
+		ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
 		goto out;
 	}
 
@@ -1425,8 +1523,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		bh = NULL;
 		disk_super = NULL;
 		if (!device) {
-			printk(KERN_ERR "btrfs: no missing devices found to "
-			       "remove\n");
+			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
 			goto out;
 		}
 	} else {
@@ -1448,15 +1545,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 
 	if (device->is_tgtdev_for_dev_replace) {
-		pr_err("btrfs: unable to remove the dev_replace target dev\n");
-		ret = -EINVAL;
+		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
 		goto error_brelse;
 	}
 
 	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
-		printk(KERN_ERR "btrfs: unable to remove the only writeable "
-		       "device\n");
-		ret = -EINVAL;
+		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
 		goto error_brelse;
 	}
 
@@ -1624,8 +1718,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 }
 
-int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
-			      struct btrfs_device **device)
+static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
+				     struct btrfs_device **device)
 {
 	int ret = 0;
 	struct btrfs_super_block *disk_super;
@@ -2299,7 +2393,11 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 		return ret;
 
 	trans = btrfs_start_transaction(root, 0);
-	BUG_ON(IS_ERR(trans));
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		btrfs_std_error(root->fs_info, ret);
+		return ret;
+	}
 
 	lock_chunks(root);
 
@@ -2616,7 +2714,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
 	chunk_used = btrfs_block_group_used(&cache->item);
 
 	if (bargs->usage == 0)
-		user_thresh = 0;
+		user_thresh = 1;
 	else if (bargs->usage > 100)
 		user_thresh = cache->key.offset;
 	else
@@ -2664,11 +2762,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
 		return 0;
 
 	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
-	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
-		factor = 2;
-	else
-		factor = 1;
-	factor = num_stripes / factor;
+	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
+		factor = num_stripes / 2;
+	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
+		factor = num_stripes - 1;
+	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
+		factor = num_stripes - 2;
+	} else {
+		factor = num_stripes;
+	}
 
 	for (i = 0; i < num_stripes; i++) {
 		stripe = btrfs_stripe_nr(chunk, i);
@@ -2966,7 +3068,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
 
 	unset_balance_control(fs_info);
 	ret = del_balance_item(fs_info->tree_root);
-	BUG_ON(ret);
+	if (ret)
+		btrfs_std_error(fs_info, ret);
 
 	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
 }
@@ -2985,6 +3088,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	int mixed = 0;
 	int ret;
 	u64 num_devices;
+	unsigned seq;
 
 	if (btrfs_fs_closing(fs_info) ||
 	    atomic_read(&fs_info->balance_pause_req) ||
@@ -3023,12 +3127,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
 	if (num_devices == 1)
 		allowed |= BTRFS_BLOCK_GROUP_DUP;
-	else if (num_devices < 4)
+	else if (num_devices > 1)
 		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
-	else
-		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
-				BTRFS_BLOCK_GROUP_RAID10);
-
+	if (num_devices > 2)
+		allowed |= BTRFS_BLOCK_GROUP_RAID5;
+	if (num_devices > 3)
+		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
+			    BTRFS_BLOCK_GROUP_RAID6);
 	if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
 	    (!alloc_profile_is_valid(bctl->data.target, 1) ||
 	     (bctl->data.target & ~allowed))) {
@@ -3067,23 +3172,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 
 	/* allow to reduce meta or sys integrity only if force set */
 	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
-			BTRFS_BLOCK_GROUP_RAID10;
-	if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-	     (fs_info->avail_system_alloc_bits & allowed) &&
-	     !(bctl->sys.target & allowed)) ||
-	    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-	     (fs_info->avail_metadata_alloc_bits & allowed) &&
-	     !(bctl->meta.target & allowed))) {
-		if (bctl->flags & BTRFS_BALANCE_FORCE) {
-			printk(KERN_INFO "btrfs: force reducing metadata "
-			       "integrity\n");
-		} else {
-			printk(KERN_ERR "btrfs: balance will reduce metadata "
-			       "integrity, use force if you want this\n");
-			ret = -EINVAL;
-			goto out;
+			BTRFS_BLOCK_GROUP_RAID10 |
+			BTRFS_BLOCK_GROUP_RAID5 |
+			BTRFS_BLOCK_GROUP_RAID6;
+	do {
+		seq = read_seqbegin(&fs_info->profiles_lock);
+
+		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		     (fs_info->avail_system_alloc_bits & allowed) &&
+		     !(bctl->sys.target & allowed)) ||
+		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		     (fs_info->avail_metadata_alloc_bits & allowed) &&
+		     !(bctl->meta.target & allowed))) {
+			if (bctl->flags & BTRFS_BALANCE_FORCE) {
+				printk(KERN_INFO "btrfs: force reducing metadata "
+				       "integrity\n");
+			} else {
+				printk(KERN_ERR "btrfs: balance will reduce metadata "
+				       "integrity, use force if you want this\n");
+				ret = -EINVAL;
+				goto out;
+			}
 		}
-	}
+	} while (read_seqretry(&fs_info->profiles_lock, seq));
 
 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 		int num_tolerated_disk_barrier_failures;
@@ -3127,6 +3238,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	mutex_lock(&fs_info->balance_mutex);
 	atomic_dec(&fs_info->balance_running);
 
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		fs_info->num_tolerated_disk_barrier_failures =
+			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+	}
+
 	if (bargs) {
 		memset(bargs, 0, sizeof(*bargs));
 		update_ioctl_balance_args(fs_info, 0, bargs);
@@ -3137,11 +3253,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		__cancel_balance(fs_info);
 	}
 
-	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
-		fs_info->num_tolerated_disk_barrier_failures =
-			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
-	}
-
 	wake_up(&fs_info->balance_wait_q);
 
 	return ret;
@@ -3191,10 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
 	}
 
 	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
-	if (IS_ERR(tsk))
-		return PTR_ERR(tsk);
-
-	return 0;
+	return PTR_RET(tsk);
 }
 
 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
@@ -3503,19 +3611,82 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
 	return 0;
 }
 
-struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
-	{ 2, 1, 0, 4, 2, 2 /* raid10 */ },
-	{ 1, 1, 2, 2, 2, 2 /* raid1 */ },
-	{ 1, 2, 1, 1, 1, 2 /* dup */ },
-	{ 1, 1, 0, 2, 1, 1 /* raid0 */ },
-	{ 1, 1, 1, 1, 1, 1 /* single */ },
+static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+	[BTRFS_RAID_RAID10] = {
+		.sub_stripes	= 2,
+		.dev_stripes	= 1,
+		.devs_max	= 0,	/* 0 == as many as possible */
+		.devs_min	= 4,
+		.devs_increment	= 2,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID1] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 2,
+		.devs_min	= 2,
+		.devs_increment	= 2,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_DUP] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 2,
+		.devs_max	= 1,
+		.devs_min	= 1,
+		.devs_increment	= 1,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID0] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 2,
+		.devs_increment	= 1,
+		.ncopies	= 1,
+	},
+	[BTRFS_RAID_SINGLE] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 1,
+		.devs_min	= 1,
+		.devs_increment	= 1,
+		.ncopies	= 1,
+	},
+	[BTRFS_RAID_RAID5] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 2,
+		.devs_increment	= 1,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID6] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 3,
+		.devs_increment	= 1,
+		.ncopies	= 3,
+	},
 };
 
+static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
+{
+	/* TODO allow them to set a preferred stripe size */
+	return 64 * 1024;
+}
+
+static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+	if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+		return;
+
+	btrfs_set_fs_incompat(info, RAID56);
+}
+
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *extent_root,
-			       struct map_lookup **map_ret,
-			       u64 *num_bytes_out, u64 *stripe_size_out,
-			       u64 start, u64 type)
+			       struct btrfs_root *extent_root, u64 start,
+			       u64 type)
 {
 	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
@@ -3526,6 +3697,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct btrfs_device_info *devices_info = NULL;
 	u64 total_avail;
 	int num_stripes;	/* total number of stripes to allocate */
+	int data_stripes;	/* number of stripes that count for
+				   block group size */
 	int sub_stripes;	/* sub_stripes info for map */
 	int dev_stripes;	/* stripes per dev */
 	int devs_max;		/* max devs to use */
@@ -3537,6 +3710,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 max_chunk_size;
 	u64 stripe_size;
 	u64 num_bytes;
+	u64 raid_stripe_len = BTRFS_STRIPE_LEN;
 	int ndevs;
 	int i;
 	int j;
@@ -3619,7 +3793,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		if (total_avail == 0)
 			continue;
 
-		ret = find_free_dev_extent(device,
+		ret = find_free_dev_extent(trans, device,
 					   max_stripe_size * dev_stripes,
 					   &dev_offset, &max_avail);
 		if (ret && ret != -ENOSPC)
@@ -3631,12 +3805,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
 			continue;
 
+		if (ndevs == fs_devices->rw_devices) {
+			WARN(1, "%s: found more than %llu devices\n",
+			     __func__, fs_devices->rw_devices);
+			break;
+		}
 		devices_info[ndevs].dev_offset = dev_offset;
 		devices_info[ndevs].max_avail = max_avail;
 		devices_info[ndevs].total_avail = total_avail;
 		devices_info[ndevs].dev = device;
 		++ndevs;
-		WARN_ON(ndevs > fs_devices->rw_devices);
 	}
 
 	/*
@@ -3662,16 +3840,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	stripe_size = devices_info[ndevs-1].max_avail;
 	num_stripes = ndevs * dev_stripes;
 
-	if (stripe_size * ndevs > max_chunk_size * ncopies) {
-		stripe_size = max_chunk_size * ncopies;
-		do_div(stripe_size, ndevs);
+	/*
+	 * this will have to be fixed for RAID1 and RAID10 over
+	 * more drives
+	 */
+	data_stripes = num_stripes / ncopies;
+
+	if (type & BTRFS_BLOCK_GROUP_RAID5) {
+		raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
+				 btrfs_super_stripesize(info->super_copy));
+		data_stripes = num_stripes - 1;
+	}
+	if (type & BTRFS_BLOCK_GROUP_RAID6) {
+		raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
+				 btrfs_super_stripesize(info->super_copy));
+		data_stripes = num_stripes - 2;
+	}
+
+	/*
+	 * Use the number of data stripes to figure out how big this chunk
+	 * is really going to be in terms of logical address space,
+	 * and compare that answer with the max chunk size
+	 */
+	if (stripe_size * data_stripes > max_chunk_size) {
+		u64 mask = (1ULL << 24) - 1;
+		stripe_size = max_chunk_size;
+		do_div(stripe_size, data_stripes);
+
+		/* bump the answer up to a 16MB boundary */
+		stripe_size = (stripe_size + mask) & ~mask;
+
+		/* but don't go higher than the limits we found
+		 * while searching for free extents
+		 */
+		if (stripe_size > devices_info[ndevs-1].max_avail)
+			stripe_size = devices_info[ndevs-1].max_avail;
 	}
 
 	do_div(stripe_size, dev_stripes);
 
 	/* align to BTRFS_STRIPE_LEN */
-	do_div(stripe_size, BTRFS_STRIPE_LEN);
-	stripe_size *= BTRFS_STRIPE_LEN;
+	do_div(stripe_size, raid_stripe_len);
+	stripe_size *= raid_stripe_len;
 
 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
@@ -3689,17 +3899,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		}
 	}
 	map->sector_size = extent_root->sectorsize;
-	map->stripe_len = BTRFS_STRIPE_LEN;
-	map->io_align = BTRFS_STRIPE_LEN;
-	map->io_width = BTRFS_STRIPE_LEN;
+	map->stripe_len = raid_stripe_len;
+	map->io_align = raid_stripe_len;
+	map->io_width = raid_stripe_len;
 	map->type = type;
 	map->sub_stripes = sub_stripes;
 
-	*map_ret = map;
-	num_bytes = stripe_size * (num_stripes / ncopies);
-
-	*stripe_size_out = stripe_size;
-	*num_bytes_out = num_bytes;
+	num_bytes = stripe_size * data_stripes;
 
 	trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
 
@@ -3713,74 +3919,110 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	em->len = num_bytes;
 	em->block_start = 0;
 	em->block_len = em->len;
+	em->orig_block_len = stripe_size;
 
 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em);
+	ret = add_extent_mapping(em_tree, em, 0);
+	if (!ret) {
+		list_add_tail(&em->list, &trans->transaction->pending_chunks);
+		atomic_inc(&em->refs);
+	}
 	write_unlock(&em_tree->lock);
-	free_extent_map(em);
-	if (ret)
+	if (ret) {
+		free_extent_map(em);
 		goto error;
+	}
 
 	ret = btrfs_make_block_group(trans, extent_root, 0, type,
 				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 				     start, num_bytes);
 	if (ret)
-		goto error;
-
-	for (i = 0; i < map->num_stripes; ++i) {
-		struct btrfs_device *device;
-		u64 dev_offset;
-
-		device = map->stripes[i].dev;
-		dev_offset = map->stripes[i].physical;
+		goto error_del_extent;
 
-		ret = btrfs_alloc_dev_extent(trans, device,
-				info->chunk_root->root_key.objectid,
-				BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-				start, dev_offset, stripe_size);
-		if (ret) {
-			btrfs_abort_transaction(trans, extent_root, ret);
-			goto error;
-		}
-	}
+	free_extent_map(em);
+	check_raid56_incompat_flag(extent_root->fs_info, type);
 
 	kfree(devices_info);
 	return 0;
 
+error_del_extent:
+	write_lock(&em_tree->lock);
+	remove_extent_mapping(em_tree, em);
+	write_unlock(&em_tree->lock);
+
+	/* One for our allocation */
+	free_extent_map(em);
+	/* One for the tree reference */
+	free_extent_map(em);
 error:
 	kfree(map);
 	kfree(devices_info);
 	return ret;
 }
 
-static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 				struct btrfs_root *extent_root,
-				struct map_lookup *map, u64 chunk_offset,
-				u64 chunk_size, u64 stripe_size)
+				u64 chunk_offset, u64 chunk_size)
 {
-	u64 dev_offset;
 	struct btrfs_key key;
 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
 	struct btrfs_device *device;
 	struct btrfs_chunk *chunk;
 	struct btrfs_stripe *stripe;
-	size_t item_size = btrfs_chunk_item_size(map->num_stripes);
-	int index = 0;
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	size_t item_size;
+	u64 dev_offset;
+	u64 stripe_size;
+	int i = 0;
 	int ret;
 
+	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
+	read_unlock(&em_tree->lock);
+
+	if (!em) {
+		btrfs_crit(extent_root->fs_info, "unable to find logical "
+			   "%Lu len %Lu", chunk_offset, chunk_size);
+		return -EINVAL;
+	}
+
+	if (em->start != chunk_offset || em->len != chunk_size) {
+		btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
+			  " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
+			  chunk_size, em->start, em->len);
+		free_extent_map(em);
+		return -EINVAL;
+	}
+
+	map = (struct map_lookup *)em->bdev;
+	item_size = btrfs_chunk_item_size(map->num_stripes);
+	stripe_size = em->orig_block_len;
+
 	chunk = kzalloc(item_size, GFP_NOFS);
-	if (!chunk)
-		return -ENOMEM;
+	if (!chunk) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < map->num_stripes; i++) {
+		device = map->stripes[i].dev;
+		dev_offset = map->stripes[i].physical;
 
-	index = 0;
-	while (index < map->num_stripes) {
-		device = map->stripes[index].dev;
 		device->bytes_used += stripe_size;
 		ret = btrfs_update_device(trans, device);
 		if (ret)
-			goto out_free;
-		index++;
+			goto out;
+		ret = btrfs_alloc_dev_extent(trans, device,
+					     chunk_root->root_key.objectid,
+					     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+					     chunk_offset, dev_offset,
+					     stripe_size);
+		if (ret)
+			goto out;
 	}
 
 	spin_lock(&extent_root->fs_info->free_chunk_lock);
@@ -3788,17 +4030,15 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
 						   map->num_stripes);
 	spin_unlock(&extent_root->fs_info->free_chunk_lock);
 
-	index = 0;
 	stripe = &chunk->stripe;
-	while (index < map->num_stripes) {
-		device = map->stripes[index].dev;
-		dev_offset = map->stripes[index].physical;
+	for (i = 0; i < map->num_stripes; i++) {
+		device = map->stripes[i].dev;
+		dev_offset = map->stripes[i].physical;
 
 		btrfs_set_stack_stripe_devid(stripe, device->devid);
 		btrfs_set_stack_stripe_offset(stripe, dev_offset);
 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
 		stripe++;
-		index++;
 	}
 
 	btrfs_set_stack_chunk_length(chunk, chunk_size);
@@ -3816,7 +4056,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
 	key.offset = chunk_offset;
 
 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
-
 	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		/*
 		 * TODO: Cleanup of inserted chunk root in case of
@@ -3826,8 +4065,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
 					     item_size);
 	}
 
-out_free:
+out:
 	kfree(chunk);
+	free_extent_map(em);
 	return ret;
 }
 
@@ -3842,27 +4082,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *extent_root, u64 type)
 {
 	u64 chunk_offset;
-	u64 chunk_size;
-	u64 stripe_size;
-	struct map_lookup *map;
-	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
-	int ret;
 
-	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-			      &chunk_offset);
-	if (ret)
-		return ret;
-
-	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
-				  &stripe_size, chunk_offset, type);
-	if (ret)
-		return ret;
-
-	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
-				   chunk_size, stripe_size);
-	if (ret)
-		return ret;
-	return 0;
+	chunk_offset = find_next_chunk(extent_root->fs_info);
+	return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
 }
 
 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
@@ -3871,72 +4093,31 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 {
 	u64 chunk_offset;
 	u64 sys_chunk_offset;
-	u64 chunk_size;
-	u64 sys_chunk_size;
-	u64 stripe_size;
-	u64 sys_stripe_size;
 	u64 alloc_profile;
-	struct map_lookup *map;
-	struct map_lookup *sys_map;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
 
-	ret = find_next_chunk(fs_info->chunk_root,
-			      BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
-	if (ret)
-		return ret;
-
-	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
-				fs_info->avail_metadata_alloc_bits;
-	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
-	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
-				  &stripe_size, chunk_offset, alloc_profile);
+	chunk_offset = find_next_chunk(fs_info);
+	alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
+	ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
+				  alloc_profile);
 	if (ret)
 		return ret;
 
-	sys_chunk_offset = chunk_offset + chunk_size;
-
-	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
-				fs_info->avail_system_alloc_bits;
-	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
-	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
-				  &sys_chunk_size, &sys_stripe_size,
-				  sys_chunk_offset, alloc_profile);
+	sys_chunk_offset = find_next_chunk(root->fs_info);
+	alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
+	ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
+				  alloc_profile);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto out;
 	}
 
 	ret = btrfs_add_device(trans, fs_info->chunk_root, device);
-	if (ret) {
-		btrfs_abort_transaction(trans, root, ret);
-		goto out;
-	}
-
-	/*
-	 * Modifying chunk tree needs allocating new blocks from both
-	 * system block group and metadata block group. So we only can
-	 * do operations require modifying the chunk tree after both
-	 * block groups were created.
-	 */
-	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
-				   chunk_size, stripe_size);
-	if (ret) {
-		btrfs_abort_transaction(trans, root, ret);
-		goto out;
-	}
-
-	ret = __finish_chunk_alloc(trans, extent_root, sys_map,
-				   sys_chunk_offset, sys_chunk_size,
-				   sys_stripe_size);
 	if (ret)
 		btrfs_abort_transaction(trans, root, ret);
-
 out:
-
 	return ret;
 }
 
@@ -4006,14 +4187,34 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, len);
 	read_unlock(&em_tree->lock);
-	BUG_ON(!em);
 
-	BUG_ON(em->start > logical || em->start + em->len < logical);
+	/*
+	 * We could return errors for these cases, but that could get ugly and
+	 * we'd probably do the same thing which is just not do anything else
+	 * and exit, so return 1 so the callers don't try to use other copies.
+	 */
+	if (!em) {
+		btrfs_emerg(fs_info, "No mapping for %Lu-%Lu\n", logical,
+			    logical+len);
+		return 1;
+	}
+
+	if (em->start > logical || em->start + em->len < logical) {
+		btrfs_emerg(fs_info, "Invalid mapping for %Lu-%Lu, got "
+			    "%Lu-%Lu\n", logical, logical+len, em->start,
+			    em->start + em->len);
+		return 1;
+	}
+
 	map = (struct map_lookup *)em->bdev;
 	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
 		ret = map->num_stripes;
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		ret = map->sub_stripes;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+		ret = 2;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+		ret = 3;
 	else
 		ret = 1;
 	free_extent_map(em);
@@ -4026,6 +4227,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	return ret;
 }
 
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+				    struct btrfs_mapping_tree *map_tree,
+				    u64 logical)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	unsigned long len = root->sectorsize;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6)) {
+		len = map->stripe_len * nr_data_stripes(map);
+	}
+	free_extent_map(em);
+	return len;
+}
+
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+			   u64 logical, u64 len, int mirror_num)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	int ret = 0;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6))
+		ret = 1;
+	free_extent_map(em);
+	return ret;
+}
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
 			    struct map_lookup *map, int first, int num,
 			    int optimal, int dev_replace_is_ongoing)
@@ -4063,10 +4310,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	return optimal;
 }
 
+static inline int parity_smaller(u64 a, u64 b)
+{
+	return a > b;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+{
+	struct btrfs_bio_stripe s;
+	int i;
+	u64 l;
+	int again = 1;
+
+	while (again) {
+		again = 0;
+		for (i = 0; i < bbio->num_stripes - 1; i++) {
+			if (parity_smaller(raid_map[i], raid_map[i+1])) {
+				s = bbio->stripes[i];
+				l = raid_map[i];
+				bbio->stripes[i] = bbio->stripes[i+1];
+				raid_map[i] = raid_map[i+1];
+				bbio->stripes[i+1] = s;
+				raid_map[i+1] = l;
+				again = 1;
+			}
+		}
+	}
+}
+
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_bio **bbio_ret,
-			     int mirror_num)
+			     int mirror_num, u64 **raid_map_ret)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -4078,6 +4354,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	u64 stripe_nr;
 	u64 stripe_nr_orig;
 	u64 stripe_nr_end;
+	u64 stripe_len;
+	u64 *raid_map = NULL;
 	int stripe_index;
 	int i;
 	int ret = 0;
@@ -4089,45 +4367,83 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	int num_alloc_stripes;
 	int patch_the_first_stripe_for_dev_replace = 0;
 	u64 physical_to_patch_in_first_stripe = 0;
+	u64 raid56_full_stripe_start = (u64)-1;
 
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
 	read_unlock(&em_tree->lock);
 
 	if (!em) {
-		printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n",
-		       (unsigned long long)logical,
-		       (unsigned long long)*length);
-		BUG();
+		btrfs_crit(fs_info, "unable to find logical %llu len %llu",
+			(unsigned long long)logical,
+			(unsigned long long)*length);
+		return -EINVAL;
+	}
+
+	if (em->start > logical || em->start + em->len < logical) {
+		btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
+			   "found %Lu-%Lu\n", logical, em->start,
+			   em->start + em->len);
+		return -EINVAL;
 	}
 
-	BUG_ON(em->start > logical || em->start + em->len < logical);
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
 
+	stripe_len = map->stripe_len;
 	stripe_nr = offset;
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	do_div(stripe_nr, map->stripe_len);
+	do_div(stripe_nr, stripe_len);
 
-	stripe_offset = stripe_nr * map->stripe_len;
+	stripe_offset = stripe_nr * stripe_len;
 	BUG_ON(offset < stripe_offset);
 
 	/* stripe_offset is the offset of this block in its stripe*/
 	stripe_offset = offset - stripe_offset;
 
-	if (rw & REQ_DISCARD)
+	/* if we're here for raid56, we need to know the stripe aligned start */
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+		raid56_full_stripe_start = offset;
+
+		/* allow a write of a full stripe, but make sure we don't
+		 * allow straddling of stripes
+		 */
+		do_div(raid56_full_stripe_start, full_stripe_len);
+		raid56_full_stripe_start *= full_stripe_len;
+	}
+
+	if (rw & REQ_DISCARD) {
+		/* we don't discard raid56 yet */
+		if (map->type &
+		    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
 		*length = min_t(u64, em->len - offset, *length);
-	else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
-		/* we limit the length of each bio to what fits in a stripe */
-		*length = min_t(u64, em->len - offset,
-				map->stripe_len - stripe_offset);
+	} else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+		u64 max_len;
+		/* For writes to RAID[56], allow a full stripeset across all disks.
+		   For other RAID types and for RAID[56] reads, just allow a single
+		   stripe (on a single disk). */
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+		    (rw & REQ_WRITE)) {
+			max_len = stripe_len * nr_data_stripes(map) -
+				(offset - raid56_full_stripe_start);
+		} else {
+			/* we limit the length of each bio to what fits in a stripe */
+			max_len = stripe_len - stripe_offset;
+		}
+		*length = min_t(u64, em->len - offset, max_len);
 	} else {
 		*length = em->len - offset;
 	}
 
+	/* This is for when we're called from btrfs_merge_bio_hook() and all
+	   it cares about is the length */
 	if (!bbio_ret)
 		goto out;
 
@@ -4160,7 +4476,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		u64 physical_of_found = 0;
 
 		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
-			     logical, &tmp_length, &tmp_bbio, 0);
+			     logical, &tmp_length, &tmp_bbio, 0, NULL);
 		if (ret) {
 			WARN_ON(tmp_bbio != NULL);
 			goto out;
@@ -4221,11 +4537,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	num_stripes = 1;
 	stripe_index = 0;
 	stripe_nr_orig = stripe_nr;
-	stripe_nr_end = (offset + *length + map->stripe_len - 1) &
-			(~(map->stripe_len - 1));
+	stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
 	do_div(stripe_nr_end, map->stripe_len);
 	stripe_end_offset = stripe_nr_end * map->stripe_len -
 			    (offset + *length);
+
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 		if (rw & REQ_DISCARD)
 			num_stripes = min_t(u64, map->num_stripes,
@@ -4276,6 +4592,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 					      dev_replace_is_ongoing);
 			mirror_num = stripe_index - old_stripe_index + 1;
 		}
+
+	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6)) {
+		u64 tmp;
+
+		if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
+		    && raid_map_ret) {
+			int i, rot;
+
+			/* push stripe_nr back to the start of the full stripe */
+			stripe_nr = raid56_full_stripe_start;
+			do_div(stripe_nr, stripe_len);
+
+			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+
+			/* RAID[56] write or recovery. Return all stripes */
+			num_stripes = map->num_stripes;
+			max_errors = nr_parity_stripes(map);
+
+			raid_map = kmalloc(sizeof(u64) * num_stripes,
+					   GFP_NOFS);
+			if (!raid_map) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			/* Work out the disk rotation on this stripe-set */
+			tmp = stripe_nr;
+			rot = do_div(tmp, num_stripes);
+
+			/* Fill in the logical address of each stripe */
+			tmp = stripe_nr * nr_data_stripes(map);
+			for (i = 0; i < nr_data_stripes(map); i++)
+				raid_map[(i+rot) % num_stripes] =
+					em->start + (tmp + i) * map->stripe_len;
+
+			raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+			if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+				raid_map[(i+rot+1) % num_stripes] =
+					RAID6_Q_STRIPE;
+
+			*length = map->stripe_len;
+			stripe_index = 0;
+			stripe_offset = 0;
+		} else {
+			/*
+			 * Mirror #0 or #1 means the original data block.
+			 * Mirror #2 is RAID5 parity block.
+			 * Mirror #3 is RAID6 Q block.
+			 */
+			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+			if (mirror_num > 1)
+				stripe_index = nr_data_stripes(map) +
+						mirror_num - 2;
+
+			/* We distribute the parity blocks across stripes */
+			tmp = stripe_nr + stripe_index;
+			stripe_index = do_div(tmp, map->num_stripes);
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -4384,8 +4759,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 				 BTRFS_BLOCK_GROUP_RAID10 |
+				 BTRFS_BLOCK_GROUP_RAID5 |
 				 BTRFS_BLOCK_GROUP_DUP)) {
 			max_errors = 1;
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+			max_errors = 2;
 		}
 	}
 
@@ -4486,6 +4864,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
 		bbio->mirror_num = map->num_stripes + 1;
 	}
+	if (raid_map) {
+		sort_parity_stripes(bbio, raid_map);
+		*raid_map_ret = raid_map;
+	}
 out:
 	if (dev_replace_is_ongoing)
 		btrfs_dev_replace_unlock(dev_replace);
@@ -4498,7 +4880,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		      struct btrfs_bio **bbio_ret, int mirror_num)
 {
 	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
-				 mirror_num);
+				 mirror_num, NULL);
 }
 
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4512,20 +4894,39 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	u64 bytenr;
 	u64 length;
 	u64 stripe_nr;
+	u64 rmap_len;
 	int i, j, nr = 0;
 
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, chunk_start, 1);
 	read_unlock(&em_tree->lock);
 
-	BUG_ON(!em || em->start != chunk_start);
+	if (!em) {
+		printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n",
+		       chunk_start);
+		return -EIO;
+	}
+
+	if (em->start != chunk_start) {
+		printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n",
+		       em->start, chunk_start);
+		free_extent_map(em);
+		return -EIO;
+	}
 	map = (struct map_lookup *)em->bdev;
 
 	length = em->len;
+	rmap_len = map->stripe_len;
+
 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		do_div(length, map->num_stripes / map->sub_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
 		do_div(length, map->num_stripes);
+	else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			      BTRFS_BLOCK_GROUP_RAID6)) {
+		do_div(length, nr_data_stripes(map));
+		rmap_len = map->stripe_len * nr_data_stripes(map);
+	}
 
 	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
 	BUG_ON(!buf); /* -ENOMEM */
@@ -4545,8 +4946,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 			do_div(stripe_nr, map->sub_stripes);
 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 			stripe_nr = stripe_nr * map->num_stripes + i;
-		}
-		bytenr = chunk_start + stripe_nr * map->stripe_len;
+		} /* else if RAID[56], multiply by nr_data_stripes().
+		   * Alternatively, just use rmap_len below instead of
+		   * map->stripe_len */
+
+		bytenr = chunk_start + stripe_nr * rmap_len;
 		WARN_ON(nr >= map->num_stripes);
 		for (j = 0; j < nr; j++) {
 			if (buf[j] == bytenr)
@@ -4560,48 +4964,22 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 
 	*logical = buf;
 	*naddrs = nr;
-	*stripe_len = map->stripe_len;
+	*stripe_len = rmap_len;
 
 	free_extent_map(em);
 	return 0;
 }
 
-static void *merge_stripe_index_into_bio_private(void *bi_private,
-						 unsigned int stripe_index)
-{
-	/*
-	 * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
-	 * at most 1.
-	 * The alternative solution (instead of stealing bits from the
-	 * pointer) would be to allocate an intermediate structure
-	 * that contains the old private pointer plus the stripe_index.
-	 */
-	BUG_ON((((uintptr_t)bi_private) & 3) != 0);
-	BUG_ON(stripe_index > 3);
-	return (void *)(((uintptr_t)bi_private) | stripe_index);
-}
-
-static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
-{
-	return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
-}
-
-static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
-{
-	return (unsigned int)((uintptr_t)bi_private) & 3;
-}
-
 static void btrfs_end_bio(struct bio *bio, int err)
 {
-	struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
+	struct btrfs_bio *bbio = bio->bi_private;
 	int is_orig_bio = 0;
 
 	if (err) {
 		atomic_inc(&bbio->error);
 		if (err == -EIO || err == -EREMOTEIO) {
 			unsigned int stripe_index =
-				extract_stripe_index_from_bio_private(
-					bio->bi_private);
+				btrfs_io_bio(bio)->stripe_index;
 			struct btrfs_device *dev;
 
 			BUG_ON(stripe_index >= bbio->num_stripes);
@@ -4631,10 +5009,9 @@ static void btrfs_end_bio(struct bio *bio, int err)
 		}
 		bio->bi_private = bbio->private;
 		bio->bi_end_io = bbio->end_io;
-		bio->bi_bdev = (struct block_device *)
-					(unsigned long)bbio->mirror_num;
+		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		/* only send an error to the higher layers if it is
-		 * beyond the tolerance of the multi-bio
+		 * beyond the tolerance of the btrfs bio
 		 */
 		if (atomic_read(&bbio->error) > bbio->max_errors) {
 			err = -EIO;
@@ -4668,13 +5045,18 @@ struct async_sched {
  * This will add one bio to the pending list for a device and make sure
  * the work struct is scheduled.
  */
-static noinline void schedule_bio(struct btrfs_root *root,
-				 struct btrfs_device *device,
-				 int rw, struct bio *bio)
+static noinline void btrfs_schedule_bio(struct btrfs_root *root,
+					struct btrfs_device *device,
+					int rw, struct bio *bio)
 {
 	int should_queue = 1;
 	struct btrfs_pending_bios *pending_bios;
 
+	if (device->missing || !device->bdev) {
+		bio_endio(bio, -EIO);
+		return;
+	}
+
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & REQ_WRITE)) {
 		bio_get(bio);
@@ -4734,7 +5116,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
 	}
 
 	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
-	if ((bio->bi_size >> 9) > max_sectors)
+	if (bio_sectors(bio) > max_sectors)
 		return 0;
 
 	if (!q->merge_bvec_fn)
@@ -4753,8 +5135,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
 	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
 
 	bio->bi_private = bbio;
-	bio->bi_private = merge_stripe_index_into_bio_private(
-			bio->bi_private, (unsigned int)dev_nr);
+	btrfs_io_bio(bio)->stripe_index = dev_nr;
 	bio->bi_end_io = btrfs_end_bio;
 	bio->bi_sector = physical >> 9;
 #ifdef DEBUG
@@ -4772,7 +5153,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
 #endif
 	bio->bi_bdev = dev->bdev;
 	if (async)
-		schedule_bio(root, dev, rw, bio);
+		btrfs_schedule_bio(root, dev, rw, bio);
 	else
 		btrfsic_submit_bio(rw, bio);
 }
@@ -4815,8 +5196,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
 	if (atomic_dec_and_test(&bbio->stripes_pending)) {
 		bio->bi_private = bbio->private;
 		bio->bi_end_io = bbio->end_io;
-		bio->bi_bdev = (struct block_device *)
-			(unsigned long)bbio->mirror_num;
+		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		bio->bi_sector = logical >> 9;
 		kfree(bbio);
 		bio_endio(bio, -EIO);
@@ -4831,6 +5211,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
+	u64 *raid_map = NULL;
 	int ret;
 	int dev_nr = 0;
 	int total_devs = 1;
@@ -4839,25 +5220,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	length = bio->bi_size;
 	map_length = length;
 
-	ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
-			      mirror_num);
-	if (ret)
+	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
+			      mirror_num, &raid_map);
+	if (ret) /* -ENOMEM */
 		return ret;
 
 	total_devs = bbio->num_stripes;
-	if (map_length < length) {
-		printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
-		       "len %llu\n", (unsigned long long)logical,
-		       (unsigned long long)length,
-		       (unsigned long long)map_length);
-		BUG();
-	}
-
 	bbio->orig_bio = first_bio;
 	bbio->private = first_bio->bi_private;
 	bbio->end_io = first_bio->bi_end_io;
 	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
+	if (raid_map) {
+		/* In this case, map_length has been set to the length of
+		   a single stripe; not the whole write */
+		if (rw & WRITE) {
+			return raid56_parity_write(root, bio, bbio,
+						   raid_map, map_length);
+		} else {
+			return raid56_parity_recover(root, bio, bbio,
+						     raid_map, map_length,
+						     mirror_num);
+		}
+	}
+
+	if (map_length < length) {
+		btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu",
+			(unsigned long long)logical,
+			(unsigned long long)length,
+			(unsigned long long)map_length);
+		BUG();
+	}
+
 	while (dev_nr < total_devs) {
 		dev = bbio->stripes[dev_nr].dev;
 		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
@@ -4880,7 +5274,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		}
 
 		if (dev_nr < total_devs - 1) {
-			bio = bio_clone(first_bio, GFP_NOFS);
+			bio = btrfs_bio_clone(first_bio, GFP_NOFS);
 			BUG_ON(!bio); /* -ENOMEM */
 		} else {
 			bio = first_bio;
@@ -4925,7 +5319,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 		return NULL;
 	list_add(&device->dev_list,
 		 &fs_devices->devices);
-	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
 	device->work.func = pending_bios_fn;
 	device->fs_devices = fs_devices;
@@ -5019,7 +5412,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	}
 
 	write_lock(&map_tree->map_tree.lock);
-	ret = add_extent_mapping(&map_tree->map_tree, em);
+	ret = add_extent_mapping(&map_tree->map_tree, em, 0);
 	write_unlock(&map_tree->map_tree.lock);
 	BUG_ON(ret); /* Tree corruption */
 	free_extent_map(em);
@@ -5126,8 +5519,8 @@ static int read_one_dev(struct btrfs_root *root,
 			return -EIO;
 
 		if (!device) {
-			printk(KERN_WARNING "warning devid %llu missing\n",
-			       (unsigned long long)devid);
+			btrfs_warn(root->fs_info, "devid %llu missing",
+				(unsigned long long)devid);
 			device = add_missing_dev(root, devid, dev_uuid);
 			if (!device)
 				return -ENOMEM;
@@ -5151,7 +5544,6 @@ static int read_one_dev(struct btrfs_root *root,
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
-	device->dev_root = root->fs_info->dev_root;
 	device->in_fs_metadata = 1;
 	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
 		device->fs_devices->total_rw_bytes += device->total_bytes;
@@ -5309,6 +5701,17 @@ error:
 	return ret;
 }
 
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *device;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list)
+		device->dev_root = fs_info->dev_root;
+	mutex_unlock(&fs_devices->device_list_mutex);
+}
+
 static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
 {
 	int i;
@@ -5469,7 +5872,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
 	btrfs_dev_stat_print_on_error(dev);
 }
 
-void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
+static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
 {
 	if (!dev->dev_stats_valid)
 		return;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d3c3939ac75..86705583480 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -21,8 +21,8 @@
 
 #include <linux/bio.h>
 #include <linux/sort.h>
+#include <linux/btrfs.h>
 #include "async-thread.h"
-#include "ioctl.h"
 
 #define BTRFS_STRIPE_LEN	(64 * 1024)
 
@@ -152,6 +152,26 @@ struct btrfs_fs_devices {
 	int rotating;
 };
 
+/*
+ * we need the mirror number and stripe index to be passed around
+ * the call chain while we are processing end_io (especially errors).
+ * Really, what we need is a btrfs_bio structure that has this info
+ * and is properly sized with its stripe array, but we're not there
+ * quite yet.  We have our own btrfs bioset, and all of the bios
+ * we allocate are actually btrfs_io_bios.  We'll cram as much of
+ * struct btrfs_bio as we can into this over time.
+ */
+struct btrfs_io_bio {
+	unsigned long mirror_num;
+	unsigned long stripe_index;
+	struct bio bio;
+};
+
+static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+{
+	return container_of(bio, struct btrfs_io_bio, bio);
+}
+
 struct btrfs_bio_stripe {
 	struct btrfs_device *dev;
 	u64 physical;
@@ -254,10 +274,6 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 #define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
-int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
-			   struct btrfs_device *device,
-			   u64 chunk_tree, u64 chunk_objectid,
-			   u64 chunk_offset, u64 start, u64 num_bytes);
 int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		    u64 logical, u64 *length,
 		    struct btrfs_bio **bbio_ret, int mirror_num);
@@ -282,11 +298,6 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
 int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
 					 char *device_path,
 					 struct btrfs_device **device);
-int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
-			      struct btrfs_device **device);
-int btrfs_add_device(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root,
-		     struct btrfs_device *device);
 int btrfs_rm_device(struct btrfs_root *root, char *device_path);
 void btrfs_cleanup_fs_uuids(void);
 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
@@ -305,12 +316,13 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
 int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
-int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+			 struct btrfs_device *device, u64 num_bytes,
 			 u64 *start, u64 *max_avail);
-void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
 int btrfs_get_dev_stats(struct btrfs_root *root,
 			struct btrfs_ioctl_get_dev_stats *stats);
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
 			struct btrfs_fs_info *fs_info);
@@ -321,7 +333,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
 					      struct btrfs_device *tgtdev);
 int btrfs_scratch_superblock(struct btrfs_device *device);
-
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+			   u64 logical, u64 len, int mirror_num);
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+				    struct btrfs_mapping_tree *map_tree,
+				    u64 logical);
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+				struct btrfs_root *extent_root,
+				u64 chunk_offset, u64 chunk_size);
 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
 				      int index)
 {
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 446a6848c55..05740b9789e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -406,8 +406,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
 				XATTR_REPLACE);
 }
 
-int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
-		     void *fs_info)
+static int btrfs_initxattrs(struct inode *inode,
+			    const struct xattr *xattr_array, void *fs_info)
 {
 	const struct xattr *xattr;
 	struct btrfs_trans_handle *trans = fs_info;
diff --git a/fs/buffer.c b/fs/buffer.c
index 7a75c3e0fd5..4d7433534f5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <trace/events/block.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -53,6 +54,13 @@ void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 }
 EXPORT_SYMBOL(init_buffer);
 
+inline void touch_buffer(struct buffer_head *bh)
+{
+	trace_block_touch_buffer(bh);
+	mark_page_accessed(bh->b_page);
+}
+EXPORT_SYMBOL(touch_buffer);
+
 static int sleep_on_buffer(void *word)
 {
 	io_schedule();
@@ -75,6 +83,40 @@ void unlock_buffer(struct buffer_head *bh)
 EXPORT_SYMBOL(unlock_buffer);
 
 /*
+ * Returns if the page has dirty or writeback buffers. If all the buffers
+ * are unlocked and clean then the PageDirty information is stale. If
+ * any of the pages are locked, it is assumed they are locked for IO.
+ */
+void buffer_check_dirty_writeback(struct page *page,
+				     bool *dirty, bool *writeback)
+{
+	struct buffer_head *head, *bh;
+	*dirty = false;
+	*writeback = false;
+
+	BUG_ON(!PageLocked(page));
+
+	if (!page_has_buffers(page))
+		return;
+
+	if (PageWriteback(page))
+		*writeback = true;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (buffer_locked(bh))
+			*writeback = true;
+
+		if (buffer_dirty(bh))
+			*dirty = true;
+
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+EXPORT_SYMBOL(buffer_check_dirty_writeback);
+
+/*
  * Block until a buffer comes unlocked.  This doesn't stop it
  * from becoming locked again - you have to lock it yourself
  * if you want to preserve its state.
@@ -857,8 +899,6 @@ try_again:
 
 		/* Link the buffer to its page */
 		set_bh_page(bh, page, offset);
-
-		init_buffer(bh, NULL, NULL);
 	}
 	return head;
 /*
@@ -1113,6 +1153,8 @@ void mark_buffer_dirty(struct buffer_head *bh)
 {
 	WARN_ON_ONCE(!buffer_uptodate(bh));
 
+	trace_block_dirty_buffer(bh);
+
 	/*
 	 * Very *carefully* optimize the it-is-already-dirty case.
 	 *
@@ -1446,7 +1488,8 @@ static void discard_buffer(struct buffer_head * bh)
  * block_invalidatepage - invalidate part or all of a buffer-backed page
  *
  * @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
  *
  * block_invalidatepage() is called when all or part of the page has become
  * invalidated by a truncate operation.
@@ -1457,15 +1500,22 @@ static void discard_buffer(struct buffer_head * bh)
  * point.  Because the caller is about to free (and possibly reuse) those
  * blocks on-disk.
  */
-void block_invalidatepage(struct page *page, unsigned long offset)
+void block_invalidatepage(struct page *page, unsigned int offset,
+			  unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
 	unsigned int curr_off = 0;
+	unsigned int stop = length + offset;
 
 	BUG_ON(!PageLocked(page));
 	if (!page_has_buffers(page))
 		goto out;
 
+	/*
+	 * Check for overflow
+	 */
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	head = page_buffers(page);
 	bh = head;
 	do {
@@ -1473,6 +1523,12 @@ void block_invalidatepage(struct page *page, unsigned long offset)
 		next = bh->b_this_page;
 
 		/*
+		 * Are we still fully in range ?
+		 */
+		if (next_off > stop)
+			goto out;
+
+		/*
 		 * is this block fully invalidated?
 		 */
 		if (offset <= curr_off)
@@ -1493,6 +1549,7 @@ out:
 }
 EXPORT_SYMBOL(block_invalidatepage);
 
+
 /*
  * We attach and possibly dirty the buffers atomically wrt
  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
@@ -2332,7 +2389,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 			 get_block_t get_block)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(vma->vm_file);
 	unsigned long end;
 	loff_t size;
 	int ret;
@@ -2359,7 +2416,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (unlikely(ret < 0))
 		goto out_unlock;
 	set_page_dirty(page);
-	wait_on_page_writeback(page);
+	wait_for_stable_page(page);
 	return 0;
 out_unlock:
 	unlock_page(page);
@@ -2371,7 +2428,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 		   get_block_t get_block)
 {
 	int ret;
-	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
 
 	sb_start_pagefault(sb);
 
@@ -2833,7 +2890,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
 		 * they may have been added in ext3_writepage().  Make them
 		 * freeable here, so the page does not leak.
 		 */
-		do_invalidatepage(page, 0);
+		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		unlock_page(page);
 		return 0; /* don't care */
 	}
@@ -2939,7 +2996,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
 	}
 }
 
-int submit_bh(int rw, struct buffer_head * bh)
+int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 {
 	struct bio *bio;
 	int ret = 0;
@@ -2969,15 +3026,20 @@ int submit_bh(int rw, struct buffer_head * bh)
 	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
 
 	bio->bi_vcnt = 1;
-	bio->bi_idx = 0;
 	bio->bi_size = bh->b_size;
 
 	bio->bi_end_io = end_bio_bh_io_sync;
 	bio->bi_private = bh;
+	bio->bi_flags |= bio_flags;
 
 	/* Take care of bh's that straddle the end of the device */
 	guard_bh_eod(rw, bio, bh);
 
+	if (buffer_meta(bh))
+		rw |= REQ_META;
+	if (buffer_prio(bh))
+		rw |= REQ_PRIO;
+
 	bio_get(bio);
 	submit_bio(rw, bio);
 
@@ -2987,6 +3049,12 @@ int submit_bh(int rw, struct buffer_head * bh)
 	bio_put(bio);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(_submit_bh);
+
+int submit_bh(int rw, struct buffer_head *bh)
+{
+	return _submit_bh(rw, bh, 0);
+}
 EXPORT_SYMBOL(submit_bh);
 
 /**
@@ -3227,7 +3295,7 @@ static struct kmem_cache *bh_cachep __read_mostly;
  * Once the number of bh's in the machine exceeds this level, we start
  * stripping them in writeback.
  */
-static int max_buffer_heads;
+static unsigned long max_buffer_heads;
 
 int buffer_heads_over_limit;
 
@@ -3343,7 +3411,7 @@ EXPORT_SYMBOL(bh_submit_read);
 
 void __init buffer_init(void)
 {
-	int nrpages;
+	unsigned long nrpages;
 
 	bh_cachep = kmem_cache_create("buffer_head",
 			sizeof(struct buffer_head), 0,
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 746ce532e13..d4c1206af9f 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -13,8 +13,6 @@
 #include <linux/mount.h>
 #include "internal.h"
 
-#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
-
 struct cachefiles_lookup_data {
 	struct cachefiles_xattr	*auxdata;	/* auxiliary data */
 	char			*key;		/* key path */
@@ -212,20 +210,29 @@ static void cachefiles_update_object(struct fscache_object *_object)
 	object = container_of(_object, struct cachefiles_object, fscache);
 	cache = container_of(object->fscache.cache, struct cachefiles_cache,
 			     cache);
+
+	if (!fscache_use_cookie(_object)) {
+		_leave(" [relinq]");
+		return;
+	}
+
 	cookie = object->fscache.cookie;
 
 	if (!cookie->def->get_aux) {
+		fscache_unuse_cookie(_object);
 		_leave(" [no aux]");
 		return;
 	}
 
 	auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
 	if (!auxdata) {
+		fscache_unuse_cookie(_object);
 		_leave(" [nomem]");
 		return;
 	}
 
 	auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
+	fscache_unuse_cookie(_object);
 	ASSERTCMP(auxlen, <, 511);
 
 	auxdata->len = auxlen + 1;
@@ -263,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
 #endif
 
 	/* delete retired objects */
-	if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
+	if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) &&
 	    _object != cache->cache.fsdef
 	    ) {
 		_debug("- retire object OBJ%x", object->fscache.debug_id);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 8c01c5fcdf7..25badd1aec5 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -38,7 +38,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
 	printk(KERN_ERR "%sobject: OBJ%x\n",
 	       prefix, object->fscache.debug_id);
 	printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
-	       prefix, fscache_object_states[object->fscache.state],
+	       prefix, object->fscache.state->name,
 	       object->fscache.flags, work_busy(&object->fscache.work),
 	       object->fscache.events, object->fscache.event_mask);
 	printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -127,10 +127,10 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
 found_dentry:
 	kdebug("preemptive burial: OBJ%x [%s] %p",
 	       object->fscache.debug_id,
-	       fscache_object_states[object->fscache.state],
+	       object->fscache.state->name,
 	       dentry);
 
-	if (object->fscache.state < FSCACHE_OBJECT_DYING) {
+	if (fscache_object_is_live(&object->fscache)) {
 		printk(KERN_ERR "\n");
 		printk(KERN_ERR "CacheFiles: Error:"
 		       " Can't preemptively bury live object\n");
@@ -192,7 +192,7 @@ try_again:
 	/* an old object from a previous incarnation is hogging the slot - we
 	 * need to wait for it to be destroyed */
 wait_for_old_object:
-	if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
+	if (fscache_object_is_live(&object->fscache)) {
 		printk(KERN_ERR "\n");
 		printk(KERN_ERR "CacheFiles: Error:"
 		       " Unexpected object collision\n");
@@ -836,7 +836,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
 	//       dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
 
 	/* look up the victim */
-	mutex_lock_nested(&dir->d_inode->i_mutex, 1);
+	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
 
 	start = jiffies;
 	victim = lookup_one_len(filename, dir, strlen(filename));
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 48099225970..ebaff368120 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -12,6 +12,7 @@
 #include <linux/mount.h>
 #include <linux/slab.h>
 #include <linux/file.h>
+#include <linux/swap.h>
 #include "internal.h"
 
 /*
@@ -227,8 +228,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
  */
 static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 					    struct fscache_retrieval *op,
-					    struct page *netpage,
-					    struct pagevec *pagevec)
+					    struct page *netpage)
 {
 	struct cachefiles_one_read *monitor;
 	struct address_space *bmapping;
@@ -237,8 +237,6 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 
 	_enter("");
 
-	pagevec_reinit(pagevec);
-
 	_debug("read back %p{%lu,%d}",
 	       netpage, netpage->index, page_count(netpage));
 
@@ -283,9 +281,7 @@ installed_new_backing_page:
 	backpage = newpage;
 	newpage = NULL;
 
-	page_cache_get(backpage);
-	pagevec_add(pagevec, backpage);
-	__pagevec_lru_add_file(pagevec);
+	lru_cache_add_file(backpage);
 
 read_backing_page:
 	ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -452,8 +448,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 	if (block) {
 		/* submit the apparently valid page to the backing fs to be
 		 * read from disk */
-		ret = cachefiles_read_backing_file_one(object, op, page,
-						       &pagevec);
+		ret = cachefiles_read_backing_file_one(object, op, page);
 	} else if (cachefiles_has_space(cache, 0, 1) == 0) {
 		/* there's space in the cache we can use */
 		fscache_mark_page_cached(op, page);
@@ -482,14 +477,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 {
 	struct cachefiles_one_read *monitor = NULL;
 	struct address_space *bmapping = object->backer->d_inode->i_mapping;
-	struct pagevec lru_pvec;
 	struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
 	int ret = 0;
 
 	_enter("");
 
-	pagevec_init(&lru_pvec, 0);
-
 	list_for_each_entry_safe(netpage, _n, list, lru) {
 		list_del(&netpage->lru);
 
@@ -534,9 +526,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 		backpage = newpage;
 		newpage = NULL;
 
-		page_cache_get(backpage);
-		if (!pagevec_add(&lru_pvec, backpage))
-			__pagevec_lru_add_file(&lru_pvec);
+		lru_cache_add_file(backpage);
 
 	reread_backing_page:
 		ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -559,9 +549,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 			goto nomem;
 		}
 
-		page_cache_get(netpage);
-		if (!pagevec_add(&lru_pvec, netpage))
-			__pagevec_lru_add_file(&lru_pvec);
+		lru_cache_add_file(netpage);
 
 		/* install a monitor */
 		page_cache_get(netpage);
@@ -643,9 +631,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 
 		fscache_mark_page_cached(op, netpage);
 
-		page_cache_get(netpage);
-		if (!pagevec_add(&lru_pvec, netpage))
-			__pagevec_lru_add_file(&lru_pvec);
+		lru_cache_add_file(netpage);
 
 		/* the netpage is unlocked and marked up to date here */
 		fscache_end_io(op, netpage, 0);
@@ -661,8 +647,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 
 out:
 	/* tidy up */
-	pagevec_lru_add_file(&lru_pvec);
-
 	if (newpage)
 		page_cache_release(newpage);
 	if (netpage)
@@ -962,12 +946,14 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
 			}
 
 			data = kmap(page);
+			file_start_write(file);
 			old_fs = get_fs();
 			set_fs(KERNEL_DS);
 			ret = file->f_op->write(
 				file, (const void __user *) data, len, &pos);
 			set_fs(old_fs);
 			kunmap(page);
+			file_end_write(file);
 			if (ret != len)
 				ret = -EIO;
 		}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 73b46288b54..2476e516260 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -109,13 +109,12 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object,
 	struct dentry *dentry = object->dentry;
 	int ret;
 
-	ASSERT(object->fscache.cookie);
 	ASSERT(dentry);
 
 	_enter("%p,#%d", object, auxdata->len);
 
 	/* attempt to install the cache metadata directly */
-	_debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+	_debug("SET #%u", auxdata->len);
 
 	ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
 			   &auxdata->type, auxdata->len,
@@ -138,13 +137,12 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
 	struct dentry *dentry = object->dentry;
 	int ret;
 
-	ASSERT(object->fscache.cookie);
 	ASSERT(dentry);
 
 	_enter("%p,#%d", object, auxdata->len);
 
 	/* attempt to install the cache metadata directly */
-	_debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+	_debug("SET #%u", auxdata->len);
 
 	ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
 			   &auxdata->type, auxdata->len,
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 9eb134ea6eb..49bc78243db 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,6 +1,6 @@
 config CEPH_FS
-        tristate "Ceph distributed file system (EXPERIMENTAL)"
-	depends on INET && EXPERIMENTAL
+	tristate "Ceph distributed file system"
+	depends on INET
 	select CEPH_LIB
 	select LIBCRC32C
 	select CRYPTO_AES
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 064d1a68d2c..5318a3b704f 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page)
  * dirty page counters appropriately.  Only called if there is private
  * data on the page.
  */
-static void ceph_invalidatepage(struct page *page, unsigned long offset)
+static void ceph_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
@@ -163,20 +164,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
 	if (!PageDirty(page))
 		pr_err("%p invalidatepage %p page not dirty\n", inode, page);
 
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		ClearPageChecked(page);
 
 	ci = ceph_inode(inode);
-	if (offset == 0) {
-		dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
-		     inode, page, page->index, offset);
+	if (offset == 0 && length == PAGE_CACHE_SIZE) {
+		dout("%p invalidatepage %p idx %lu full dirty page\n",
+		     inode, page, page->index);
 		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
 		ceph_put_snap_context(snapc);
 		page->private = 0;
 		ClearPagePrivate(page);
 	} else {
-		dout("%p invalidatepage %p idx %lu partial dirty page\n",
-		     inode, page, page->index);
+		dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
+		     inode, page, page->index, offset, length);
 	}
 }
 
@@ -195,7 +196,7 @@ static int ceph_releasepage(struct page *page, gfp_t g)
  */
 static int readpage_nounlock(struct file *filp, struct page *page)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_osd_client *osdc = 
 		&ceph_inode_to_client(inode)->client->osdc;
@@ -236,21 +237,21 @@ static int ceph_readpage(struct file *filp, struct page *page)
 static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
 {
 	struct inode *inode = req->r_inode;
-	struct ceph_osd_reply_head *replyhead;
-	int rc, bytes;
+	struct ceph_osd_data *osd_data;
+	int rc = req->r_result;
+	int bytes = le32_to_cpu(msg->hdr.data_len);
+	int num_pages;
 	int i;
 
-	/* parse reply */
-	replyhead = msg->front.iov_base;
-	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
-	rc = le32_to_cpu(replyhead->result);
-	bytes = le32_to_cpu(msg->hdr.data_len);
-
 	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
 
 	/* unlock all pages, zeroing any data we didn't read */
-	for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) {
-		struct page *page = req->r_pages[i];
+	osd_data = osd_req_op_extent_osd_data(req, 0);
+	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+	num_pages = calc_pages_for((u64)osd_data->alignment,
+					(u64)osd_data->length);
+	for (i = 0; i < num_pages; i++) {
+		struct page *page = osd_data->pages[i];
 
 		if (bytes < (int)PAGE_CACHE_SIZE) {
 			/* zero (remainder of) page */
@@ -263,8 +264,9 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
 		SetPageUptodate(page);
 		unlock_page(page);
 		page_cache_release(page);
+		bytes -= PAGE_CACHE_SIZE;
 	}
-	kfree(req->r_pages);
+	kfree(osd_data->pages);
 }
 
 static void ceph_unlock_page_vector(struct page **pages, int num_pages)
@@ -285,6 +287,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 		&ceph_inode_to_client(inode)->client->osdc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct page *page = list_entry(page_list->prev, struct page, lru);
+	struct ceph_vino vino;
 	struct ceph_osd_request *req;
 	u64 off;
 	u64 len;
@@ -309,18 +312,17 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 	len = nr_pages << PAGE_CACHE_SHIFT;
 	dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
 	     off, len);
-
-	req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
-				    off, &len,
-				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
-				    NULL, 0,
+	vino = ceph_vino(inode);
+	req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
+				    1, CEPH_OSD_OP_READ,
+				    CEPH_OSD_FLAG_READ, NULL,
 				    ci->i_truncate_seq, ci->i_truncate_size,
-				    NULL, false, 1, 0);
+				    false);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
 	/* build page vector */
-	nr_pages = len >> PAGE_CACHE_SHIFT;
+	nr_pages = calc_pages_for(0, len);
 	pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
 	ret = -ENOMEM;
 	if (!pages)
@@ -342,11 +344,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 		}
 		pages[i] = page;
 	}
-	req->r_pages = pages;
-	req->r_num_pages = nr_pages;
+	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
 	req->r_callback = finish_read;
 	req->r_inode = inode;
 
+	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
+
 	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
 	ret = ceph_osdc_start_request(osdc, req, false);
 	if (ret < 0)
@@ -370,7 +373,7 @@ out:
 static int ceph_readpages(struct file *file, struct address_space *mapping,
 			  struct list_head *page_list, unsigned nr_pages)
 {
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	int rc = 0;
 	int max = 0;
@@ -379,7 +382,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 		max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
 			>> PAGE_SHIFT;
 
-	dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages,
+	dout("readpages %p file %p nr_pages %d max %d\n", inode,
+		file, nr_pages,
 	     max);
 	while (!list_empty(page_list)) {
 		rc = start_read(inode, page_list, max);
@@ -435,13 +439,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	struct ceph_inode_info *ci;
 	struct ceph_fs_client *fsc;
 	struct ceph_osd_client *osdc;
-	loff_t page_off = page_offset(page);
-	int len = PAGE_CACHE_SIZE;
-	loff_t i_size;
-	int err = 0;
 	struct ceph_snap_context *snapc, *oldest;
-	u64 snap_size = 0;
+	loff_t page_off = page_offset(page);
 	long writeback_stat;
+	u64 truncate_size, snap_size = 0;
+	u32 truncate_seq;
+	int err = 0, len = PAGE_CACHE_SIZE;
 
 	dout("writepage %p idx %lu\n", page, page->index);
 
@@ -471,13 +474,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	}
 	ceph_put_snap_context(oldest);
 
+	spin_lock(&ci->i_ceph_lock);
+	truncate_seq = ci->i_truncate_seq;
+	truncate_size = ci->i_truncate_size;
+	if (!snap_size)
+		snap_size = i_size_read(inode);
+	spin_unlock(&ci->i_ceph_lock);
+
 	/* is this a partial page at end of file? */
-	if (snap_size)
-		i_size = snap_size;
-	else
-		i_size = i_size_read(inode);
-	if (i_size < page_off + len)
-		len = i_size - page_off;
+	if (page_off >= snap_size) {
+		dout("%p page eof %llu\n", page, snap_size);
+		goto out;
+	}
+	if (snap_size < page_off + len)
+		len = snap_size - page_off;
 
 	dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
 	     inode, page, page->index, page_off, len, snapc);
@@ -491,9 +501,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
 				   &ci->i_layout, snapc,
 				   page_off, len,
-				   ci->i_truncate_seq, ci->i_truncate_size,
-				   &inode->i_mtime,
-				   &page, 1, 0, 0, true);
+				   truncate_seq, truncate_size,
+				   &inode->i_mtime, &page, 1);
 	if (err < 0) {
 		dout("writepage setting page/mapping error %d %p\n", err, page);
 		SetPageError(page);
@@ -554,27 +563,24 @@ static void writepages_finish(struct ceph_osd_request *req,
 			      struct ceph_msg *msg)
 {
 	struct inode *inode = req->r_inode;
-	struct ceph_osd_reply_head *replyhead;
-	struct ceph_osd_op *op;
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_data *osd_data;
 	unsigned wrote;
 	struct page *page;
+	int num_pages;
 	int i;
 	struct ceph_snap_context *snapc = req->r_snapc;
 	struct address_space *mapping = inode->i_mapping;
-	__s32 rc = -EIO;
-	u64 bytes = 0;
+	int rc = req->r_result;
+	u64 bytes = req->r_ops[0].extent.length;
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	long writeback_stat;
 	unsigned issued = ceph_caps_issued(ci);
 
-	/* parse reply */
-	replyhead = msg->front.iov_base;
-	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
-	op = (void *)(replyhead + 1);
-	rc = le32_to_cpu(replyhead->result);
-	bytes = le64_to_cpu(op->extent.length);
-
+	osd_data = osd_req_op_extent_osd_data(req, 0);
+	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+	num_pages = calc_pages_for((u64)osd_data->alignment,
+					(u64)osd_data->length);
 	if (rc >= 0) {
 		/*
 		 * Assume we wrote the pages we originally sent.  The
@@ -582,7 +588,7 @@ static void writepages_finish(struct ceph_osd_request *req,
 		 * raced with a truncation and was adjusted at the osd,
 		 * so don't believe the reply.
 		 */
-		wrote = req->r_num_pages;
+		wrote = num_pages;
 	} else {
 		wrote = 0;
 		mapping_set_error(mapping, rc);
@@ -591,8 +597,8 @@ static void writepages_finish(struct ceph_osd_request *req,
 	     inode, rc, bytes, wrote);
 
 	/* clean all pages */
-	for (i = 0; i < req->r_num_pages; i++) {
-		page = req->r_pages[i];
+	for (i = 0; i < num_pages; i++) {
+		page = osd_data->pages[i];
 		BUG_ON(!page);
 		WARN_ON(!PageUptodate(page));
 
@@ -621,35 +627,18 @@ static void writepages_finish(struct ceph_osd_request *req,
 		unlock_page(page);
 	}
 	dout("%p wrote+cleaned %d pages\n", inode, wrote);
-	ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
+	ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
 
-	ceph_release_pages(req->r_pages, req->r_num_pages);
-	if (req->r_pages_from_pool)
-		mempool_free(req->r_pages,
+	ceph_release_pages(osd_data->pages, num_pages);
+	if (osd_data->pages_from_pool)
+		mempool_free(osd_data->pages,
 			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
 	else
-		kfree(req->r_pages);
+		kfree(osd_data->pages);
 	ceph_osdc_put_request(req);
 }
 
 /*
- * allocate a page vec, either directly, or if necessary, via a the
- * mempool.  we avoid the mempool if we can because req->r_num_pages
- * may be less than the maximum write size.
- */
-static void alloc_page_vec(struct ceph_fs_client *fsc,
-			   struct ceph_osd_request *req)
-{
-	req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
-			       GFP_NOFS);
-	if (!req->r_pages) {
-		req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
-		req->r_pages_from_pool = 1;
-		WARN_ON(!req->r_pages);
-	}
-}
-
-/*
  * initiate async writeback
  */
 static int ceph_writepages_start(struct address_space *mapping,
@@ -657,7 +646,8 @@ static int ceph_writepages_start(struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_fs_client *fsc;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_vino vino = ceph_vino(inode);
 	pgoff_t index, start, end;
 	int range_whole = 0;
 	int should_loop = 1;
@@ -669,22 +659,22 @@ static int ceph_writepages_start(struct address_space *mapping,
 	unsigned wsize = 1 << inode->i_blkbits;
 	struct ceph_osd_request *req = NULL;
 	int do_sync;
-	u64 snap_size = 0;
+	u64 truncate_size, snap_size;
+	u32 truncate_seq;
 
 	/*
 	 * Include a 'sync' in the OSD request if this is a data
 	 * integrity write (e.g., O_SYNC write or fsync()), or if our
 	 * cap is being revoked.
 	 */
-	do_sync = wbc->sync_mode == WB_SYNC_ALL;
-	if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+	if ((wbc->sync_mode == WB_SYNC_ALL) ||
+		ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
 		do_sync = 1;
 	dout("writepages_start %p dosync=%d (mode=%s)\n",
 	     inode, do_sync,
 	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
 	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-	fsc = ceph_inode_to_client(inode);
 	if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
 		pr_warning("writepage_start %p on forced umount\n", inode);
 		return -EIO; /* we're in a forced umount, don't write! */
@@ -715,6 +705,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 retry:
 	/* find oldest snap context with dirty data */
 	ceph_put_snap_context(snapc);
+	snap_size = 0;
 	snapc = get_oldest_context(inode, &snap_size);
 	if (!snapc) {
 		/* hmm, why does writepages get called when there
@@ -722,8 +713,18 @@ retry:
 		dout(" no snap context with dirty data?\n");
 		goto out;
 	}
+	if (snap_size == 0)
+		snap_size = i_size_read(inode);
 	dout(" oldest snapc is %p seq %lld (%d snaps)\n",
 	     snapc, snapc->seq, snapc->num_snaps);
+
+	spin_lock(&ci->i_ceph_lock);
+	truncate_seq = ci->i_truncate_seq;
+	truncate_size = ci->i_truncate_size;
+	if (!snap_size)
+		snap_size = i_size_read(inode);
+	spin_unlock(&ci->i_ceph_lock);
+
 	if (last_snapc && snapc != last_snapc) {
 		/* if we switched to a newer snapc, restart our scan at the
 		 * start of the original file range. */
@@ -734,15 +735,16 @@ retry:
 	last_snapc = snapc;
 
 	while (!done && index <= end) {
+		int num_ops = do_sync ? 2 : 1;
 		unsigned i;
 		int first;
 		pgoff_t next;
 		int pvec_pages, locked_pages;
+		struct page **pages = NULL;
+		mempool_t *pool = NULL;	/* Becomes non-null if mempool used */
 		struct page *page;
 		int want;
 		u64 offset, len;
-		struct ceph_osd_request_head *reqhead;
-		struct ceph_osd_op *op;
 		long writeback_stat;
 
 		next = 0;
@@ -791,11 +793,8 @@ get_more_pages:
 				dout("waiting on writeback %p\n", page);
 				wait_on_page_writeback(page);
 			}
-			if ((snap_size && page_offset(page) > snap_size) ||
-			    (!snap_size &&
-			     page_offset(page) > i_size_read(inode))) {
-				dout("%p page eof %llu\n", page, snap_size ?
-				     snap_size : i_size_read(inode));
+			if (page_offset(page) >= snap_size) {
+				dout("%p page eof %llu\n", page, snap_size);
 				done = 1;
 				unlock_page(page);
 				break;
@@ -823,34 +822,42 @@ get_more_pages:
 				break;
 			}
 
-			/* ok */
+			/*
+			 * We have something to write.  If this is
+			 * the first locked page this time through,
+			 * allocate an osd request and a page array
+			 * that it will use.
+			 */
 			if (locked_pages == 0) {
+				BUG_ON(pages);
 				/* prepare async write request */
-				offset = (u64) page_offset(page);
+				offset = (u64)page_offset(page);
 				len = wsize;
 				req = ceph_osdc_new_request(&fsc->client->osdc,
-					    &ci->i_layout,
-					    ceph_vino(inode),
-					    offset, &len,
-					    CEPH_OSD_OP_WRITE,
-					    CEPH_OSD_FLAG_WRITE |
-						    CEPH_OSD_FLAG_ONDISK,
-					    snapc, do_sync,
-					    ci->i_truncate_seq,
-					    ci->i_truncate_size,
-					    &inode->i_mtime, true, 1, 0);
-
+							&ci->i_layout, vino,
+							offset, &len, num_ops,
+							CEPH_OSD_OP_WRITE,
+							CEPH_OSD_FLAG_WRITE |
+							CEPH_OSD_FLAG_ONDISK,
+							snapc, truncate_seq,
+							truncate_size, true);
 				if (IS_ERR(req)) {
 					rc = PTR_ERR(req);
 					unlock_page(page);
 					break;
 				}
 
-				max_pages = req->r_num_pages;
-
-				alloc_page_vec(fsc, req);
 				req->r_callback = writepages_finish;
 				req->r_inode = inode;
+
+				max_pages = calc_pages_for(0, (u64)len);
+				pages = kmalloc(max_pages * sizeof (*pages),
+						GFP_NOFS);
+				if (!pages) {
+					pool = fsc->wb_pagevec_pool;
+					pages = mempool_alloc(pool, GFP_NOFS);
+					BUG_ON(!pages);
+				}
 			}
 
 			/* note position of first page in pvec */
@@ -868,7 +875,7 @@ get_more_pages:
 			}
 
 			set_page_writeback(page);
-			req->r_pages[locked_pages] = page;
+			pages[locked_pages] = page;
 			locked_pages++;
 			next = page->index + 1;
 		}
@@ -897,20 +904,27 @@ get_more_pages:
 			pvec.nr -= i-first;
 		}
 
-		/* submit the write */
-		offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
-		len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
+		/* Format the osd request message and submit the write */
+
+		offset = page_offset(pages[0]);
+		len = min(snap_size - offset,
 			  (u64)locked_pages << PAGE_CACHE_SHIFT);
 		dout("writepages got %d pages at %llu~%llu\n",
 		     locked_pages, offset, len);
 
-		/* revise final length, page count */
-		req->r_num_pages = locked_pages;
-		reqhead = req->r_request->front.iov_base;
-		op = (void *)(reqhead + 1);
-		op->extent.length = cpu_to_le64(len);
-		op->payload_len = cpu_to_le32(len);
-		req->r_request->hdr.data_len = cpu_to_le32(len);
+		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
+							!!pool, false);
+
+		pages = NULL;	/* request message now owns the pages array */
+		pool = NULL;
+
+		/* Update the write op length in case we changed it */
+
+		osd_req_op_extent_update(req, 0, len);
+
+		vino = ceph_vino(inode);
+		ceph_osdc_build_request(req, offset, snapc, vino.snap,
+					&inode->i_mtime);
 
 		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
 		BUG_ON(rc);
@@ -977,7 +991,7 @@ static int ceph_update_writeable_page(struct file *file,
 			    loff_t pos, unsigned len,
 			    struct page *page)
 {
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	loff_t page_off = pos & PAGE_CACHE_MASK;
@@ -1086,52 +1100,24 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 			    loff_t pos, unsigned len, unsigned flags,
 			    struct page **pagep, void **fsdata)
 {
-	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_file_info *fi = file->private_data;
+	struct inode *inode = file_inode(file);
 	struct page *page;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	int r, want, got = 0;
-
-	if (fi->fmode & CEPH_FILE_MODE_LAZY)
-		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
-	else
-		want = CEPH_CAP_FILE_BUFFER;
-
-	dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
-	     inode, ceph_vinop(inode), pos, len, inode->i_size);
-	r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len);
-	if (r < 0)
-		return r;
-	dout("write_begin %p %llx.%llx %llu~%u  got cap refs on %s\n",
-	     inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
-	if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) {
-		ceph_put_cap_refs(ci, got);
-		return -EAGAIN;
-	}
+	int r;
 
 	do {
 		/* get a page */
 		page = grab_cache_page_write_begin(mapping, index, 0);
-		if (!page) {
-			r = -ENOMEM;
-			break;
-		}
+		if (!page)
+			return -ENOMEM;
+		*pagep = page;
 
 		dout("write_begin file %p inode %p page %p %d~%d\n", file,
 		     inode, page, (int)pos, (int)len);
 
 		r = ceph_update_writeable_page(file, pos, len, page);
-		if (r)
-			page_cache_release(page);
 	} while (r == -EAGAIN);
 
-	if (r) {
-		ceph_put_cap_refs(ci, got);
-	} else {
-		*pagep = page;
-		*(int *)fsdata = got;
-	}
 	return r;
 }
 
@@ -1144,13 +1130,11 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 			  loff_t pos, unsigned len, unsigned copied,
 			  struct page *page, void *fsdata)
 {
-	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct inode *inode = file_inode(file);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 	int check_cap = 0;
-	int got = (unsigned long)fsdata;
 
 	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
 	     inode, page, (int)pos, (int)copied, (int)len);
@@ -1173,19 +1157,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 	up_read(&mdsc->snap_rwsem);
 	page_cache_release(page);
 
-	if (copied > 0) {
-		int dirty;
-		spin_lock(&ci->i_ceph_lock);
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
-		spin_unlock(&ci->i_ceph_lock);
-		if (dirty)
-			__mark_inode_dirty(inode, dirty);
-	}
-
-	dout("write_end %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
-	     inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
-	ceph_put_cap_refs(ci, got);
-
 	if (check_cap)
 		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
 
@@ -1228,7 +1199,7 @@ const struct address_space_operations ceph_aops = {
  */
 static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct inode *inode = file_inode(vma->vm_file);
 	struct page *page = vmf->page;
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	loff_t off = page_offset(page);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a1d9bb30c1b..25442b40c25 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -147,7 +147,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
 	spin_unlock(&mdsc->caps_list_lock);
 }
 
-int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+void ceph_reserve_caps(struct ceph_mds_client *mdsc,
 		      struct ceph_cap_reservation *ctx, int need)
 {
 	int i;
@@ -155,7 +155,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
 	int have;
 	int alloc = 0;
 	LIST_HEAD(newcaps);
-	int ret = 0;
 
 	dout("reserve caps ctx=%p need=%d\n", ctx, need);
 
@@ -174,14 +173,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
 
 	for (i = have; i < need; i++) {
 		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
-		if (!cap) {
-			ret = -ENOMEM;
-			goto out_alloc_count;
-		}
+		if (!cap)
+			break;
 		list_add(&cap->caps_item, &newcaps);
 		alloc++;
 	}
-	BUG_ON(have + alloc != need);
+	/* we didn't manage to reserve as much as we needed */
+	if (have + alloc != need)
+		pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+			ctx, need, have + alloc);
 
 	spin_lock(&mdsc->caps_list_lock);
 	mdsc->caps_total_count += alloc;
@@ -197,13 +197,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc,
 	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
 	     ctx, mdsc->caps_total_count, mdsc->caps_use_count,
 	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
-	return 0;
-
-out_alloc_count:
-	/* we didn't manage to reserve as much as we needed */
-	pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
-		   ctx, need, have);
-	return ret;
 }
 
 int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
@@ -490,15 +483,17 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
 		ci->i_rdcache_gen++;
 
 	/*
-	 * if we are newly issued FILE_SHARED, clear D_COMPLETE; we
+	 * if we are newly issued FILE_SHARED, mark dir not complete; we
 	 * don't know what happened to this directory while we didn't
 	 * have the cap.
 	 */
 	if ((issued & CEPH_CAP_FILE_SHARED) &&
 	    (had & CEPH_CAP_FILE_SHARED) == 0) {
 		ci->i_shared_gen++;
-		if (S_ISDIR(ci->vfs_inode.i_mode))
-			ceph_dir_clear_complete(&ci->vfs_inode);
+		if (S_ISDIR(ci->vfs_inode.i_mode)) {
+			dout(" marking %p NOT complete\n", &ci->vfs_inode);
+			__ceph_dir_clear_complete(ci);
+		}
 	}
 }
 
@@ -553,6 +548,7 @@ retry:
 		cap->implemented = 0;
 		cap->mds = mds;
 		cap->mds_wanted = 0;
+		cap->mseq = 0;
 
 		cap->ci = ci;
 		__insert_cap_node(ci, cap);
@@ -609,10 +605,20 @@ retry:
 		__cap_delay_requeue(mdsc, ci);
 	}
 
-	if (flags & CEPH_CAP_FLAG_AUTH)
-		ci->i_auth_cap = cap;
-	else if (ci->i_auth_cap == cap)
+	if (flags & CEPH_CAP_FLAG_AUTH) {
+		if (ci->i_auth_cap == NULL ||
+		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+			ci->i_auth_cap = cap;
+	} else if (ci->i_auth_cap == cap) {
 		ci->i_auth_cap = NULL;
+		spin_lock(&mdsc->cap_dirty_lock);
+		if (!list_empty(&ci->i_dirty_item)) {
+			dout(" moving %p to cap_dirty_migrating\n", inode);
+			list_move(&ci->i_dirty_item,
+				  &mdsc->cap_dirty_migrating);
+		}
+		spin_unlock(&mdsc->cap_dirty_lock);
+	}
 
 	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
 	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
@@ -620,7 +626,10 @@ retry:
 	cap->cap_id = cap_id;
 	cap->issued = issued;
 	cap->implemented |= issued;
-	cap->mds_wanted |= wanted;
+	if (mseq > cap->mseq)
+		cap->mds_wanted = wanted;
+	else
+		cap->mds_wanted |= wanted;
 	cap->seq = seq;
 	cap->issue_seq = seq;
 	cap->mseq = mseq;
@@ -681,6 +690,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
 		if (implemented)
 			*implemented |= cap->implemented;
 	}
+	/*
+	 * exclude caps issued by non-auth MDS, but are been revoking
+	 * by the auth MDS. The non-auth MDS should be revoking/exporting
+	 * these caps, but the message is delayed.
+	 */
+	if (ci->i_auth_cap) {
+		cap = ci->i_auth_cap;
+		have &= ~cap->implemented | cap->issued;
+	}
 	return have;
 }
 
@@ -788,22 +806,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 /*
  * Return true if mask caps are currently being revoked by an MDS.
  */
-int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+			       struct ceph_cap *ocap, int mask)
 {
-	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap *cap;
 	struct rb_node *p;
-	int ret = 0;
 
-	spin_lock(&ci->i_ceph_lock);
 	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
 		cap = rb_entry(p, struct ceph_cap, ci_node);
-		if (__cap_is_valid(cap) &&
-		    (cap->implemented & ~cap->issued & mask)) {
-			ret = 1;
-			break;
-		}
+		if (cap != ocap && __cap_is_valid(cap) &&
+		    (cap->implemented & ~cap->issued & mask))
+			return 1;
 	}
+	return 0;
+}
+
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int ret;
+
+	spin_lock(&ci->i_ceph_lock);
+	ret = __ceph_caps_revoking_other(ci, NULL, mask);
 	spin_unlock(&ci->i_ceph_lock);
 	dout("ceph_caps_revoking %p %s = %d\n", inode,
 	     ceph_cap_string(mask), ret);
@@ -930,7 +954,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
 			u64 size, u64 max_size,
 			struct timespec *mtime, struct timespec *atime,
 			u64 time_warp_seq,
-			uid_t uid, gid_t gid, umode_t mode,
+			kuid_t uid, kgid_t gid, umode_t mode,
 			u64 xattr_version,
 			struct ceph_buffer *xattrs_buf,
 			u64 follows)
@@ -974,8 +998,8 @@ static int send_cap_msg(struct ceph_mds_session *session,
 		ceph_encode_timespec(&fc->atime, atime);
 	fc->time_warp_seq = cpu_to_le32(time_warp_seq);
 
-	fc->uid = cpu_to_le32(uid);
-	fc->gid = cpu_to_le32(gid);
+	fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
+	fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid));
 	fc->mode = cpu_to_le32(mode);
 
 	fc->xattr_version = cpu_to_le64(xattr_version);
@@ -989,9 +1013,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
 	return 0;
 }
 
-static void __queue_cap_release(struct ceph_mds_session *session,
-				u64 ino, u64 cap_id, u32 migrate_seq,
-				u32 issue_seq)
+void __queue_cap_release(struct ceph_mds_session *session,
+			 u64 ino, u64 cap_id, u32 migrate_seq,
+			 u32 issue_seq)
 {
 	struct ceph_msg *msg;
 	struct ceph_mds_cap_release *head;
@@ -1081,8 +1105,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	struct timespec mtime, atime;
 	int wake = 0;
 	umode_t mode;
-	uid_t uid;
-	gid_t gid;
+	kuid_t uid;
+	kgid_t gid;
 	struct ceph_mds_session *session;
 	u64 xattr_version = 0;
 	struct ceph_buffer *xattr_blob = NULL;
@@ -1460,7 +1484,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap *cap;
-	int file_wanted, used;
+	int file_wanted, used, cap_used;
 	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
 	int issued, implemented, want, retain, revoking, flushing = 0;
 	int mds = -1;   /* keep track of how far we've gone through i_caps list
@@ -1563,9 +1587,14 @@ retry_locked:
 
 		/* NOTE: no side-effects allowed, until we take s_mutex */
 
+		cap_used = used;
+		if (ci->i_auth_cap && cap != ci->i_auth_cap)
+			cap_used &= ~ci->i_auth_cap->issued;
+
 		revoking = cap->implemented & ~cap->issued;
-		dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
+		dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
 		     cap->mds, cap, ceph_cap_string(cap->issued),
+		     ceph_cap_string(cap_used),
 		     ceph_cap_string(cap->implemented),
 		     ceph_cap_string(revoking));
 
@@ -1593,7 +1622,7 @@ retry_locked:
 		}
 
 		/* completed revocation? going down and there are no caps? */
-		if (revoking && (revoking & used) == 0) {
+		if (revoking && (revoking & cap_used) == 0) {
 			dout("completed revocation of %s\n",
 			     ceph_cap_string(cap->implemented & ~cap->issued));
 			goto ack;
@@ -1670,8 +1699,8 @@ ack:
 		sent++;
 
 		/* __send_cap drops i_ceph_lock */
-		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
-				      retain, flushing, NULL);
+		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
+				      want, retain, flushing, NULL);
 		goto retry; /* retake i_ceph_lock and restart our cap scan. */
 	}
 
@@ -1961,8 +1990,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
 	cap = ci->i_auth_cap;
 	dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
 	     ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+
 	__ceph_flush_snaps(ci, &session, 1);
+
 	if (ci->i_flushing_caps) {
+		spin_lock(&mdsc->cap_dirty_lock);
+		list_move_tail(&ci->i_flushing_item,
+			       &cap->session->s_cap_flushing);
+		spin_unlock(&mdsc->cap_dirty_lock);
+
 		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
 				     __ceph_caps_used(ci),
 				     __ceph_caps_wanted(ci),
@@ -2033,6 +2069,17 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
 		goto out;
 	}
 
+	/* finish pending truncate */
+	while (ci->i_truncate_pending) {
+		spin_unlock(&ci->i_ceph_lock);
+		if (!(need & CEPH_CAP_FILE_WR))
+			mutex_lock(&inode->i_mutex);
+		__ceph_do_pending_vmtruncate(inode);
+		if (!(need & CEPH_CAP_FILE_WR))
+			mutex_unlock(&inode->i_mutex);
+		spin_lock(&ci->i_ceph_lock);
+	}
+
 	if (need & CEPH_CAP_FILE_WR) {
 		if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
 			dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
@@ -2054,12 +2101,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
 	}
 	have = __ceph_caps_issued(ci, &implemented);
 
-	/*
-	 * disallow writes while a truncate is pending
-	 */
-	if (ci->i_truncate_pending)
-		have &= ~CEPH_CAP_FILE_WR;
-
 	if ((have & need) == need) {
 		/*
 		 * Look at (implemented & ~have & not) so that we keep waiting
@@ -2359,10 +2400,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 
 	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
 		inode->i_mode = le32_to_cpu(grant->mode);
-		inode->i_uid = le32_to_cpu(grant->uid);
-		inode->i_gid = le32_to_cpu(grant->gid);
+		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
+		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
 		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
-		     inode->i_uid, inode->i_gid);
+		     from_kuid(&init_user_ns, inode->i_uid),
+		     from_kgid(&init_user_ns, inode->i_gid));
 	}
 
 	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
@@ -2416,7 +2458,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		dout("mds wanted %s -> %s\n",
 		     ceph_cap_string(le32_to_cpu(grant->wanted)),
 		     ceph_cap_string(wanted));
-		grant->wanted = cpu_to_le32(wanted);
+		/* imported cap may not have correct mds_wanted */
+		if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
+			check_caps = 1;
 	}
 
 	cap->seq = seq;
@@ -2450,6 +2494,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	} else {
 		dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
 		     ceph_cap_string(newcaps));
+		/* non-auth MDS is revoking the newly grant caps ? */
+		if (cap == ci->i_auth_cap &&
+		    __ceph_caps_revoking_other(ci, cap, newcaps))
+		    check_caps = 2;
+
 		cap->issued = newcaps;
 		cap->implemented |= newcaps; /* add bits only, to
 					      * avoid stepping on a
@@ -2820,6 +2869,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
 	     (unsigned)seq);
 
+	if (op == CEPH_CAP_OP_IMPORT)
+		ceph_add_cap_releases(mdsc, session);
+
 	/* lookup ino */
 	inode = ceph_find_inode(sb, vino);
 	ci = ceph_inode(inode);
@@ -3016,21 +3068,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 		     (cap->issued & unless) == 0)) {
 			if ((cap->issued & drop) &&
 			    (cap->issued & unless) == 0) {
-				dout("encode_inode_release %p cap %p %s -> "
-				     "%s\n", inode, cap,
+				int wanted = __ceph_caps_wanted(ci);
+				if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
+					wanted |= cap->mds_wanted;
+				dout("encode_inode_release %p cap %p "
+				     "%s -> %s, wanted %s -> %s\n", inode, cap,
 				     ceph_cap_string(cap->issued),
-				     ceph_cap_string(cap->issued & ~drop));
+				     ceph_cap_string(cap->issued & ~drop),
+				     ceph_cap_string(cap->mds_wanted),
+				     ceph_cap_string(wanted));
+
 				cap->issued &= ~drop;
 				cap->implemented &= ~drop;
-				if (ci->i_ceph_flags & CEPH_I_NODELAY) {
-					int wanted = __ceph_caps_wanted(ci);
-					dout("  wanted %s -> %s (act %s)\n",
-					     ceph_cap_string(cap->mds_wanted),
-					     ceph_cap_string(cap->mds_wanted &
-							     ~wanted),
-					     ceph_cap_string(wanted));
-					cap->mds_wanted &= wanted;
-				}
+				cap->mds_wanted = wanted;
 			} else {
 				dout("encode_inode_release %p cap %p %s"
 				     " (force)\n", inode, cap,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 8c1aabe93b6..a40ceda47a3 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -107,15 +107,14 @@ static unsigned fpos_off(loff_t p)
  * falling back to a "normal" sync readdir if any dentries in the dir
  * are dropped.
  *
- * D_COMPLETE tells indicates we have all dentries in the dir.  It is
+ * Complete dir indicates that we have all dentries in the dir.  It is
  * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
  * the MDS if/when the directory is modified).
  */
-static int __dcache_readdir(struct file *filp,
-			    void *dirent, filldir_t filldir)
+static int __dcache_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct ceph_file_info *fi = filp->private_data;
-	struct dentry *parent = filp->f_dentry;
+	struct ceph_file_info *fi = file->private_data;
+	struct dentry *parent = file->f_dentry;
 	struct inode *dir = parent->d_inode;
 	struct list_head *p;
 	struct dentry *dentry, *last;
@@ -126,14 +125,14 @@ static int __dcache_readdir(struct file *filp,
 	last = fi->dentry;
 	fi->dentry = NULL;
 
-	dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
+	dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
 	     last);
 
 	spin_lock(&parent->d_lock);
 
 	/* start at beginning? */
-	if (filp->f_pos == 2 || last == NULL ||
-	    filp->f_pos < ceph_dentry(last)->offset) {
+	if (ctx->pos == 2 || last == NULL ||
+	    ctx->pos < ceph_dentry(last)->offset) {
 		if (list_empty(&parent->d_subdirs))
 			goto out_unlock;
 		p = parent->d_subdirs.prev;
@@ -157,11 +156,11 @@ more:
 		if (!d_unhashed(dentry) && dentry->d_inode &&
 		    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
 		    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
-		    filp->f_pos <= di->offset)
+		    ctx->pos <= di->offset)
 			break;
 		dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
 		     dentry->d_name.len, dentry->d_name.name, di->offset,
-		     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
+		     ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
 		     !dentry->d_inode ? " null" : "");
 		spin_unlock(&dentry->d_lock);
 		p = p->prev;
@@ -173,33 +172,31 @@ more:
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&parent->d_lock);
 
-	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
+	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
 	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-	filp->f_pos = di->offset;
-	err = filldir(dirent, dentry->d_name.name,
-		      dentry->d_name.len, di->offset,
+	ctx->pos = di->offset;
+	if (!dir_emit(ctx, dentry->d_name.name,
+		      dentry->d_name.len,
 		      ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
-		      dentry->d_inode->i_mode >> 12);
-
-	if (last) {
-		if (err < 0) {
+		      dentry->d_inode->i_mode >> 12)) {
+		if (last) {
 			/* remember our position */
 			fi->dentry = last;
 			fi->next_offset = di->offset;
-		} else {
-			dput(last);
 		}
+		dput(dentry);
+		return 0;
 	}
-	last = dentry;
 
-	if (err < 0)
-		goto out;
+	if (last)
+		dput(last);
+	last = dentry;
 
-	filp->f_pos++;
+	ctx->pos++;
 
 	/* make sure a dentry wasn't dropped while we didn't have parent lock */
-	if (!ceph_dir_test_complete(dir)) {
-		dout(" lost D_COMPLETE on %p; falling back to mds\n", dir);
+	if (!ceph_dir_is_complete(dir)) {
+		dout(" lost dir complete on %p; falling back to mds\n", dir);
 		err = -EAGAIN;
 		goto out;
 	}
@@ -235,59 +232,59 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
 	return 0;
 }
 
-static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ceph_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct ceph_file_info *fi = filp->private_data;
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct ceph_file_info *fi = file->private_data;
+	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
-	unsigned frag = fpos_frag(filp->f_pos);
-	int off = fpos_off(filp->f_pos);
+	unsigned frag = fpos_frag(ctx->pos);
+	int off = fpos_off(ctx->pos);
 	int err;
 	u32 ftype;
 	struct ceph_mds_reply_info_parsed *rinfo;
 	const int max_entries = fsc->mount_options->max_readdir;
 	const int max_bytes = fsc->mount_options->max_readdir_bytes;
 
-	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
+	dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
 	if (fi->flags & CEPH_F_ATEND)
 		return 0;
 
 	/* always start with . and .. */
-	if (filp->f_pos == 0) {
+	if (ctx->pos == 0) {
 		/* note dir version at start of readdir so we can tell
 		 * if any dentries get dropped */
-		fi->dir_release_count = ci->i_release_count;
+		fi->dir_release_count = atomic_read(&ci->i_release_count);
 
 		dout("readdir off 0 -> '.'\n");
-		if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
+		if (!dir_emit(ctx, ".", 1, 
 			    ceph_translate_ino(inode->i_sb, inode->i_ino),
-			    inode->i_mode >> 12) < 0)
+			    inode->i_mode >> 12))
 			return 0;
-		filp->f_pos = 1;
+		ctx->pos = 1;
 		off = 1;
 	}
-	if (filp->f_pos == 1) {
-		ino_t ino = parent_ino(filp->f_dentry);
+	if (ctx->pos == 1) {
+		ino_t ino = parent_ino(file->f_dentry);
 		dout("readdir off 1 -> '..'\n");
-		if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
+		if (!dir_emit(ctx, "..", 2,
 			    ceph_translate_ino(inode->i_sb, ino),
-			    inode->i_mode >> 12) < 0)
+			    inode->i_mode >> 12))
 			return 0;
-		filp->f_pos = 2;
+		ctx->pos = 2;
 		off = 2;
 	}
 
 	/* can we use the dcache? */
 	spin_lock(&ci->i_ceph_lock);
-	if ((filp->f_pos == 2 || fi->dentry) &&
+	if ((ctx->pos == 2 || fi->dentry) &&
 	    !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
 	    ceph_snap(inode) != CEPH_SNAPDIR &&
-	    ceph_dir_test_complete(inode) &&
+	    __ceph_dir_is_complete(ci) &&
 	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
 		spin_unlock(&ci->i_ceph_lock);
-		err = __dcache_readdir(filp, dirent, filldir);
+		err = __dcache_readdir(file, ctx);
 		if (err != -EAGAIN)
 			return err;
 	} else {
@@ -327,7 +324,7 @@ more:
 			return PTR_ERR(req);
 		req->r_inode = inode;
 		ihold(inode);
-		req->r_dentry = dget(filp->f_dentry);
+		req->r_dentry = dget(file->f_dentry);
 		/* hints to request -> mds selection code */
 		req->r_direct_mode = USE_AUTH_MDS;
 		req->r_direct_hash = ceph_frag_value(frag);
@@ -350,7 +347,8 @@ more:
 
 		if (!req->r_did_prepopulate) {
 			dout("readdir !did_prepopulate");
-			fi->dir_release_count--;    /* preclude D_COMPLETE */
+			/* preclude from marking dir complete */
+			fi->dir_release_count--;
 		}
 
 		/* note next offset and last dentry name */
@@ -378,15 +376,16 @@ more:
 	rinfo = &fi->last_readdir->r_reply_info;
 	dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
 	     rinfo->dir_nr, off, fi->offset);
+
+	ctx->pos = ceph_make_fpos(frag, off);
 	while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
-		u64 pos = ceph_make_fpos(frag, off);
 		struct ceph_mds_reply_inode *in =
 			rinfo->dir_in[off - fi->offset].in;
 		struct ceph_vino vino;
 		ino_t ino;
 
 		dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
-		     off, off - fi->offset, rinfo->dir_nr, pos,
+		     off, off - fi->offset, rinfo->dir_nr, ctx->pos,
 		     rinfo->dir_dname_len[off - fi->offset],
 		     rinfo->dir_dname[off - fi->offset], in);
 		BUG_ON(!in);
@@ -394,16 +393,15 @@ more:
 		vino.ino = le64_to_cpu(in->ino);
 		vino.snap = le64_to_cpu(in->snapid);
 		ino = ceph_vino_to_ino(vino);
-		if (filldir(dirent,
+		if (!dir_emit(ctx,
 			    rinfo->dir_dname[off - fi->offset],
 			    rinfo->dir_dname_len[off - fi->offset],
-			    pos,
-			    ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
+			    ceph_translate_ino(inode->i_sb, ino), ftype)) {
 			dout("filldir stopping us...\n");
 			return 0;
 		}
 		off++;
-		filp->f_pos = pos + 1;
+		ctx->pos++;
 	}
 
 	if (fi->last_name) {
@@ -416,7 +414,7 @@ more:
 	if (!ceph_frag_is_rightmost(frag)) {
 		frag = ceph_frag_next(frag);
 		off = 0;
-		filp->f_pos = ceph_make_fpos(frag, off);
+		ctx->pos = ceph_make_fpos(frag, off);
 		dout("readdir next frag is %x\n", frag);
 		goto more;
 	}
@@ -428,13 +426,14 @@ more:
 	 * the complete dir contents in our cache.
 	 */
 	spin_lock(&ci->i_ceph_lock);
-	if (ci->i_release_count == fi->dir_release_count) {
-		ceph_dir_set_complete(inode);
-		ci->i_max_offset = filp->f_pos;
+	if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
+		dout(" marking %p complete\n", inode);
+		__ceph_dir_set_complete(ci, fi->dir_release_count);
+		ci->i_max_offset = ctx->pos;
 	}
 	spin_unlock(&ci->i_ceph_lock);
 
-	dout("readdir %p filp %p done.\n", inode, filp);
+	dout("readdir %p file %p done.\n", inode, file);
 	return 0;
 }
 
@@ -604,7 +603,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 			    fsc->mount_options->snapdir_name,
 			    dentry->d_name.len) &&
 		    !is_root_ceph_dentry(dir, dentry) &&
-		    ceph_dir_test_complete(dir) &&
+		    __ceph_dir_is_complete(ci) &&
 		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
 			spin_unlock(&ci->i_ceph_lock);
 			dout(" dir %p complete, -ENOENT\n", dir);
@@ -1065,44 +1064,6 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
 }
 
 /*
- * Set/clear/test dir complete flag on the dir's dentry.
- */
-void ceph_dir_set_complete(struct inode *inode)
-{
-	struct dentry *dentry = d_find_any_alias(inode);
-	
-	if (dentry && ceph_dentry(dentry) &&
-	    ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) {
-		dout(" marking %p (%p) complete\n", inode, dentry);
-		set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
-	}
-	dput(dentry);
-}
-
-void ceph_dir_clear_complete(struct inode *inode)
-{
-	struct dentry *dentry = d_find_any_alias(inode);
-
-	if (dentry && ceph_dentry(dentry)) {
-		dout(" marking %p (%p) complete\n", inode, dentry);
-		set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
-	}
-	dput(dentry);
-}
-
-bool ceph_dir_test_complete(struct inode *inode)
-{
-	struct dentry *dentry = d_find_any_alias(inode);
-
-	if (dentry && ceph_dentry(dentry)) {
-		dout(" marking %p (%p) NOT complete\n", inode, dentry);
-		clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
-	}
-	dput(dentry);
-	return false;
-}
-
-/*
  * When the VFS prunes a dentry from the cache, we need to clear the
  * complete flag on the parent directory.
  *
@@ -1110,15 +1071,13 @@ bool ceph_dir_test_complete(struct inode *inode)
  */
 static void ceph_d_prune(struct dentry *dentry)
 {
-	struct ceph_dentry_info *di;
-
 	dout("ceph_d_prune %p\n", dentry);
 
 	/* do we have a valid parent? */
 	if (IS_ROOT(dentry))
 		return;
 
-	/* if we are not hashed, we don't affect D_COMPLETE */
+	/* if we are not hashed, we don't affect dir's completeness */
 	if (d_unhashed(dentry))
 		return;
 
@@ -1126,8 +1085,7 @@ static void ceph_d_prune(struct dentry *dentry)
 	 * we hold d_lock, so d_parent is stable, and d_fsdata is never
 	 * cleared until d_release
 	 */
-	di = ceph_dentry(dentry->d_parent);
-	clear_bit(CEPH_D_COMPLETE, &di->flags);
+	ceph_dir_clear_complete(dentry->d_parent->d_inode);
 }
 
 /*
@@ -1138,7 +1096,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 			     loff_t *ppos)
 {
 	struct ceph_file_info *cf = file->private_data;
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int left;
 	const int bufsize = 1024;
@@ -1188,7 +1146,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
 			  int datasync)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct list_head *head = &ci->i_unsafe_dirops;
 	struct ceph_mds_request *req;
@@ -1307,7 +1265,7 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
 
 const struct file_operations ceph_dir_fops = {
 	.read = ceph_read_dir,
-	.readdir = ceph_readdir,
+	.iterate = ceph_readdir,
 	.llseek = ceph_dir_llseek,
 	.open = ceph_open,
 	.release = ceph_release,
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index ca3ab3f9ca7..16796be53ca 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -81,7 +81,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
 		if (parent_inode) {
 			/* nfsd wants connectable */
 			*max_len = connected_handle_length;
-			type = 255;
+			type = FILEID_INVALID;
 		} else {
 			dout("encode_fh %p\n", dentry);
 			fh->ino = ceph_ino(inode);
@@ -90,7 +90,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
 		}
 	} else {
 		*max_len = handle_length;
-		type = 255;
+		type = FILEID_INVALID;
 	}
 	if (dentry)
 		dput(dentry);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e51558fca3a..2ddf061c1c4 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -7,6 +7,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/writeback.h>
+#include <linux/aio.h>
 
 #include "super.h"
 #include "mds_client.h"
@@ -243,6 +244,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 	err = ceph_mdsc_do_request(mdsc,
 				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
 				   req);
+	if (err)
+		goto out_err;
+
 	err = ceph_handle_snapdir(req, dentry, err);
 	if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
 		err = ceph_handle_notrace_create(dir, dentry);
@@ -263,6 +267,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 		err = finish_no_open(file, dn);
 	} else {
 		dout("atomic_open finish_open on dn %p\n", dn);
+		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
+			*opened |= FILE_CREATED;
+		}
 		err = finish_open(file, dentry, ceph_open, opened);
 	}
 
@@ -393,7 +400,7 @@ more:
 static ssize_t ceph_sync_read(struct file *file, char __user *data,
 			      unsigned len, loff_t *poff, int *checkeof)
 {
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct page **pages;
 	u64 off = *poff;
 	int num_pages, ret;
@@ -440,19 +447,35 @@ done:
 }
 
 /*
- * Write commit callback, called if we requested both an ACK and
- * ONDISK commit reply from the OSD.
+ * Write commit request unsafe callback, called to tell us when a
+ * request is unsafe (that is, in flight--has been handed to the
+ * messenger to send to its target osd).  It is called again when
+ * we've received a response message indicating the request is
+ * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request
+ * is completed early (and unsuccessfully) due to a timeout or
+ * interrupt.
+ *
+ * This is used if we requested both an ACK and ONDISK commit reply
+ * from the OSD.
  */
-static void sync_write_commit(struct ceph_osd_request *req,
-			      struct ceph_msg *msg)
+static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
 {
 	struct ceph_inode_info *ci = ceph_inode(req->r_inode);
 
-	dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
-	spin_lock(&ci->i_unsafe_lock);
-	list_del_init(&req->r_unsafe_item);
-	spin_unlock(&ci->i_unsafe_lock);
-	ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
+	dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid,
+		unsafe ? "un" : "");
+	if (unsafe) {
+		ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
+		spin_lock(&ci->i_unsafe_lock);
+		list_add_tail(&req->r_unsafe_item,
+			      &ci->i_unsafe_writes);
+		spin_unlock(&ci->i_unsafe_lock);
+	} else {
+		spin_lock(&ci->i_unsafe_lock);
+		list_del_init(&req->r_unsafe_item);
+		spin_unlock(&ci->i_unsafe_lock);
+		ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
+	}
 }
 
 /*
@@ -464,36 +487,33 @@ static void sync_write_commit(struct ceph_osd_request *req,
  * objects, rollback on failure, etc.)
  */
 static ssize_t ceph_sync_write(struct file *file, const char __user *data,
-			       size_t left, loff_t *offset)
+			       size_t left, loff_t pos, loff_t *ppos)
 {
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_snap_context *snapc;
+	struct ceph_vino vino;
 	struct ceph_osd_request *req;
+	int num_ops = 1;
 	struct page **pages;
 	int num_pages;
-	long long unsigned pos;
 	u64 len;
 	int written = 0;
 	int flags;
-	int do_sync = 0;
 	int check_caps = 0;
 	int page_align, io_align;
 	unsigned long buf_align;
 	int ret;
 	struct timespec mtime = CURRENT_TIME;
+	bool own_pages = false;
 
-	if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
+	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
 		return -EROFS;
 
-	dout("sync_write on file %p %lld~%u %s\n", file, *offset,
+	dout("sync_write on file %p %lld~%u %s\n", file, pos,
 	     (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
 
-	if (file->f_flags & O_APPEND)
-		pos = i_size_read(inode);
-	else
-		pos = *offset;
-
 	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
 	if (ret < 0)
 		return ret;
@@ -510,7 +530,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 	if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
 		flags |= CEPH_OSD_FLAG_ACK;
 	else
-		do_sync = 1;
+		num_ops++;	/* Also include a 'startsync' command. */
 
 	/*
 	 * we may need to do multiple writes here if we span an object
@@ -520,25 +540,20 @@ more:
 	io_align = pos & ~PAGE_MASK;
 	buf_align = (unsigned long)data & ~PAGE_MASK;
 	len = left;
-	if (file->f_flags & O_DIRECT) {
-		/* write from beginning of first page, regardless of
-		   io alignment */
-		page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
-		num_pages = calc_pages_for((unsigned long)data, len);
-	} else {
-		page_align = pos & ~PAGE_MASK;
-		num_pages = calc_pages_for(pos, len);
-	}
+
+	snapc = ci->i_snap_realm->cached_context;
+	vino = ceph_vino(inode);
 	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
-				    ceph_vino(inode), pos, &len,
-				    CEPH_OSD_OP_WRITE, flags,
-				    ci->i_snap_realm->cached_context,
-				    do_sync,
+				    vino, pos, &len, num_ops,
+				    CEPH_OSD_OP_WRITE, flags, snapc,
 				    ci->i_truncate_seq, ci->i_truncate_size,
-				    &mtime, false, 2, page_align);
+				    false);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
+	/* write from beginning of first page, regardless of io alignment */
+	page_align = file->f_flags & O_DIRECT ? buf_align : io_align;
+	num_pages = calc_pages_for(page_align, len);
 	if (file->f_flags & O_DIRECT) {
 		pages = ceph_get_direct_page_vector(data, num_pages, false);
 		if (IS_ERR(pages)) {
@@ -566,36 +581,20 @@ more:
 
 		if ((file->f_flags & O_SYNC) == 0) {
 			/* get a second commit callback */
-			req->r_safe_callback = sync_write_commit;
-			req->r_own_pages = 1;
+			req->r_unsafe_callback = ceph_sync_write_unsafe;
+			req->r_inode = inode;
+			own_pages = true;
 		}
 	}
-	req->r_pages = pages;
-	req->r_num_pages = num_pages;
-	req->r_inode = inode;
+	osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
+					false, own_pages);
+
+	/* BUG_ON(vino.snap != CEPH_NOSNAP); */
+	ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
 
 	ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
-	if (!ret) {
-		if (req->r_safe_callback) {
-			/*
-			 * Add to inode unsafe list only after we
-			 * start_request so that a tid has been assigned.
-			 */
-			spin_lock(&ci->i_unsafe_lock);
-			list_add_tail(&req->r_unsafe_item,
-				      &ci->i_unsafe_writes);
-			spin_unlock(&ci->i_unsafe_lock);
-			ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
-		}
-		
+	if (!ret)
 		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
-		if (ret < 0 && req->r_safe_callback) {
-			spin_lock(&ci->i_unsafe_lock);
-			list_del_init(&req->r_unsafe_item);
-			spin_unlock(&ci->i_unsafe_lock);
-			ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
-		}
-	}
 
 	if (file->f_flags & O_DIRECT)
 		ceph_put_page_vector(pages, num_pages, false);
@@ -608,12 +607,12 @@ out:
 		pos += len;
 		written += len;
 		left -= len;
-		data += written;
+		data += len;
 		if (left)
 			goto more;
 
 		ret = written;
-		*offset = pos;
+		*ppos = pos;
 		if (pos > i_size_read(inode))
 			check_caps = ceph_inode_set_size(inode, pos);
 		if (check_caps)
@@ -637,7 +636,7 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	struct ceph_file_info *fi = filp->private_data;
 	loff_t *ppos = &iocb->ki_pos;
 	size_t len = iov->iov_len;
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	void __user *base = iov->iov_base;
 	ssize_t ret;
@@ -647,7 +646,6 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
 	     inode, ceph_vinop(inode), pos, (unsigned)len, inode);
 again:
-	__ceph_do_pending_vmtruncate(inode);
 	if (fi->fmode & CEPH_FILE_MODE_LAZY)
 		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
 	else
@@ -707,59 +705,78 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *file = iocb->ki_filp;
 	struct ceph_file_info *fi = file->private_data;
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_osd_client *osdc =
 		&ceph_sb_to_client(inode->i_sb)->client->osdc;
-	loff_t endoff = pos + iov->iov_len;
-	int got = 0;
-	int ret, err, written;
+	ssize_t count, written = 0;
+	int err, want, got;
+	bool hold_mutex;
 
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
 
-retry_snap:
-	written = 0;
-	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
-		return -ENOSPC;
-	__ceph_do_pending_vmtruncate(inode);
+	mutex_lock(&inode->i_mutex);
+	hold_mutex = true;
 
-	/*
-	 * try to do a buffered write.  if we don't have sufficient
-	 * caps, we'll get -EAGAIN from generic_file_aio_write, or a
-	 * short write if we only get caps for some pages.
-	 */
-	if (!(iocb->ki_filp->f_flags & O_DIRECT) &&
-	    !(inode->i_sb->s_flags & MS_SYNCHRONOUS) &&
-	    !(fi->flags & CEPH_F_SYNC)) {
-		ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-		if (ret >= 0)
-			written = ret;
-
-		if ((ret >= 0 || ret == -EIOCBQUEUED) &&
-		    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
-		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
-			err = vfs_fsync_range(file, pos, pos + written - 1, 1);
-			if (err < 0)
-				ret = err;
-		}
-		if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff)
-			goto out;
+	err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
+	if (err)
+		goto out;
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = file->f_mapping->backing_dev_info;
+
+	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+	if (err)
+		goto out;
+
+	if (count == 0)
+		goto out;
+
+	err = file_remove_suid(file);
+	if (err)
+		goto out;
+
+	err = file_update_time(file);
+	if (err)
+		goto out;
+
+retry_snap:
+	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) {
+		err = -ENOSPC;
+		goto out;
 	}
 
-	dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
-	     inode, ceph_vinop(inode), pos + written,
-	     (unsigned)iov->iov_len - written, inode->i_size);
-	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff);
-	if (ret < 0)
+	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
+	     inode, ceph_vinop(inode), pos, count, inode->i_size);
+	if (fi->fmode & CEPH_FILE_MODE_LAZY)
+		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+	else
+		want = CEPH_CAP_FILE_BUFFER;
+	got = 0;
+	err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count);
+	if (err < 0)
 		goto out;
 
-	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n",
-	     inode, ceph_vinop(inode), pos + written,
-	     (unsigned)iov->iov_len - written, ceph_cap_string(got));
-	ret = ceph_sync_write(file, iov->iov_base + written,
-			      iov->iov_len - written, &iocb->ki_pos);
-	if (ret >= 0) {
+	dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
+	     inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
+
+	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
+	    (iocb->ki_filp->f_flags & O_DIRECT) ||
+	    (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
+	    (fi->flags & CEPH_F_SYNC)) {
+		mutex_unlock(&inode->i_mutex);
+		written = ceph_sync_write(file, iov->iov_base, count,
+					  pos, &iocb->ki_pos);
+	} else {
+		written = generic_file_buffered_write(iocb, iov, nr_segs,
+						      pos, &iocb->ki_pos,
+						      count, 0);
+		mutex_unlock(&inode->i_mutex);
+	}
+	hold_mutex = false;
+
+	if (written >= 0) {
 		int dirty;
 		spin_lock(&ci->i_ceph_lock);
 		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
@@ -767,18 +784,33 @@ retry_snap:
 		if (dirty)
 			__mark_inode_dirty(inode, dirty);
 	}
+
 	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
-	     inode, ceph_vinop(inode), pos + written,
-	     (unsigned)iov->iov_len - written, ceph_cap_string(got));
+	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+	     ceph_cap_string(got));
 	ceph_put_cap_refs(ci, got);
-out:
-	if (ret == -EOLDSNAPC) {
+
+	if (written >= 0 &&
+	    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) ||
+	     ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
+		err = vfs_fsync_range(file, pos, pos + written - 1, 1);
+		if (err < 0)
+			written = err;
+	}
+
+	if (written == -EOLDSNAPC) {
 		dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
 		     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
+		mutex_lock(&inode->i_mutex);
+		hold_mutex = true;
 		goto retry_snap;
 	}
+out:
+	if (hold_mutex)
+		mutex_unlock(&inode->i_mutex);
+	current->backing_dev_info = NULL;
 
-	return ret;
+	return written ? written : err;
 }
 
 /*
@@ -832,16 +864,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
 		break;
 	}
 
-	if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
-		offset = -EINVAL;
-		goto out;
-	}
-
-	/* Special lock needed here? */
-	if (offset != file->f_pos) {
-		file->f_pos = offset;
-		file->f_version = 0;
-	}
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out:
 	mutex_unlock(&inode->i_mutex);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 2971eaa65cd..f3a2abf28a7 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -302,7 +302,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_version = 0;
 	ci->i_time_warp_seq = 0;
 	ci->i_ceph_flags = 0;
-	ci->i_release_count = 0;
+	atomic_set(&ci->i_release_count, 1);
+	atomic_set(&ci->i_complete_count, 0);
 	ci->i_symlink = NULL;
 
 	memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
@@ -561,7 +562,6 @@ static int fill_inode(struct inode *inode,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int i;
 	int issued = 0, implemented;
-	int updating_inode = 0;
 	struct timespec mtime, atime, ctime;
 	u32 nsplits;
 	struct ceph_buffer *xattr_blob = NULL;
@@ -601,7 +601,6 @@ static int fill_inode(struct inode *inode,
 	    (ci->i_version & ~1) >= le64_to_cpu(info->version))
 		goto no_change;
 	
-	updating_inode = 1;
 	issued = __ceph_caps_issued(ci, &implemented);
 	issued |= implemented | __ceph_caps_dirty(ci);
 
@@ -612,10 +611,11 @@ static int fill_inode(struct inode *inode,
 
 	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
 		inode->i_mode = le32_to_cpu(info->mode);
-		inode->i_uid = le32_to_cpu(info->uid);
-		inode->i_gid = le32_to_cpu(info->gid);
+		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
+		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
 		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
-		     inode->i_uid, inode->i_gid);
+		     from_kuid(&init_user_ns, inode->i_uid),
+		     from_kgid(&init_user_ns, inode->i_gid));
 	}
 
 	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
@@ -716,6 +716,17 @@ static int fill_inode(struct inode *inode,
 		       ceph_vinop(inode), inode->i_mode);
 	}
 
+	/* set dir completion flag? */
+	if (S_ISDIR(inode->i_mode) &&
+	    ci->i_files == 0 && ci->i_subdirs == 0 &&
+	    ceph_snap(inode) == CEPH_NOSNAP &&
+	    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+	    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
+	    !__ceph_dir_is_complete(ci)) {
+		dout(" marking %p complete (empty)\n", inode);
+		__ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
+		ci->i_max_offset = 2;
+	}
 no_change:
 	spin_unlock(&ci->i_ceph_lock);
 
@@ -766,19 +777,6 @@ no_change:
 		__ceph_get_fmode(ci, cap_fmode);
 	}
 
-	/* set dir completion flag? */
-	if (S_ISDIR(inode->i_mode) &&
-	    updating_inode &&                 /* didn't jump to no_change */
-	    ci->i_files == 0 && ci->i_subdirs == 0 &&
-	    ceph_snap(inode) == CEPH_NOSNAP &&
-	    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
-	    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
-	    !ceph_dir_test_complete(inode)) {
-		dout(" marking %p complete (empty)\n", inode);
-		ceph_dir_set_complete(inode);
-		ci->i_max_offset = 2;
-	}
-
 	/* update delegation info? */
 	if (dirinfo)
 		ceph_fill_dirfrag(inode, dirinfo);
@@ -860,7 +858,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
 	di = ceph_dentry(dn);
 
 	spin_lock(&ci->i_ceph_lock);
-	if (!ceph_dir_test_complete(inode)) {
+	if (!__ceph_dir_is_complete(ci)) {
 		spin_unlock(&ci->i_ceph_lock);
 		return;
 	}
@@ -905,8 +903,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
 	} else if (realdn) {
 		dout("dn %p (%d) spliced with %p (%d) "
 		     "inode %p ino %llx.%llx\n",
-		     dn, dn->d_count,
-		     realdn, realdn->d_count,
+		     dn, d_count(dn),
+		     realdn, d_count(realdn),
 		     realdn->d_inode, ceph_vinop(realdn->d_inode));
 		dput(dn);
 		dn = realdn;
@@ -1064,8 +1062,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			/*
 			 * d_move() puts the renamed dentry at the end of
 			 * d_subdirs.  We need to assign it an appropriate
-			 * directory offset so we can behave when holding
-			 * D_COMPLETE.
+			 * directory offset so we can behave when dir is
+			 * complete.
 			 */
 			ceph_set_dentry_offset(req->r_old_dentry);
 			dout("dn %p gets new offset %lld\n", req->r_old_dentry, 
@@ -1130,8 +1128,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 					    req->r_request_started);
 		dout(" final dn %p\n", dn);
 		i++;
-	} else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
-		   req->r_op == CEPH_MDS_OP_MKSNAP) {
+	} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+		   req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) {
 		struct dentry *dn = req->r_dentry;
 
 		/* fill out a snapdir LOOKUPSNAP dentry */
@@ -1195,6 +1193,39 @@ done:
 /*
  * Prepopulate our cache with readdir results, leases, etc.
  */
+static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
+					   struct ceph_mds_session *session)
+{
+	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+	int i, err = 0;
+
+	for (i = 0; i < rinfo->dir_nr; i++) {
+		struct ceph_vino vino;
+		struct inode *in;
+		int rc;
+
+		vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
+		vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+
+		in = ceph_get_inode(req->r_dentry->d_sb, vino);
+		if (IS_ERR(in)) {
+			err = PTR_ERR(in);
+			dout("new_inode badness got %d\n", err);
+			continue;
+		}
+		rc = fill_inode(in, &rinfo->dir_in[i], NULL, session,
+				req->r_request_started, -1,
+				&req->r_caps_reservation);
+		if (rc < 0) {
+			pr_err("fill_inode badness on %p got %d\n", in, rc);
+			err = rc;
+			continue;
+		}
+	}
+
+	return err;
+}
+
 int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 			     struct ceph_mds_session *session)
 {
@@ -1209,6 +1240,9 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 	u64 frag = le32_to_cpu(rhead->args.readdir.frag);
 	struct ceph_dentry_info *di;
 
+	if (req->r_aborted)
+		return readdir_prepopulate_inodes_only(req, session);
+
 	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
 		snapdir = ceph_get_snapdir(parent->d_inode);
 		parent = d_find_alias(snapdir);
@@ -1420,7 +1454,7 @@ out:
 
 
 /*
- * called by trunc_wq; take i_mutex ourselves
+ * called by trunc_wq;
  *
  * We also truncate in a separate thread as well.
  */
@@ -1457,8 +1491,6 @@ void ceph_queue_vmtruncate(struct inode *inode)
 }
 
 /*
- * called with i_mutex held.
- *
  * Make sure any pending truncation is applied before doing anything
  * that may depend on it.
  */
@@ -1526,6 +1558,12 @@ static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
 static const struct inode_operations ceph_symlink_iops = {
 	.readlink = generic_readlink,
 	.follow_link = ceph_sym_follow_link,
+	.setattr = ceph_setattr,
+	.getattr = ceph_getattr,
+	.setxattr = ceph_setxattr,
+	.getxattr = ceph_getxattr,
+	.listxattr = ceph_listxattr,
+	.removexattr = ceph_removexattr,
 };
 
 /*
@@ -1565,26 +1603,30 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 
 	if (ia_valid & ATTR_UID) {
 		dout("setattr %p uid %d -> %d\n", inode,
-		     inode->i_uid, attr->ia_uid);
+		     from_kuid(&init_user_ns, inode->i_uid),
+		     from_kuid(&init_user_ns, attr->ia_uid));
 		if (issued & CEPH_CAP_AUTH_EXCL) {
 			inode->i_uid = attr->ia_uid;
 			dirtied |= CEPH_CAP_AUTH_EXCL;
 		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
-			   attr->ia_uid != inode->i_uid) {
-			req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
+			   !uid_eq(attr->ia_uid, inode->i_uid)) {
+			req->r_args.setattr.uid = cpu_to_le32(
+				from_kuid(&init_user_ns, attr->ia_uid));
 			mask |= CEPH_SETATTR_UID;
 			release |= CEPH_CAP_AUTH_SHARED;
 		}
 	}
 	if (ia_valid & ATTR_GID) {
 		dout("setattr %p gid %d -> %d\n", inode,
-		     inode->i_gid, attr->ia_gid);
+		     from_kgid(&init_user_ns, inode->i_gid),
+		     from_kgid(&init_user_ns, attr->ia_gid));
 		if (issued & CEPH_CAP_AUTH_EXCL) {
 			inode->i_gid = attr->ia_gid;
 			dirtied |= CEPH_CAP_AUTH_EXCL;
 		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
-			   attr->ia_gid != inode->i_gid) {
-			req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
+			   !gid_eq(attr->ia_gid, inode->i_gid)) {
+			req->r_args.setattr.gid = cpu_to_le32(
+				from_kgid(&init_user_ns, attr->ia_gid));
 			mask |= CEPH_SETATTR_GID;
 			release |= CEPH_CAP_AUTH_SHARED;
 		}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 36549a46e31..e0b4ef31d3c 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -16,11 +16,11 @@
  */
 static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
 {
-	struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
+	struct ceph_inode_info *ci = ceph_inode(file_inode(file));
 	struct ceph_ioctl_layout l;
 	int err;
 
-	err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
+	err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT);
 	if (!err) {
 		l.stripe_unit = ceph_file_layout_su(ci->i_layout);
 		l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
@@ -63,12 +63,12 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
 
 static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 {
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct inode *parent_inode;
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_ioctl_layout l;
-	struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
+	struct ceph_inode_info *ci = ceph_inode(file_inode(file));
 	struct ceph_ioctl_layout nl;
 	int err;
 
@@ -76,7 +76,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 		return -EFAULT;
 
 	/* validate changed params against current layout */
-	err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
+	err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT);
 	if (err)
 		return err;
 
@@ -136,7 +136,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
  */
 static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
 {
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ceph_mds_request *req;
 	struct ceph_ioctl_layout l;
 	int err;
@@ -179,13 +179,12 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
 static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 {
 	struct ceph_ioctl_dataloc dl;
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_osd_client *osdc =
 		&ceph_sb_to_client(inode->i_sb)->client->osdc;
 	u64 len = 1, olen;
 	u64 tmp;
-	struct ceph_object_layout ol;
 	struct ceph_pg pgid;
 	int r;
 
@@ -194,7 +193,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 		return -EFAULT;
 
 	down_read(&osdc->map_sem);
-	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
+	r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
 					  &dl.object_no, &dl.object_offset,
 					  &olen);
 	if (r < 0)
@@ -209,10 +208,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 
 	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
 		 ceph_ino(inode), dl.object_no);
-	ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
-				osdc->osdmap);
 
-	pgid = ol.ol_pgid;
+	ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
+		ceph_file_layout_pg_pool(ci->i_layout));
+
 	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
 	if (dl.osd >= 0) {
 		struct ceph_entity_addr *a =
@@ -234,7 +233,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 static long ceph_ioctl_lazyio(struct file *file)
 {
 	struct ceph_file_info *fi = file->private_data;
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 
 	if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 80576d05d68..ae6d14e82b0 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -13,7 +13,7 @@
 static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
 			     int cmd, u8 wait, struct file_lock *fl)
 {
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ceph_mds_client *mdsc =
 		ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_mds_request *req;
@@ -169,7 +169,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 }
 
 /**
- * Must be called with BKL already held. Fills in the passed
+ * Must be called with lock_flocks() already held. Fills in the passed
  * counter variables, so you can prepare pagelist metadata before calling
  * ceph_encode_locks.
  */
@@ -191,27 +191,23 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
 }
 
 /**
- * Encode the flock and fcntl locks for the given inode into the pagelist.
- * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
- * sequential flock locks.
- * Must be called with lock_flocks() already held.
- * If we encounter more of a specific lock type than expected,
- * we return the value 1.
+ * Encode the flock and fcntl locks for the given inode into the ceph_filelock
+ * array. Must be called with inode->i_lock already held.
+ * If we encounter more of a specific lock type than expected, return -ENOSPC.
  */
-int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
-		      int num_fcntl_locks, int num_flock_locks)
+int ceph_encode_locks_to_buffer(struct inode *inode,
+				struct ceph_filelock *flocks,
+				int num_fcntl_locks, int num_flock_locks)
 {
 	struct file_lock *lock;
-	struct ceph_filelock cephlock;
 	int err = 0;
 	int seen_fcntl = 0;
 	int seen_flock = 0;
+	int l = 0;
 
 	dout("encoding %d flock and %d fcntl locks", num_flock_locks,
 	     num_fcntl_locks);
-	err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32));
-	if (err)
-		goto fail;
+
 	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
 		if (lock->fl_flags & FL_POSIX) {
 			++seen_fcntl;
@@ -219,19 +215,12 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
 				err = -ENOSPC;
 				goto fail;
 			}
-			err = lock_to_ceph_filelock(lock, &cephlock);
+			err = lock_to_ceph_filelock(lock, &flocks[l]);
 			if (err)
 				goto fail;
-			err = ceph_pagelist_append(pagelist, &cephlock,
-					   sizeof(struct ceph_filelock));
+			++l;
 		}
-		if (err)
-			goto fail;
 	}
-
-	err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32));
-	if (err)
-		goto fail;
 	for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
 		if (lock->fl_flags & FL_FLOCK) {
 			++seen_flock;
@@ -239,19 +228,51 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
 				err = -ENOSPC;
 				goto fail;
 			}
-			err = lock_to_ceph_filelock(lock, &cephlock);
+			err = lock_to_ceph_filelock(lock, &flocks[l]);
 			if (err)
 				goto fail;
-			err = ceph_pagelist_append(pagelist, &cephlock,
-					   sizeof(struct ceph_filelock));
+			++l;
 		}
-		if (err)
-			goto fail;
 	}
 fail:
 	return err;
 }
 
+/**
+ * Copy the encoded flock and fcntl locks into the pagelist.
+ * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
+ * sequential flock locks.
+ * Returns zero on success.
+ */
+int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+			   struct ceph_pagelist *pagelist,
+			   int num_fcntl_locks, int num_flock_locks)
+{
+	int err = 0;
+	__le32 nlocks;
+
+	nlocks = cpu_to_le32(num_fcntl_locks);
+	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+	if (err)
+		goto out_fail;
+
+	err = ceph_pagelist_append(pagelist, flocks,
+				   num_fcntl_locks * sizeof(*flocks));
+	if (err)
+		goto out_fail;
+
+	nlocks = cpu_to_le32(num_flock_locks);
+	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+	if (err)
+		goto out_fail;
+
+	err = ceph_pagelist_append(pagelist,
+				   &flocks[num_fcntl_locks],
+				   num_flock_locks * sizeof(*flocks));
+out_fail:
+	return err;
+}
+
 /*
  * Given a pointer to a lock, convert it to a ceph filelock
  */
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 9165eb8309e..187bf214444 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -233,6 +233,30 @@ bad:
 }
 
 /*
+ * parse create results
+ */
+static int parse_reply_info_create(void **p, void *end,
+				  struct ceph_mds_reply_info_parsed *info,
+				  int features)
+{
+	if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
+		if (*p == end) {
+			info->has_create_ino = false;
+		} else {
+			info->has_create_ino = true;
+			info->ino = ceph_decode_64(p);
+		}
+	}
+
+	if (unlikely(*p != end))
+		goto bad;
+	return 0;
+
+bad:
+	return -EIO;
+}
+
+/*
  * parse extra results
  */
 static int parse_reply_info_extra(void **p, void *end,
@@ -241,8 +265,13 @@ static int parse_reply_info_extra(void **p, void *end,
 {
 	if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
 		return parse_reply_info_filelock(p, end, info, features);
-	else
+	else if (info->head->op == CEPH_MDS_OP_READDIR ||
+		 info->head->op == CEPH_MDS_OP_LSSNAP)
 		return parse_reply_info_dir(p, end, info, features);
+	else if (info->head->op == CEPH_MDS_OP_CREATE)
+		return parse_reply_info_create(p, end, info, features);
+	else
+		return -EIO;
 }
 
 /*
@@ -336,9 +365,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
 	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
 	if (atomic_dec_and_test(&s->s_ref)) {
 		if (s->s_auth.authorizer)
-		     s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
-			     s->s_mdsc->fsc->client->monc.auth,
-			     s->s_auth.authorizer);
+			ceph_auth_destroy_authorizer(
+				s->s_mdsc->fsc->client->monc.auth,
+				s->s_auth.authorizer);
 		kfree(s);
 	}
 }
@@ -1168,6 +1197,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
 	session->s_trim_caps--;
 	if (oissued) {
 		/* we aren't the only cap.. just remove us */
+		__queue_cap_release(session, ceph_ino(inode), cap->cap_id,
+				    cap->mseq, cap->issue_seq);
 		__ceph_remove_cap(cap);
 	} else {
 		/* try to drop referring dentries */
@@ -1360,6 +1391,7 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
 	num = le32_to_cpu(head->num);
 	dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
 	head->num = cpu_to_le32(0);
+	msg->front.iov_len = sizeof(*head);
 	session->s_num_cap_releases += num;
 
 	/* requeue completed messages */
@@ -1522,7 +1554,7 @@ retry:
 	*base = ceph_ino(temp->d_inode);
 	*plen = len;
 	dout("build_path on %p %d built %llx '%.*s'\n",
-	     dentry, dentry->d_count, *base, len, path);
+	     dentry, d_count(dentry), *base, len, path);
 	return path;
 }
 
@@ -1658,8 +1690,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 
 	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
 	head->op = cpu_to_le32(req->r_op);
-	head->caller_uid = cpu_to_le32(req->r_uid);
-	head->caller_gid = cpu_to_le32(req->r_gid);
+	head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
+	head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
 	head->args = req->r_args;
 
 	ceph_encode_filepath(&p, end, ino1, path1);
@@ -1690,8 +1722,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 	msg->front.iov_len = p - msg->front.iov_base;
 	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
 
-	msg->pages = req->r_pages;
-	msg->nr_pages = req->r_num_pages;
+	if (req->r_data_len) {
+		/* outbound data set only by ceph_sync_setxattr() */
+		BUG_ON(!req->r_pages);
+		ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0);
+	}
+
 	msg->hdr.data_len = cpu_to_le32(req->r_data_len);
 	msg->hdr.data_off = cpu_to_le16(0);
 
@@ -1885,6 +1921,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
 		req = list_entry(tmp_list.next,
 				 struct ceph_mds_request, r_wait);
 		list_del_init(&req->r_wait);
+		dout(" wake request %p tid %llu\n", req, req->r_tid);
 		__do_request(mdsc, req);
 	}
 }
@@ -1998,20 +2035,16 @@ out:
 }
 
 /*
- * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS
+ * Invalidate dir's completeness, dentry lease state on an aborted MDS
  * namespace request.
  */
 void ceph_invalidate_dir_request(struct ceph_mds_request *req)
 {
 	struct inode *inode = req->r_locked_dir;
-	struct ceph_inode_info *ci = ceph_inode(inode);
 
-	dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode);
-	spin_lock(&ci->i_ceph_lock);
-	ceph_dir_clear_complete(inode);
-	ci->i_release_count++;
-	spin_unlock(&ci->i_ceph_lock);
+	dout("invalidate_dir_request %p (complete, lease(s))\n", inode);
 
+	ceph_dir_clear_complete(inode);
 	if (req->r_dentry)
 		ceph_invalidate_dentry_lease(req->r_dentry);
 	if (req->r_old_dentry)
@@ -2170,7 +2203,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	mutex_lock(&req->r_fill_mutex);
 	err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
 	if (err == 0) {
-		if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
+		if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
+				    req->r_op == CEPH_MDS_OP_LSSNAP) &&
 		    rinfo->dir_nr)
 			ceph_readdir_prepopulate(req, req->r_session);
 		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
@@ -2421,6 +2455,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	spin_lock(&ci->i_ceph_lock);
 	cap->seq = 0;        /* reset cap seq */
 	cap->issue_seq = 0;  /* and issue_seq */
+	cap->mseq = 0;       /* and migrate_seq */
 
 	if (recon_state->flock) {
 		rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -2445,39 +2480,44 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 
 	if (recon_state->flock) {
 		int num_fcntl_locks, num_flock_locks;
-		struct ceph_pagelist_cursor trunc_point;
-
-		ceph_pagelist_set_cursor(pagelist, &trunc_point);
-		do {
-			lock_flocks();
-			ceph_count_locks(inode, &num_fcntl_locks,
-					 &num_flock_locks);
-			rec.v2.flock_len = (2*sizeof(u32) +
-					    (num_fcntl_locks+num_flock_locks) *
-					    sizeof(struct ceph_filelock));
-			unlock_flocks();
-
-			/* pre-alloc pagelist */
-			ceph_pagelist_truncate(pagelist, &trunc_point);
-			err = ceph_pagelist_append(pagelist, &rec, reclen);
-			if (!err)
-				err = ceph_pagelist_reserve(pagelist,
-							    rec.v2.flock_len);
-
-			/* encode locks */
-			if (!err) {
-				lock_flocks();
-				err = ceph_encode_locks(inode,
-							pagelist,
-							num_fcntl_locks,
-							num_flock_locks);
-				unlock_flocks();
-			}
-		} while (err == -ENOSPC);
+		struct ceph_filelock *flocks;
+
+encode_again:
+		spin_lock(&inode->i_lock);
+		ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
+		spin_unlock(&inode->i_lock);
+		flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
+				 sizeof(struct ceph_filelock), GFP_NOFS);
+		if (!flocks) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+		spin_lock(&inode->i_lock);
+		err = ceph_encode_locks_to_buffer(inode, flocks,
+						  num_fcntl_locks,
+						  num_flock_locks);
+		spin_unlock(&inode->i_lock);
+		if (err) {
+			kfree(flocks);
+			if (err == -ENOSPC)
+				goto encode_again;
+			goto out_free;
+		}
+		/*
+		 * number of encoded locks is stable, so copy to pagelist
+		 */
+		rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
+				    (num_fcntl_locks+num_flock_locks) *
+				    sizeof(struct ceph_filelock));
+		err = ceph_pagelist_append(pagelist, &rec, reclen);
+		if (!err)
+			err = ceph_locks_to_pagelist(flocks, pagelist,
+						     num_fcntl_locks,
+						     num_flock_locks);
+		kfree(flocks);
 	} else {
 		err = ceph_pagelist_append(pagelist, &rec, reclen);
 	}
-
 out_free:
 	kfree(path);
 out_dput:
@@ -2570,11 +2610,13 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 			goto fail;
 	}
 
-	reply->pagelist = pagelist;
 	if (recon_state.flock)
 		reply->hdr.version = cpu_to_le16(2);
-	reply->hdr.data_len = cpu_to_le32(pagelist->length);
-	reply->nr_pages = calc_pages_for(0, pagelist->length);
+	if (pagelist->length) {
+		/* set up outbound data if we have any */
+		reply->hdr.data_len = cpu_to_le32(pagelist->length);
+		ceph_msg_data_add_pagelist(reply, pagelist);
+	}
 	ceph_con_send(&session->s_con, reply);
 
 	mutex_unlock(&session->s_mutex);
@@ -3000,8 +3042,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	fsc->mdsc = mdsc;
 	mutex_init(&mdsc->mutex);
 	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
-	if (mdsc->mdsmap == NULL)
+	if (mdsc->mdsmap == NULL) {
+		kfree(mdsc);
 		return -ENOMEM;
+	}
 
 	init_completion(&mdsc->safe_umount_waiters);
 	init_waitqueue_head(&mdsc->session_close_wq);
@@ -3404,13 +3448,17 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
 	struct ceph_auth_handshake *auth = &s->s_auth;
 
 	if (force_new && auth->authorizer) {
-		if (ac->ops && ac->ops->destroy_authorizer)
-			ac->ops->destroy_authorizer(ac, auth->authorizer);
+		ceph_auth_destroy_authorizer(ac, auth->authorizer);
 		auth->authorizer = NULL;
 	}
-	if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) {
-		int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
-							auth);
+	if (!auth->authorizer) {
+		int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
+						      auth);
+		if (ret)
+			return ERR_PTR(ret);
+	} else {
+		int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
+						      auth);
 		if (ret)
 			return ERR_PTR(ret);
 	}
@@ -3426,7 +3474,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
 	struct ceph_mds_client *mdsc = s->s_mdsc;
 	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 
-	return ac->ops->verify_authorizer_reply(ac, s->s_auth.authorizer, len);
+	return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len);
 }
 
 static int invalidate_authorizer(struct ceph_connection *con)
@@ -3435,12 +3483,32 @@ static int invalidate_authorizer(struct ceph_connection *con)
 	struct ceph_mds_client *mdsc = s->s_mdsc;
 	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 
-	if (ac->ops->invalidate_authorizer)
-		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
+	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
 
 	return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
 }
 
+static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
+				struct ceph_msg_header *hdr, int *skip)
+{
+	struct ceph_msg *msg;
+	int type = (int) le16_to_cpu(hdr->type);
+	int front_len = (int) le32_to_cpu(hdr->front_len);
+
+	if (con->in_msg)
+		return con->in_msg;
+
+	*skip = 0;
+	msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
+	if (!msg) {
+		pr_err("unable to allocate msg type %d len %d\n",
+		       type, front_len);
+		return NULL;
+	}
+
+	return msg;
+}
+
 static const struct ceph_connection_operations mds_con_ops = {
 	.get = con_get,
 	.put = con_put,
@@ -3449,6 +3517,7 @@ static const struct ceph_connection_operations mds_con_ops = {
 	.verify_authorizer_reply = verify_authorizer_reply,
 	.invalidate_authorizer = invalidate_authorizer,
 	.peer_reset = peer_reset,
+	.alloc_msg = mds_alloc_msg,
 };
 
 /* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index dd26846dd71..c2a19fbbe51 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed {
 			struct ceph_mds_reply_info_in *dir_in;
 			u8                            dir_complete, dir_end;
 		};
+
+		/* for create results */
+		struct {
+			bool has_create_ino;
+			u64 ino;
+		};
 	};
 
 	/* encoded blob describing snapshot contexts for certain
@@ -184,8 +190,8 @@ struct ceph_mds_request {
 
 	union ceph_mds_request_args r_args;
 	int r_fmode;        /* file mode, if expecting cap */
-	uid_t r_uid;
-	gid_t r_gid;
+	kuid_t r_uid;
+	kgid_t r_gid;
 
 	/* for choosing which mds to send this request to */
 	int r_direct_mode;
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 73b7d44e8a3..132b64eeecd 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -20,7 +20,10 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
 {
 	int n = 0;
 	int i;
-	char r;
+
+	/* special case for one mds */
+	if (1 == m->m_max_mds && m->m_info[0].state > 0)
+		return 0;
 
 	/* count */
 	for (i = 0; i < m->m_max_mds; i++)
@@ -30,8 +33,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
 		return -1;
 
 	/* pick */
-	get_random_bytes(&r, 1);
-	n = r % n;
+	n = prandom_u32() % n;
 	i = 0;
 	for (i = 0; n > 0; i++, n--)
 		while (m->m_info[i].state <= 0)
@@ -59,6 +61,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		return ERR_PTR(-ENOMEM);
 
 	ceph_decode_16_safe(p, end, version, bad);
+	if (version > 3) {
+		pr_warning("got mdsmap version %d > 3, failing", version);
+		goto bad;
+	}
 
 	ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
 	m->m_epoch = ceph_decode_32(p);
@@ -86,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		u32 num_export_targets;
 		void *pexport_targets = NULL;
 		struct ceph_timespec laggy_since;
+		struct ceph_mds_info *info;
 
 		ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
 		global_id = ceph_decode_64(p);
@@ -120,37 +127,40 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		     i+1, n, global_id, mds, inc,
 		     ceph_pr_addr(&addr.in_addr),
 		     ceph_mds_state_name(state));
-		if (mds >= 0 && mds < m->m_max_mds && state > 0) {
-			m->m_info[mds].global_id = global_id;
-			m->m_info[mds].state = state;
-			m->m_info[mds].addr = addr;
-			m->m_info[mds].laggy =
-				(laggy_since.tv_sec != 0 ||
-				 laggy_since.tv_nsec != 0);
-			m->m_info[mds].num_export_targets = num_export_targets;
-			if (num_export_targets) {
-				m->m_info[mds].export_targets =
-					kcalloc(num_export_targets, sizeof(u32),
-						GFP_NOFS);
-				for (j = 0; j < num_export_targets; j++)
-					m->m_info[mds].export_targets[j] =
-					       ceph_decode_32(&pexport_targets);
-			} else {
-				m->m_info[mds].export_targets = NULL;
-			}
+
+		if (mds < 0 || mds >= m->m_max_mds || state <= 0)
+			continue;
+
+		info = &m->m_info[mds];
+		info->global_id = global_id;
+		info->state = state;
+		info->addr = addr;
+		info->laggy = (laggy_since.tv_sec != 0 ||
+			       laggy_since.tv_nsec != 0);
+		info->num_export_targets = num_export_targets;
+		if (num_export_targets) {
+			info->export_targets = kcalloc(num_export_targets,
+						       sizeof(u32), GFP_NOFS);
+			if (info->export_targets == NULL)
+				goto badmem;
+			for (j = 0; j < num_export_targets; j++)
+				info->export_targets[j] =
+				       ceph_decode_32(&pexport_targets);
+		} else {
+			info->export_targets = NULL;
 		}
 	}
 
 	/* pg_pools */
 	ceph_decode_32_safe(p, end, n, bad);
 	m->m_num_data_pg_pools = n;
-	m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
+	m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
 	if (!m->m_data_pg_pools)
 		goto badmem;
-	ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
+	ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
 	for (i = 0; i < n; i++)
-		m->m_data_pg_pools[i] = ceph_decode_32(p);
-	m->m_cas_pg_pool = ceph_decode_32(p);
+		m->m_data_pg_pools[i] = ceph_decode_64(p);
+	m->m_cas_pg_pool = ceph_decode_64(p);
 
 	/* ok, we don't care about the rest. */
 	dout("mdsmap_decode success epoch %u\n", m->m_epoch);
@@ -164,7 +174,7 @@ bad:
 		       DUMP_PREFIX_OFFSET, 16, 1,
 		       start, end - start, true);
 	ceph_mdsmap_destroy(m);
-	return ERR_PTR(-EINVAL);
+	return ERR_PTR(err);
 }
 
 void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index cbb2f54a301..f01645a2775 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -332,10 +332,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 	err = -ENOMEM;
 	if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
 		goto fail;
-	snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
+	snapc = ceph_create_snap_context(num, GFP_NOFS);
 	if (!snapc)
 		goto fail;
-	atomic_set(&snapc->nref, 1);
 
 	/* build (reverse sorted) snap vector */
 	num = 0;
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index cd5097d7c80..89fa4a940a0 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s)
 	case CEPH_MDS_STATE_BOOT:       return "up:boot";
 	case CEPH_MDS_STATE_STANDBY:    return "up:standby";
 	case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay";
+	case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
 	case CEPH_MDS_STATE_CREATING:   return "up:creating";
 	case CEPH_MDS_STATE_STARTING:   return "up:starting";
 		/* up and in */
@@ -50,10 +51,13 @@ const char *ceph_mds_op_name(int op)
 	case CEPH_MDS_OP_LOOKUP:  return "lookup";
 	case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";
 	case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent";
+	case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
 	case CEPH_MDS_OP_GETATTR:  return "getattr";
 	case CEPH_MDS_OP_SETXATTR: return "setxattr";
 	case CEPH_MDS_OP_SETATTR: return "setattr";
 	case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+	case CEPH_MDS_OP_SETLAYOUT: return "setlayou";
+	case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
 	case CEPH_MDS_OP_READDIR: return "readdir";
 	case CEPH_MDS_OP_MKNOD: return "mknod";
 	case CEPH_MDS_OP_LINK: return "link";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index e86aa994812..6627b26a800 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -71,8 +71,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 	/*
 	 * express utilization in terms of large blocks to avoid
 	 * overflow on 32-bit machines.
+	 *
+	 * NOTE: for the time being, we make bsize == frsize to humor
+	 * not-yet-ancient versions of glibc that are broken.
+	 * Someday, we will probably want to report a real block
+	 * size...  whatever that may mean for a network file system!
 	 */
 	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+	buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
 	buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
 	buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
 	buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
@@ -80,7 +86,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = le64_to_cpu(st.num_objects);
 	buf->f_ffree = -1;
 	buf->f_namelen = NAME_MAX;
-	buf->f_frsize = PAGE_CACHE_SIZE;
 
 	/* leave fsid little-endian, regardless of host endianness */
 	fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
@@ -352,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 	}
 	err = -EINVAL;
 	dev_name_end--;		/* back up to ':' separator */
-	if (*dev_name_end != ':') {
+	if (dev_name_end < dev_name || *dev_name_end != ':') {
 		pr_err("device name is missing path (no : separator in %s)\n",
 				dev_name);
 		goto out;
@@ -474,6 +479,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 		CEPH_FEATURE_FLOCK |
 		CEPH_FEATURE_DIRLAYOUTHASH;
 	const unsigned required_features = 0;
+	int page_count;
+	size_t size;
 	int err = -ENOMEM;
 
 	fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
@@ -517,8 +524,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 
 	/* set up mempools */
 	err = -ENOMEM;
-	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
-			      fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
+	page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT;
+	size = sizeof (struct page *) * (page_count ? page_count : 1);
+	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
 	if (!fsc->wb_pagevec_pool)
 		goto fail_trunc_wq;
 
@@ -947,6 +955,7 @@ static struct file_system_type ceph_fs_type = {
 	.kill_sb	= ceph_kill_sb,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE,
 };
+MODULE_ALIAS_FS("ceph");
 
 #define _STRINGIFY(x) #x
 #define STRINGIFY(x) _STRINGIFY(x)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 66ebe720e40..cbded572345 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -21,7 +21,7 @@
 
 /* large granularity for statfs utilization stats to facilitate
  * large volume sizes on 32-bit machines. */
-#define CEPH_BLOCK_SHIFT   20  /* 1 MB */
+#define CEPH_BLOCK_SHIFT   22  /* 4 MB */
 #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
 
 #define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
@@ -138,8 +138,8 @@ struct ceph_cap_snap {
 	struct ceph_snap_context *context;
 
 	umode_t mode;
-	uid_t uid;
-	gid_t gid;
+	kuid_t uid;
+	kgid_t gid;
 
 	struct ceph_buffer *xattr_blob;
 	u64 xattr_version;
@@ -204,7 +204,6 @@ struct ceph_inode_xattr {
  * Ceph dentry state
  */
 struct ceph_dentry_info {
-	unsigned long flags;
 	struct ceph_mds_session *lease_session;
 	u32 lease_gen, lease_shared_gen;
 	u32 lease_seq;
@@ -215,18 +214,6 @@ struct ceph_dentry_info {
 	u64 offset;
 };
 
-/*
- * dentry flags
- *
- * The locking for D_COMPLETE is a bit odd:
- *  - we can clear it at almost any time (see ceph_d_prune)
- *  - it is only meaningful if:
- *    - we hold dir inode i_ceph_lock
- *    - we hold dir FILE_SHARED caps
- *    - the dentry D_COMPLETE is set
- */
-#define CEPH_D_COMPLETE 1  /* if set, d_u.d_subdirs is complete directory */
-
 struct ceph_inode_xattrs_info {
 	/*
 	 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -257,7 +244,8 @@ struct ceph_inode_info {
 	u32 i_time_warp_seq;
 
 	unsigned i_ceph_flags;
-	unsigned long i_release_count;
+	atomic_t i_release_count;
+	atomic_t i_complete_count;
 
 	struct ceph_dir_layout i_dir_layout;
 	struct ceph_file_layout i_layout;
@@ -267,7 +255,7 @@ struct ceph_inode_info {
 	struct timespec i_rctime;
 	u64 i_rbytes, i_rfiles, i_rsubdirs;
 	u64 i_files, i_subdirs;
-	u64 i_max_offset;  /* largest readdir offset, set with D_COMPLETE */
+	u64 i_max_offset;  /* largest readdir offset, set with complete dir */
 
 	struct rb_root i_fragtree;
 	struct mutex i_fragtree_mutex;
@@ -436,33 +424,31 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
 #define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
 #define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
 
-static inline void ceph_i_clear(struct inode *inode, unsigned mask)
+static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
+					   int release_count)
 {
-	struct ceph_inode_info *ci = ceph_inode(inode);
-
-	spin_lock(&ci->i_ceph_lock);
-	ci->i_ceph_flags &= ~mask;
-	spin_unlock(&ci->i_ceph_lock);
+	atomic_set(&ci->i_complete_count, release_count);
 }
 
-static inline void ceph_i_set(struct inode *inode, unsigned mask)
+static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
 {
-	struct ceph_inode_info *ci = ceph_inode(inode);
+	atomic_inc(&ci->i_release_count);
+}
 
-	spin_lock(&ci->i_ceph_lock);
-	ci->i_ceph_flags |= mask;
-	spin_unlock(&ci->i_ceph_lock);
+static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
+{
+	return atomic_read(&ci->i_complete_count) ==
+		atomic_read(&ci->i_release_count);
 }
 
-static inline bool ceph_i_test(struct inode *inode, unsigned mask)
+static inline void ceph_dir_clear_complete(struct inode *inode)
 {
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	bool r;
+	__ceph_dir_clear_complete(ceph_inode(inode));
+}
 
-	spin_lock(&ci->i_ceph_lock);
-	r = (ci->i_ceph_flags & mask) == mask;
-	spin_unlock(&ci->i_ceph_lock);
-	return r;
+static inline bool ceph_dir_is_complete(struct inode *inode)
+{
+	return __ceph_dir_is_complete(ceph_inode(inode));
 }
 
 
@@ -489,13 +475,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
 }
 
 /*
- * set/clear directory D_COMPLETE flag
- */
-void ceph_dir_set_complete(struct inode *inode);
-void ceph_dir_clear_complete(struct inode *inode);
-bool ceph_dir_test_complete(struct inode *inode);
-
-/*
  * caps helpers
  */
 static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
@@ -555,7 +534,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
 extern void ceph_caps_init(struct ceph_mds_client *mdsc);
 extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
 extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
-extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+extern void ceph_reserve_caps(struct ceph_mds_client *mdsc,
 			     struct ceph_cap_reservation *ctx, int need);
 extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
 			       struct ceph_cap_reservation *ctx);
@@ -584,7 +563,7 @@ struct ceph_file_info {
 	u64 next_offset;       /* offset of next chunk (last_name's + 1) */
 	char *last_name;       /* last entry in previous chunk */
 	struct dentry *dentry; /* next dentry (for dcache readdir) */
-	unsigned long dir_release_count;
+	int dir_release_count;
 
 	/* used for -o dirstat read() on directory thing */
 	char *dir_info;
@@ -755,6 +734,8 @@ static inline void ceph_remove_cap(struct ceph_cap *cap)
 extern void ceph_put_cap(struct ceph_mds_client *mdsc,
 			 struct ceph_cap *cap);
 
+extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
+				u64 cap_id, u32 migrate_seq, u32 issue_seq);
 extern void ceph_queue_caps_release(struct inode *inode);
 extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
 extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
@@ -798,13 +779,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
 /* file.c */
 extern const struct file_operations ceph_file_fops;
 extern const struct address_space_operations ceph_aops;
-extern int ceph_copy_to_page_vector(struct page **pages,
-				    const char *data,
-				    loff_t off, size_t len);
-extern int ceph_copy_from_page_vector(struct page **pages,
-				    char *data,
-				    loff_t off, size_t len);
-extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
+
 extern int ceph_open(struct inode *inode, struct file *file);
 extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 			    struct file *file, unsigned flags, umode_t mode,
@@ -847,8 +822,13 @@ extern const struct export_operations ceph_export_ops;
 extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
 extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
 extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
-extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
-			     int p_locks, int f_locks);
+extern int ceph_encode_locks_to_buffer(struct inode *inode,
+				       struct ceph_filelock *flocks,
+				       int num_fcntl_locks,
+				       int num_flock_locks);
+extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
+				  struct ceph_pagelist *pagelist,
+				  int num_fcntl_locks, int num_flock_locks);
 extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
 
 /* debugfs.c */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2c2ae5be990..be661d8f532 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -29,9 +29,94 @@ struct ceph_vxattr {
 	size_t name_size;	/* strlen(name) + 1 (for '\0') */
 	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
 			      size_t size);
-	bool readonly;
+	bool readonly, hidden;
+	bool (*exists_cb)(struct ceph_inode_info *ci);
 };
 
+/* layouts */
+
+static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
+{
+	size_t s;
+	char *p = (char *)&ci->i_layout;
+
+	for (s = 0; s < sizeof(ci->i_layout); s++, p++)
+		if (*p)
+			return true;
+	return false;
+}
+
+static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+					size_t size)
+{
+	int ret;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
+	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+	const char *pool_name;
+
+	dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
+	down_read(&osdc->map_sem);
+	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+	if (pool_name)
+		ret = snprintf(val, size,
+		"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s",
+		(unsigned long long)ceph_file_layout_su(ci->i_layout),
+		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+	        (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
+		pool_name);
+	else
+		ret = snprintf(val, size,
+		"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
+		(unsigned long long)ceph_file_layout_su(ci->i_layout),
+		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+	        (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
+		(unsigned long long)pool);
+
+	up_read(&osdc->map_sem);
+	return ret;
+}
+
+static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
+					       char *val, size_t size)
+{
+	return snprintf(val, size, "%lld",
+			(unsigned long long)ceph_file_layout_su(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
+						char *val, size_t size)
+{
+	return snprintf(val, size, "%lld",
+	       (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
+					       char *val, size_t size)
+{
+	return snprintf(val, size, "%lld",
+	       (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+}
+
+static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
+					char *val, size_t size)
+{
+	int ret;
+	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
+	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+	const char *pool_name;
+
+	down_read(&osdc->map_sem);
+	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
+	if (pool_name)
+		ret = snprintf(val, size, "%s", pool_name);
+	else
+		ret = snprintf(val, size, "%lld", (unsigned long long)pool);
+	up_read(&osdc->map_sem);
+	return ret;
+}
+
 /* directories */
 
 static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
@@ -83,17 +168,43 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
 			(long)ci->i_rctime.tv_nsec);
 }
 
-#define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
 
-#define XATTR_NAME_CEPH(_type, _name) \
-		{ \
-			.name = CEPH_XATTR_NAME(_type, _name), \
-			.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
-			.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
-			.readonly = true, \
-		}
+#define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
+#define CEPH_XATTR_NAME2(_type, _name, _name2)	\
+	XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
+
+#define XATTR_NAME_CEPH(_type, _name)					\
+	{								\
+		.name = CEPH_XATTR_NAME(_type, _name),			\
+		.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+		.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+		.readonly = true,				\
+		.hidden = false,				\
+		.exists_cb = NULL,			\
+	}
+#define XATTR_LAYOUT_FIELD(_type, _name, _field)			\
+	{								\
+		.name = CEPH_XATTR_NAME2(_type, _name, _field),	\
+		.name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \
+		.getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \
+		.readonly = false,				\
+		.hidden = true,			\
+		.exists_cb = ceph_vxattrcb_layout_exists,	\
+	}
 
 static struct ceph_vxattr ceph_dir_vxattrs[] = {
+	{
+		.name = "ceph.dir.layout",
+		.name_size = sizeof("ceph.dir.layout"),
+		.getxattr_cb = ceph_vxattrcb_layout,
+		.readonly = false,
+		.hidden = false,
+		.exists_cb = ceph_vxattrcb_layout_exists,
+	},
+	XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
+	XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
+	XATTR_LAYOUT_FIELD(dir, layout, object_size),
+	XATTR_LAYOUT_FIELD(dir, layout, pool),
 	XATTR_NAME_CEPH(dir, entries),
 	XATTR_NAME_CEPH(dir, files),
 	XATTR_NAME_CEPH(dir, subdirs),
@@ -102,35 +213,26 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
 	XATTR_NAME_CEPH(dir, rsubdirs),
 	XATTR_NAME_CEPH(dir, rbytes),
 	XATTR_NAME_CEPH(dir, rctime),
-	{ 0 }	/* Required table terminator */
+	{ .name = NULL, 0 }	/* Required table terminator */
 };
 static size_t ceph_dir_vxattrs_name_size;	/* total size of all names */
 
 /* files */
 
-static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
-				   size_t size)
-{
-	int ret;
-
-	ret = snprintf(val, size,
-		"chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
-		(unsigned long long)ceph_file_layout_su(ci->i_layout),
-		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
-		(unsigned long long)ceph_file_layout_object_size(ci->i_layout));
-	return ret;
-}
-
 static struct ceph_vxattr ceph_file_vxattrs[] = {
-	XATTR_NAME_CEPH(file, layout),
-	/* The following extended attribute name is deprecated */
 	{
-		.name = XATTR_CEPH_PREFIX "layout",
-		.name_size = sizeof (XATTR_CEPH_PREFIX "layout"),
-		.getxattr_cb = ceph_vxattrcb_file_layout,
-		.readonly = true,
+		.name = "ceph.file.layout",
+		.name_size = sizeof("ceph.file.layout"),
+		.getxattr_cb = ceph_vxattrcb_layout,
+		.readonly = false,
+		.hidden = false,
+		.exists_cb = ceph_vxattrcb_layout_exists,
 	},
-	{ 0 }	/* Required table terminator */
+	XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
+	XATTR_LAYOUT_FIELD(file, layout, stripe_count),
+	XATTR_LAYOUT_FIELD(file, layout, object_size),
+	XATTR_LAYOUT_FIELD(file, layout, pool),
+	{ .name = NULL, 0 }	/* Required table terminator */
 };
 static size_t ceph_file_vxattrs_name_size;	/* total size of all names */
 
@@ -164,7 +266,8 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
 	size_t size = 0;
 
 	for (vxattr = vxattrs; vxattr->name; vxattr++)
-		size += vxattr->name_size;
+		if (!vxattr->hidden)
+			size += vxattr->name_size;
 
 	return size;
 }
@@ -572,8 +675,13 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 	if (!ceph_is_valid_xattr(name))
 		return -ENODATA;
 
+
 	/* let's see if a virtual xattr was requested */
 	vxattr = ceph_match_vxattr(inode, name);
+	if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
+		err = vxattr->getxattr_cb(ci, value, size);
+		return err;
+	}
 
 	spin_lock(&ci->i_ceph_lock);
 	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
@@ -592,11 +700,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 
 	spin_lock(&ci->i_ceph_lock);
 
-	if (vxattr && vxattr->readonly) {
-		err = vxattr->getxattr_cb(ci, value, size);
-		goto out;
-	}
-
 	err = __build_xattrs(inode);
 	if (err < 0)
 		goto out;
@@ -604,11 +707,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 get_xattr:
 	err = -ENODATA;  /* == ENOATTR */
 	xattr = __get_xattr(ci, name);
-	if (!xattr) {
-		if (vxattr)
-			err = vxattr->getxattr_cb(ci, value, size);
+	if (!xattr)
 		goto out;
-	}
 
 	err = -ERANGE;
 	if (size && size < xattr->val_len)
@@ -664,23 +764,30 @@ list_xattr:
 	vir_namelen = ceph_vxattrs_name_size(vxattrs);
 
 	/* adding 1 byte per each variable due to the null termination */
-	namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
+	namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
 	err = -ERANGE;
-	if (size && namelen > size)
+	if (size && vir_namelen + namelen > size)
 		goto out;
 
-	err = namelen;
+	err = namelen + vir_namelen;
 	if (size == 0)
 		goto out;
 
 	names = __copy_xattr_names(ci, names);
 
 	/* virtual xattr names, too */
-	if (vxattrs)
+	err = namelen;
+	if (vxattrs) {
 		for (i = 0; vxattrs[i].name; i++) {
-			len = sprintf(names, "%s", vxattrs[i].name);
-			names += len + 1;
+			if (!vxattrs[i].hidden &&
+			    !(vxattrs[i].exists_cb &&
+			      !vxattrs[i].exists_cb(ci))) {
+				len = sprintf(names, "%s", vxattrs[i].name);
+				names += len + 1;
+				err += len + 1;
+			}
 		}
+	}
 
 out:
 	spin_unlock(&ci->i_ceph_lock);
@@ -782,6 +889,10 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 	if (vxattr && vxattr->readonly)
 		return -EOPNOTSUPP;
 
+	/* pass any unhandled ceph.* xattrs through to the MDS */
+	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+		goto do_sync_unlocked;
+
 	/* preallocate memory for xattr name, value, index node */
 	err = -ENOMEM;
 	newname = kmemdup(name, name_len + 1, GFP_NOFS);
@@ -838,6 +949,7 @@ retry:
 
 do_sync:
 	spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
 	err = ceph_sync_setxattr(dentry, name, value, size, flags);
 out:
 	kfree(newname);
@@ -892,6 +1004,10 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
 	if (vxattr && vxattr->readonly)
 		return -EOPNOTSUPP;
 
+	/* pass any unhandled ceph.* xattrs through to the MDS */
+	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+		goto do_sync_unlocked;
+
 	err = -ENOMEM;
 	spin_lock(&ci->i_ceph_lock);
 retry:
@@ -931,6 +1047,7 @@ retry:
 	return err;
 do_sync:
 	spin_unlock(&ci->i_ceph_lock);
+do_sync_unlocked:
 	err = ceph_send_removexattr(dentry, name);
 out:
 	return err;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 21ff76c22a1..603f18a65c1 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -10,6 +10,7 @@ config CIFS
 	select CRYPTO_ECB
 	select CRYPTO_DES
 	select CRYPTO_SHA256
+	select CRYPTO_CMAC
 	help
 	  This is the client VFS module for the Common Internet File System
 	  (CIFS) protocol which is the successor to the Server Message Block
@@ -155,14 +156,14 @@ config CIFS_DFS_UPCALL
 	    points. If unsure, say N.
 
 config CIFS_NFSD_EXPORT
-	  bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)"
-	  depends on CIFS && EXPERIMENTAL && BROKEN
+	  bool "Allow nfsd to export CIFS file system"
+	  depends on CIFS && BROKEN
 	  help
 	   Allows NFS server to export a CIFS mounted share (nfsd over cifs)
 
 config CIFS_SMB2
-	bool "SMB2 network file system support (EXPERIMENTAL)"
-	depends on CIFS && EXPERIMENTAL && INET
+	bool "SMB2 network file system support"
+	depends on CIFS && INET
 	select NLS
 	select KEYS
 	select FSCACHE
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index cfd1ce34e0b..a3b56544c21 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -506,11 +506,11 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 
 	/* GSSAPI header */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, "Error decoding negTokenInit header");
+		cifs_dbg(FYI, "Error decoding negTokenInit header\n");
 		return 0;
 	} else if ((cls != ASN1_APL) || (con != ASN1_CON)
 		   || (tag != ASN1_EOC)) {
-		cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag);
+		cifs_dbg(FYI, "cls = %d con = %d tag = %d\n", cls, con, tag);
 		return 0;
 	}
 
@@ -531,52 +531,52 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 
 	/* SPNEGO OID not present or garbled -- bail out */
 	if (!rc) {
-		cFYI(1, "Error decoding negTokenInit header");
+		cifs_dbg(FYI, "Error decoding negTokenInit header\n");
 		return 0;
 	}
 
 	/* SPNEGO */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, "Error decoding negTokenInit");
+		cifs_dbg(FYI, "Error decoding negTokenInit\n");
 		return 0;
 	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
 		   || (tag != ASN1_EOC)) {
-		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
-		     cls, con, tag, end, *end);
+		cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n",
+			 cls, con, tag, end, *end);
 		return 0;
 	}
 
 	/* negTokenInit */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, "Error decoding negTokenInit");
+		cifs_dbg(FYI, "Error decoding negTokenInit\n");
 		return 0;
 	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
 		   || (tag != ASN1_SEQ)) {
-		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
-		     cls, con, tag, end, *end);
+		cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n",
+			 cls, con, tag, end, *end);
 		return 0;
 	}
 
 	/* sequence */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, "Error decoding 2nd part of negTokenInit");
+		cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n");
 		return 0;
 	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
 		   || (tag != ASN1_EOC)) {
-		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
-		     cls, con, tag, end, *end);
+		cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n",
+			 cls, con, tag, end, *end);
 		return 0;
 	}
 
 	/* sequence of */
 	if (asn1_header_decode
 	    (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
-		cFYI(1, "Error decoding 2nd part of negTokenInit");
+		cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n");
 		return 0;
 	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
 		   || (tag != ASN1_SEQ)) {
-		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
-		     cls, con, tag, end, *end);
+		cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n",
+			 cls, con, tag, end, *end);
 		return 0;
 	}
 
@@ -584,15 +584,15 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 	while (!asn1_eoc_decode(&ctx, sequence_end)) {
 		rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
 		if (!rc) {
-			cFYI(1, "Error decoding negTokenInit hdr exit2");
+			cifs_dbg(FYI, "Error decoding negTokenInit hdr exit2\n");
 			return 0;
 		}
 		if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
 			if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
 
-				cFYI(1, "OID len = %d oid = 0x%lx 0x%lx "
-					"0x%lx 0x%lx", oidlen, *oid,
-					*(oid + 1), *(oid + 2), *(oid + 3));
+				cifs_dbg(FYI, "OID len = %d oid = 0x%lx 0x%lx 0x%lx 0x%lx\n",
+					 oidlen, *oid, *(oid + 1), *(oid + 2),
+					 *(oid + 3));
 
 				if (compare_oid(oid, oidlen, MSKRB5_OID,
 						MSKRB5_OID_LEN))
@@ -610,57 +610,14 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 				kfree(oid);
 			}
 		} else {
-			cFYI(1, "Should be an oid what is going on?");
+			cifs_dbg(FYI, "Should be an oid what is going on?\n");
 		}
 	}
 
-	/* mechlistMIC */
-	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		/* Check if we have reached the end of the blob, but with
-		   no mechListMic (e.g. NTLMSSP instead of KRB5) */
-		if (ctx.error == ASN1_ERR_DEC_EMPTY)
-			goto decode_negtoken_exit;
-		cFYI(1, "Error decoding last part negTokenInit exit3");
-		return 0;
-	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
-		/* tag = 3 indicating mechListMIC */
-		cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
-			cls, con, tag, end, *end);
-		return 0;
-	}
-
-	/* sequence */
-	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, "Error decoding last part negTokenInit exit5");
-		return 0;
-	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
-		   || (tag != ASN1_SEQ)) {
-		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)",
-			cls, con, tag, end, *end);
-	}
-
-	/* sequence of */
-	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, "Error decoding last part negTokenInit exit 7");
-		return 0;
-	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
-		cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
-			cls, con, tag, end, *end);
-		return 0;
-	}
-
-	/* general string */
-	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, "Error decoding last part negTokenInit exit9");
-		return 0;
-	} else if ((cls != ASN1_UNI) || (con != ASN1_PRI)
-		   || (tag != ASN1_GENSTR)) {
-		cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)",
-			cls, con, tag, end, *end);
-		return 0;
-	}
-	cFYI(1, "Need to call asn1_octets_decode() function for %s",
-		ctx.pointer);	/* is this UTF-8 or ASCII? */
-decode_negtoken_exit:
+	/*
+	 * We currently ignore anything at the end of the SPNEGO blob after
+	 * the mechTypes have been parsed, since none of that info is
+	 * used at the moment.
+	 */
 	return 1;
 }
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 282d6de7e41..6c665bf4a27 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -92,7 +92,7 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data,
 		break;
 
 	default:
-		cERROR(1, "Unknown network family '%d'", sa->sa_family);
+		cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family);
 		key_len = 0;
 		break;
 	}
@@ -152,7 +152,7 @@ static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer,
 
 	sharename = extract_sharename(tcon->treeName);
 	if (IS_ERR(sharename)) {
-		cFYI(1, "%s: couldn't extract sharename", __func__);
+		cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__);
 		sharename = NULL;
 		return 0;
 	}
@@ -302,7 +302,7 @@ static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
 	pagevec_init(&pvec, 0);
 	first = 0;
 
-	cFYI(1, "%s: cifs inode 0x%p now uncached", __func__, cifsi);
+	cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi);
 
 	for (;;) {
 		nr_pages = pagevec_lookup(&pvec,
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index d9ea6ede6a7..f3ac4154cbb 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -57,15 +57,32 @@ cifs_dump_mem(char *label, void *data, int length)
 	}
 }
 
+#ifdef CONFIG_CIFS_DEBUG
+void cifs_vfs_err(const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk(KERN_ERR "CIFS VFS: %pV", &vaf);
+
+	va_end(args);
+}
+#endif
+
 void cifs_dump_detail(void *buf)
 {
 #ifdef CONFIG_CIFS_DEBUG2
 	struct smb_hdr *smb = (struct smb_hdr *)buf;
 
-	cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
-		  smb->Command, smb->Status.CifsError,
-		  smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
-	cERROR(1, "smb buf %p len %u", smb, smbCalcSize(smb));
+	cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n",
+		 smb->Command, smb->Status.CifsError,
+		 smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
+	cifs_dbg(VFS, "smb buf %p len %u\n", smb, smbCalcSize(smb));
 #endif /* CONFIG_CIFS_DEBUG2 */
 }
 
@@ -78,25 +95,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
 	if (server == NULL)
 		return;
 
-	cERROR(1, "Dump pending requests:");
+	cifs_dbg(VFS, "Dump pending requests:\n");
 	spin_lock(&GlobalMid_Lock);
 	list_for_each(tmp, &server->pending_mid_q) {
 		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
-		cERROR(1, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu",
-			mid_entry->mid_state,
-			le16_to_cpu(mid_entry->command),
-			mid_entry->pid,
-			mid_entry->callback_data,
-			mid_entry->mid);
+		cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n",
+			 mid_entry->mid_state,
+			 le16_to_cpu(mid_entry->command),
+			 mid_entry->pid,
+			 mid_entry->callback_data,
+			 mid_entry->mid);
 #ifdef CONFIG_CIFS_STATS2
-		cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
-			mid_entry->large_buf,
-			mid_entry->resp_buf,
-			mid_entry->when_received,
-			jiffies);
+		cifs_dbg(VFS, "IsLarge: %d buf: %p time rcv: %ld now: %ld\n",
+			 mid_entry->large_buf,
+			 mid_entry->resp_buf,
+			 mid_entry->when_received,
+			 jiffies);
 #endif /* STATS2 */
-		cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp,
-			  mid_entry->multiEnd);
+		cifs_dbg(VFS, "IsMult: %d IsEnd: %d\n",
+			 mid_entry->multiRsp, mid_entry->multiEnd);
 		if (mid_entry->resp_buf) {
 			cifs_dump_detail(mid_entry->resp_buf);
 			cifs_dump_mem("existing buf: ",
@@ -196,7 +213,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 						   tcon->nativeFileSystem);
 				}
 				seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
-					"\nPathComponentMax: %d Status: 0x%d",
+					"\n\tPathComponentMax: %d Status: 0x%d",
 					le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
 					le32_to_cpu(tcon->fsAttrInfo.Attributes),
 					le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
@@ -207,6 +224,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 					seq_puts(m, " type: CDROM ");
 				else
 					seq_printf(m, " type: %d ", dev_type);
+				if (server->ops->dump_share_caps)
+					server->ops->dump_share_caps(m, tcon);
 
 				if (tcon->need_reconnect)
 					seq_puts(m, "\tDISCONNECTED ");
@@ -578,9 +597,36 @@ static int cifs_security_flags_proc_open(struct inode *inode, struct file *file)
 	return single_open(file, cifs_security_flags_proc_show, NULL);
 }
 
+/*
+ * Ensure that if someone sets a MUST flag, that we disable all other MAY
+ * flags except for the ones corresponding to the given MUST flag. If there are
+ * multiple MUST flags, then try to prefer more secure ones.
+ */
+static void
+cifs_security_flags_handle_must_flags(unsigned int *flags)
+{
+	unsigned int signflags = *flags & CIFSSEC_MUST_SIGN;
+
+	if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
+		*flags = CIFSSEC_MUST_KRB5;
+	else if ((*flags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
+		*flags = CIFSSEC_MUST_NTLMSSP;
+	else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
+		*flags = CIFSSEC_MUST_NTLMV2;
+	else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM)
+		*flags = CIFSSEC_MUST_NTLM;
+	else if ((*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
+		*flags = CIFSSEC_MUST_LANMAN;
+	else if ((*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
+		*flags = CIFSSEC_MUST_PLNTXT;
+
+	*flags |= signflags;
+}
+
 static ssize_t cifs_security_flags_proc_write(struct file *file,
 		const char __user *buffer, size_t count, loff_t *ppos)
 {
+	int rc;
 	unsigned int flags;
 	char flags_string[12];
 	char c;
@@ -603,34 +649,43 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
 			global_secflags = CIFSSEC_MAX;
 			return count;
 		} else if (!isdigit(c)) {
-			cERROR(1, "invalid flag %c", c);
+			cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
+					flags_string);
 			return -EINVAL;
 		}
 	}
-	/* else we have a number */
 
-	flags = simple_strtoul(flags_string, NULL, 0);
+	/* else we have a number */
+	rc = kstrtouint(flags_string, 0, &flags);
+	if (rc) {
+		cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
+				flags_string);
+		return rc;
+	}
 
-	cFYI(1, "sec flags 0x%x", flags);
+	cifs_dbg(FYI, "sec flags 0x%x\n", flags);
 
-	if (flags <= 0)  {
-		cERROR(1, "invalid security flags %s", flags_string);
+	if (flags == 0)  {
+		cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string);
 		return -EINVAL;
 	}
 
 	if (flags & ~CIFSSEC_MASK) {
-		cERROR(1, "attempt to set unsupported security flags 0x%x",
-			flags & ~CIFSSEC_MASK);
+		cifs_dbg(VFS, "Unsupported security flags: 0x%x\n",
+			 flags & ~CIFSSEC_MASK);
 		return -EINVAL;
 	}
+
+	cifs_security_flags_handle_must_flags(&flags);
+
 	/* flags look ok - update the global security flags for cifs module */
 	global_secflags = flags;
 	if (global_secflags & CIFSSEC_MUST_SIGN) {
 		/* requiring signing implies signing is allowed */
 		global_secflags |= CIFSSEC_MAY_SIGN;
-		cFYI(1, "packet signing now required");
+		cifs_dbg(FYI, "packet signing now required\n");
 	} else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) {
-		cFYI(1, "packet signing disabled");
+		cifs_dbg(FYI, "packet signing disabled\n");
 	}
 	/* BB should we turn on MAY flags for other MUST options? */
 	return count;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 69ae3d3c3b3..c99b40fb609 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -25,18 +25,20 @@
 void cifs_dump_mem(char *label, void *data, int length);
 void cifs_dump_detail(void *);
 void cifs_dump_mids(struct TCP_Server_Info *);
-#ifdef CONFIG_CIFS_DEBUG2
-#define DBG2 2
-#else
-#define DBG2 0
-#endif
 extern int traceSMB;		/* flag which enables the function below */
 void dump_smb(void *, int);
 #define CIFS_INFO	0x01
 #define CIFS_RC		0x02
 #define CIFS_TIMER	0x04
 
+#define VFS 1
+#define FYI 2
 extern int cifsFYI;
+#ifdef CONFIG_CIFS_DEBUG2
+#define NOISY 4
+#else
+#define NOISY 0
+#endif
 
 /*
  *	debug ON
@@ -44,31 +46,21 @@ extern int cifsFYI;
  */
 #ifdef CONFIG_CIFS_DEBUG
 
-/* information message: e.g., configuration, major event */
-#define cifsfyi(fmt, ...)						\
-do {									\
-	if (cifsFYI & CIFS_INFO)					\
-		printk(KERN_DEBUG "%s: " fmt "\n",			\
-		       __FILE__, ##__VA_ARGS__);			\
-} while (0)
-
-#define cFYI(set, fmt, ...)						\
-do {									\
-	if (set)							\
-		cifsfyi(fmt, ##__VA_ARGS__);				\
-} while (0)
+__printf(1, 2) void cifs_vfs_err(const char *fmt, ...);
 
-#define cifswarn(fmt, ...)						\
-	printk(KERN_WARNING fmt "\n", ##__VA_ARGS__)
-
-/* error event message: e.g., i/o error */
-#define cifserror(fmt, ...)						\
-	printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__);		\
-
-#define cERROR(set, fmt, ...)						\
+/* information message: e.g., configuration, major event */
+#define cifs_dbg(type, fmt, ...)					\
 do {									\
-	if (set)							\
-		cifserror(fmt, ##__VA_ARGS__);				\
+	if (type == FYI) {						\
+		if (cifsFYI & CIFS_INFO) {				\
+			printk(KERN_DEBUG "%s: " fmt,			\
+			       __FILE__, ##__VA_ARGS__);		\
+		}							\
+	} else if (type == VFS) {					\
+		cifs_vfs_err(fmt, ##__VA_ARGS__);			\
+	} else if (type == NOISY && type != 0) {			\
+		printk(KERN_DEBUG fmt, ##__VA_ARGS__);			\
+	}								\
 } while (0)
 
 /*
@@ -76,27 +68,11 @@ do {									\
  *	---------
  */
 #else		/* _CIFS_DEBUG */
-#define cifsfyi(fmt, ...)						\
+#define cifs_dbg(type, fmt, ...)					\
 do {									\
 	if (0)								\
-		printk(KERN_DEBUG "%s: " fmt "\n",			\
-		       __FILE__, ##__VA_ARGS__);			\
+		printk(KERN_DEBUG fmt, ##__VA_ARGS__);			\
 } while (0)
-#define cFYI(set, fmt, ...)						\
-do {									\
-	if (0 && set)							\
-		cifsfyi(fmt, ##__VA_ARGS__);				\
-} while (0)
-#define cifserror(fmt, ...)						\
-do {									\
-	if (0)								\
-		printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__);	\
-} while (0)
-#define cERROR(set, fmt, ...)						\
-do {									\
-	if (0 && set)							\
-		cifserror(fmt, ##__VA_ARGS__);				\
-} while (0)
-#endif		/* _CIFS_DEBUG */
+#endif
 
 #endif				/* _H_CIFS_DEBUG */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 210fce2df30..58df174deb1 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/vfs.h>
 #include <linux/fs.h>
+#include <linux/inet.h>
 #include "cifsglob.h"
 #include "cifsproto.h"
 #include "cifsfs.h"
@@ -48,58 +49,74 @@ void cifs_dfs_release_automount_timer(void)
 }
 
 /**
- * cifs_get_share_name	-	extracts share name from UNC
- * @node_name:	pointer to UNC string
+ * cifs_build_devname - build a devicename from a UNC and optional prepath
+ * @nodename:	pointer to UNC string
+ * @prepath:	pointer to prefixpath (or NULL if there isn't one)
  *
- * Extracts sharename form full UNC.
- * i.e. strips from UNC trailing path that is not part of share
- * name and fixup missing '\' in the beginning of DFS node refferal
- * if necessary.
- * Returns pointer to share name on success or ERR_PTR on error.
- * Caller is responsible for freeing returned string.
+ * Build a new cifs devicename after chasing a DFS referral. Allocate a buffer
+ * big enough to hold the final thing. Copy the UNC from the nodename, and
+ * concatenate the prepath onto the end of it if there is one.
+ *
+ * Returns pointer to the built string, or a ERR_PTR. Caller is responsible
+ * for freeing the returned string.
  */
-static char *cifs_get_share_name(const char *node_name)
+static char *
+cifs_build_devname(char *nodename, const char *prepath)
 {
-	int len;
-	char *UNC;
-	char *pSep;
-
-	len = strlen(node_name);
-	UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */,
-			 GFP_KERNEL);
-	if (!UNC)
-		return ERR_PTR(-ENOMEM);
+	size_t pplen;
+	size_t unclen;
+	char *dev;
+	char *pos;
+
+	/* skip over any preceding delimiters */
+	nodename += strspn(nodename, "\\");
+	if (!*nodename)
+		return ERR_PTR(-EINVAL);
 
-	/* get share name and server name */
-	if (node_name[1] != '\\') {
-		UNC[0] = '\\';
-		strncpy(UNC+1, node_name, len);
-		len++;
-		UNC[len] = 0;
-	} else {
-		strncpy(UNC, node_name, len);
-		UNC[len] = 0;
-	}
+	/* get length of UNC and set pos to last char */
+	unclen = strlen(nodename);
+	pos = nodename + unclen - 1;
 
-	/* find server name end */
-	pSep = memchr(UNC+2, '\\', len-2);
-	if (!pSep) {
-		cERROR(1, "%s: no server name end in node name: %s",
-			__func__, node_name);
-		kfree(UNC);
-		return ERR_PTR(-EINVAL);
+	/* trim off any trailing delimiters */
+	while (*pos == '\\') {
+		--pos;
+		--unclen;
 	}
 
-	/* find sharename end */
-	pSep++;
-	pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC));
-	if (pSep) {
-		/* trim path up to sharename end
-		 * now we have share name in UNC */
-		*pSep = 0;
+	/* allocate a buffer:
+	 * +2 for preceding "//"
+	 * +1 for delimiter between UNC and prepath
+	 * +1 for trailing NULL
+	 */
+	pplen = prepath ? strlen(prepath) : 0;
+	dev = kmalloc(2 + unclen + 1 + pplen + 1, GFP_KERNEL);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	pos = dev;
+	/* add the initial "//" */
+	*pos = '/';
+	++pos;
+	*pos = '/';
+	++pos;
+
+	/* copy in the UNC portion from referral */
+	memcpy(pos, nodename, unclen);
+	pos += unclen;
+
+	/* copy the prefixpath remainder (if there is one) */
+	if (pplen) {
+		*pos = '/';
+		++pos;
+		memcpy(pos, prepath, pplen);
+		pos += pplen;
 	}
 
-	return UNC;
+	/* NULL terminator */
+	*pos = '\0';
+
+	convert_delimiter(dev, '/');
+	return dev;
 }
 
 
@@ -123,6 +140,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 {
 	int rc;
 	char *mountdata = NULL;
+	const char *prepath = NULL;
 	int md_len;
 	char *tkn_e;
 	char *srvIP = NULL;
@@ -132,7 +150,10 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 	if (sb_mountdata == NULL)
 		return ERR_PTR(-EINVAL);
 
-	*devname = cifs_get_share_name(ref->node_name);
+	if (strlen(fullpath) - ref->path_consumed)
+		prepath = fullpath + ref->path_consumed;
+
+	*devname = cifs_build_devname(ref->node_name, prepath);
 	if (IS_ERR(*devname)) {
 		rc = PTR_ERR(*devname);
 		*devname = NULL;
@@ -141,17 +162,19 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 
 	rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
 	if (rc < 0) {
-		cFYI(1, "%s: Failed to resolve server part of %s to IP: %d",
-			__func__, *devname, rc);
+		cifs_dbg(FYI, "%s: Failed to resolve server part of %s to IP: %d\n",
+			 __func__, *devname, rc);
 		goto compose_mount_options_err;
 	}
 
-	/* md_len = strlen(...) + 12 for 'sep+prefixpath='
-	 * assuming that we have 'unc=' and 'ip=' in
-	 * the original sb_mountdata
+	/*
+	 * In most cases, we'll be building a shorter string than the original,
+	 * but we do have to assume that the address in the ip= option may be
+	 * much longer than the original. Add the max length of an address
+	 * string to the length of the original string to allow for worst case.
 	 */
-	md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12;
-	mountdata = kzalloc(md_len+1, GFP_KERNEL);
+	md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN;
+	mountdata = kzalloc(md_len + 1, GFP_KERNEL);
 	if (mountdata == NULL) {
 		rc = -ENOMEM;
 		goto compose_mount_options_err;
@@ -195,29 +218,9 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 		strncat(mountdata, &sep, 1);
 	strcat(mountdata, "ip=");
 	strcat(mountdata, srvIP);
-	strncat(mountdata, &sep, 1);
-	strcat(mountdata, "unc=");
-	strcat(mountdata, *devname);
-
-	/* find & copy prefixpath */
-	tkn_e = strchr(ref->node_name + 2, '\\');
-	if (tkn_e == NULL) {
-		/* invalid unc, missing share name*/
-		rc = -EINVAL;
-		goto compose_mount_options_err;
-	}
-
-	tkn_e = strchr(tkn_e + 1, '\\');
-	if (tkn_e || (strlen(fullpath) - ref->path_consumed)) {
-		strncat(mountdata, &sep, 1);
-		strcat(mountdata, "prefixpath=");
-		if (tkn_e)
-			strcat(mountdata, tkn_e + 1);
-		strcat(mountdata, fullpath + ref->path_consumed);
-	}
 
-	/*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/
-	/*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/
+	/*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/
+	/*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/
 
 compose_mount_options_out:
 	kfree(srvIP);
@@ -260,11 +263,12 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
 
 static void dump_referral(const struct dfs_info3_param *ref)
 {
-	cFYI(1, "DFS: ref path: %s", ref->path_name);
-	cFYI(1, "DFS: node path: %s", ref->node_name);
-	cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type);
-	cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
-				ref->path_consumed);
+	cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name);
+	cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name);
+	cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n",
+		 ref->flags, ref->server_type);
+	cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n",
+		 ref->ref_flag, ref->path_consumed);
 }
 
 /*
@@ -283,7 +287,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 	struct vfsmount *mnt;
 	struct tcon_link *tlink;
 
-	cFYI(1, "in %s", __func__);
+	cifs_dbg(FYI, "in %s\n", __func__);
 	BUG_ON(IS_ROOT(mntpt));
 
 	/*
@@ -320,15 +324,15 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 		/* connect to a node */
 		len = strlen(referrals[i].node_name);
 		if (len < 2) {
-			cERROR(1, "%s: Net Address path too short: %s",
-					__func__, referrals[i].node_name);
+			cifs_dbg(VFS, "%s: Net Address path too short: %s\n",
+				 __func__, referrals[i].node_name);
 			mnt = ERR_PTR(-EINVAL);
 			break;
 		}
 		mnt = cifs_dfs_do_refmount(cifs_sb,
 				full_path, referrals + i);
-		cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
-					referrals[i].node_name, mnt);
+		cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n",
+			 __func__, referrals[i].node_name, mnt);
 		if (!IS_ERR(mnt))
 			goto success;
 	}
@@ -343,7 +347,7 @@ success:
 free_full_path:
 	kfree(full_path);
 cdda_exit:
-	cFYI(1, "leaving %s" , __func__);
+	cifs_dbg(FYI, "leaving %s\n" , __func__);
 	return mnt;
 }
 
@@ -354,11 +358,11 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path)
 {
 	struct vfsmount *newmnt;
 
-	cFYI(1, "in %s", __func__);
+	cifs_dbg(FYI, "in %s\n", __func__);
 
 	newmnt = cifs_dfs_do_automount(path->dentry);
 	if (IS_ERR(newmnt)) {
-		cFYI(1, "leaving %s [automount failed]" , __func__);
+		cifs_dbg(FYI, "leaving %s [automount failed]\n" , __func__);
 		return newmnt;
 	}
 
@@ -366,7 +370,7 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path)
 	mnt_set_expiry(newmnt, &cifs_dfs_automount_list);
 	schedule_delayed_work(&cifs_dfs_automount_task,
 			      cifs_dfs_mountpoint_expiry_timeout);
-	cFYI(1, "leaving %s [ok]" , __func__);
+	cifs_dbg(FYI, "leaving %s [ok]\n" , __func__);
 	return newmnt;
 }
 
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index c865bfdfe81..37e4a72a7d1 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -55,10 +55,10 @@ struct cifs_sb_info {
 	unsigned int wsize;
 	unsigned long actimeo; /* attribute cache timeout (jiffies) */
 	atomic_t active;
-	uid_t	mnt_uid;
-	gid_t	mnt_gid;
-	uid_t	mnt_backupuid;
-	gid_t	mnt_backupgid;
+	kuid_t	mnt_uid;
+	kgid_t	mnt_gid;
+	kuid_t	mnt_backupuid;
+	kgid_t	mnt_backupgid;
 	umode_t	mnt_file_mode;
 	umode_t	mnt_dir_mode;
 	unsigned int mnt_cifs_flags;
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 086f381d648..a3e93254761 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -37,12 +37,11 @@ cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
 	int ret;
 
 	ret = -ENOMEM;
-	payload = kmalloc(prep->datalen, GFP_KERNEL);
+	payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
 	if (!payload)
 		goto error;
 
 	/* attach the data */
-	memcpy(payload, prep->data, prep->datalen);
 	key->payload.data = payload;
 	ret = 0;
 
@@ -149,10 +148,12 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
 		goto out;
 
 	dp = description + strlen(description);
-	sprintf(dp, ";uid=0x%x", sesInfo->linux_uid);
+	sprintf(dp, ";uid=0x%x",
+		from_kuid_munged(&init_user_ns, sesInfo->linux_uid));
 
 	dp = description + strlen(description);
-	sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
+	sprintf(dp, ";creduid=0x%x",
+		from_kuid_munged(&init_user_ns, sesInfo->cred_uid));
 
 	if (sesInfo->user_name) {
 		dp = description + strlen(description);
@@ -162,7 +163,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
 	dp = description + strlen(description);
 	sprintf(dp, ";pid=0x%x", current->pid);
 
-	cFYI(1, "key description = %s", description);
+	cifs_dbg(FYI, "key description = %s\n", description);
 	spnego_key = request_key(&cifs_spnego_key_type, description, "");
 
 #ifdef CONFIG_CIFS_DEBUG2
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 71d5d0a5f6b..0227b45ef00 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -227,8 +227,8 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
 	for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
 		charlen = codepage->char2uni(from, len, &wchar_to);
 		if (charlen < 1) {
-			cERROR(1, "strtoUTF16: char2uni of 0x%x returned %d",
-				*from, charlen);
+			cifs_dbg(VFS, "strtoUTF16: char2uni of 0x%x returned %d\n",
+				 *from, charlen);
 			/* A question mark */
 			wchar_to = 0x003f;
 			charlen = 1;
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 4fb097468e2..fe8d6276410 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -327,14 +327,14 @@ UniToupper(register wchar_t uc)
 /*
  * UniStrupr:  Upper case a unicode string
  */
-static inline wchar_t *
-UniStrupr(register wchar_t *upin)
+static inline __le16 *
+UniStrupr(register __le16 *upin)
 {
-	register wchar_t *up;
+	register __le16 *up;
 
 	up = upin;
 	while (*up) {		/* For all characters */
-		*up = UniToupper(*up);
+		*up = cpu_to_le16(UniToupper(le16_to_cpu(*up)));
 		up++;
 	}
 	return upin;		/* Return input pointer */
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 5cbd00e7406..51f5e0ee723 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -63,11 +63,10 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
 		key->datalen = prep->datalen;
 		return 0;
 	}
-	payload = kmalloc(prep->datalen, GFP_KERNEL);
+	payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
 	if (!payload)
 		return -ENOMEM;
 
-	memcpy(payload, prep->data, prep->datalen);
 	key->payload.data = payload;
 	key->datalen = prep->datalen;
 	return 0;
@@ -219,13 +218,13 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
 	sidkey = request_key(&cifs_idmap_key_type, desc, "");
 	if (IS_ERR(sidkey)) {
 		rc = -EINVAL;
-		cFYI(1, "%s: Can't map %cid %u to a SID", __func__,
-			sidtype == SIDOWNER ? 'u' : 'g', cid);
+		cifs_dbg(FYI, "%s: Can't map %cid %u to a SID\n",
+			 __func__, sidtype == SIDOWNER ? 'u' : 'g', cid);
 		goto out_revert_creds;
 	} else if (sidkey->datalen < CIFS_SID_BASE_SIZE) {
 		rc = -EIO;
-		cFYI(1, "%s: Downcall contained malformed key "
-			"(datalen=%hu)", __func__, sidkey->datalen);
+		cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n",
+			 __func__, sidkey->datalen);
 		goto invalidate_key;
 	}
 
@@ -241,8 +240,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
 	ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
 	if (ksid_size > sidkey->datalen) {
 		rc = -EIO;
-		cFYI(1, "%s: Downcall contained malformed key (datalen=%hu, "
-			"ksid_size=%u)", __func__, sidkey->datalen, ksid_size);
+		cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu, ksid_size=%u)\n",
+			 __func__, sidkey->datalen, ksid_size);
 		goto invalidate_key;
 	}
 
@@ -266,16 +265,16 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
 	struct key *sidkey;
 	char *sidstr;
 	const struct cred *saved_cred;
-	uid_t fuid = cifs_sb->mnt_uid;
-	gid_t fgid = cifs_sb->mnt_gid;
+	kuid_t fuid = cifs_sb->mnt_uid;
+	kgid_t fgid = cifs_sb->mnt_gid;
 
 	/*
 	 * If we have too many subauthorities, then something is really wrong.
 	 * Just return an error.
 	 */
 	if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) {
-		cFYI(1, "%s: %u subauthorities is too many!", __func__,
-			psid->num_subauth);
+		cifs_dbg(FYI, "%s: %u subauthorities is too many!\n",
+			 __func__, psid->num_subauth);
 		return -EIO;
 	}
 
@@ -287,8 +286,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
 	sidkey = request_key(&cifs_idmap_key_type, sidstr, "");
 	if (IS_ERR(sidkey)) {
 		rc = -EINVAL;
-		cFYI(1, "%s: Can't map SID %s to a %cid", __func__, sidstr,
-			sidtype == SIDOWNER ? 'u' : 'g');
+		cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n",
+			 __func__, sidstr, sidtype == SIDOWNER ? 'u' : 'g');
 		goto out_revert_creds;
 	}
 
@@ -297,18 +296,30 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
 	 * probably a safe assumption but might be better to check based on
 	 * sidtype.
 	 */
+	BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
 	if (sidkey->datalen != sizeof(uid_t)) {
 		rc = -EIO;
-		cFYI(1, "%s: Downcall contained malformed key "
-			"(datalen=%hu)", __func__, sidkey->datalen);
+		cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n",
+			 __func__, sidkey->datalen);
 		key_invalidate(sidkey);
 		goto out_key_put;
 	}
 
-	if (sidtype == SIDOWNER)
-		memcpy(&fuid, &sidkey->payload.value, sizeof(uid_t));
-	else
-		memcpy(&fgid, &sidkey->payload.value, sizeof(gid_t));
+	if (sidtype == SIDOWNER) {
+		kuid_t uid;
+		uid_t id;
+		memcpy(&id, &sidkey->payload.value, sizeof(uid_t));
+		uid = make_kuid(&init_user_ns, id);
+		if (uid_valid(uid))
+			fuid = uid;
+	} else {
+		kgid_t gid;
+		gid_t id;
+		memcpy(&id, &sidkey->payload.value, sizeof(gid_t));
+		gid = make_kgid(&init_user_ns, id);
+		if (gid_valid(gid))
+			fgid = gid;
+	}
 
 out_key_put:
 	key_put(sidkey);
@@ -334,7 +345,8 @@ init_cifs_idmap(void)
 	struct key *keyring;
 	int ret;
 
-	cFYI(1, "Registering the %s key type", cifs_idmap_key_type.name);
+	cifs_dbg(FYI, "Registering the %s key type\n",
+		 cifs_idmap_key_type.name);
 
 	/* create an override credential set with a special thread keyring in
 	 * which requests are cached
@@ -346,7 +358,8 @@ init_cifs_idmap(void)
 	if (!cred)
 		return -ENOMEM;
 
-	keyring = keyring_alloc(".cifs_idmap", 0, 0, cred,
+	keyring = keyring_alloc(".cifs_idmap",
+				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
 				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
 				KEY_USR_VIEW | KEY_USR_READ,
 				KEY_ALLOC_NOT_IN_QUOTA, NULL);
@@ -366,7 +379,7 @@ init_cifs_idmap(void)
 	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
 	root_cred = cred;
 
-	cFYI(1, "cifs idmap keyring: %d", key_serial(keyring));
+	cifs_dbg(FYI, "cifs idmap keyring: %d\n", key_serial(keyring));
 	return 0;
 
 failed_put_key:
@@ -382,7 +395,7 @@ exit_cifs_idmap(void)
 	key_revoke(root_cred->thread_keyring);
 	unregister_key_type(&cifs_idmap_key_type);
 	put_cred(root_cred);
-	cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name);
+	cifs_dbg(FYI, "Unregistered %s key type\n", cifs_idmap_key_type.name);
 }
 
 /* copy ntsd, owner sid, and group sid from a security descriptor to another */
@@ -449,14 +462,14 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
 			*pbits_to_set &= ~S_IXUGO;
 		return;
 	} else if (type != ACCESS_ALLOWED) {
-		cERROR(1, "unknown access control type %d", type);
+		cifs_dbg(VFS, "unknown access control type %d\n", type);
 		return;
 	}
 	/* else ACCESS_ALLOWED type */
 
 	if (flags & GENERIC_ALL) {
 		*pmode |= (S_IRWXUGO & (*pbits_to_set));
-		cFYI(DBG2, "all perms");
+		cifs_dbg(NOISY, "all perms\n");
 		return;
 	}
 	if ((flags & GENERIC_WRITE) ||
@@ -469,7 +482,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
 			((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
 		*pmode |= (S_IXUGO & (*pbits_to_set));
 
-	cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode);
+	cifs_dbg(NOISY, "access flags 0x%x mode now 0x%x\n", flags, *pmode);
 	return;
 }
 
@@ -498,7 +511,8 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
 	if (mode & S_IXUGO)
 		*pace_flags |= SET_FILE_EXEC_RIGHTS;
 
-	cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags);
+	cifs_dbg(NOISY, "mode: 0x%x, access flags now 0x%x\n",
+		 mode, *pace_flags);
 	return;
 }
 
@@ -538,24 +552,24 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
 	/* validate that we do not go past end of acl */
 
 	if (le16_to_cpu(pace->size) < 16) {
-		cERROR(1, "ACE too small %d", le16_to_cpu(pace->size));
+		cifs_dbg(VFS, "ACE too small %d\n", le16_to_cpu(pace->size));
 		return;
 	}
 
 	if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
-		cERROR(1, "ACL too small to parse ACE");
+		cifs_dbg(VFS, "ACL too small to parse ACE\n");
 		return;
 	}
 
 	num_subauth = pace->sid.num_subauth;
 	if (num_subauth) {
 		int i;
-		cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d",
-			pace->sid.revision, pace->sid.num_subauth, pace->type,
-			pace->flags, le16_to_cpu(pace->size));
+		cifs_dbg(FYI, "ACE revision %d num_auth %d type %d flags %d size %d\n",
+			 pace->sid.revision, pace->sid.num_subauth, pace->type,
+			 pace->flags, le16_to_cpu(pace->size));
 		for (i = 0; i < num_subauth; ++i) {
-			cFYI(1, "ACE sub_auth[%d]: 0x%x", i,
-				le32_to_cpu(pace->sid.sub_auth[i]));
+			cifs_dbg(FYI, "ACE sub_auth[%d]: 0x%x\n",
+				 i, le32_to_cpu(pace->sid.sub_auth[i]));
 		}
 
 		/* BB add length check to make sure that we do not have huge
@@ -588,13 +602,13 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 
 	/* validate that we do not go past end of acl */
 	if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
-		cERROR(1, "ACL too small to parse DACL");
+		cifs_dbg(VFS, "ACL too small to parse DACL\n");
 		return;
 	}
 
-	cFYI(DBG2, "DACL revision %d size %d num aces %d",
-		le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
-		le32_to_cpu(pdacl->num_aces));
+	cifs_dbg(NOISY, "DACL revision %d size %d num aces %d\n",
+		 le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
+		 le32_to_cpu(pdacl->num_aces));
 
 	/* reset rwx permissions for user/group/other.
 	   Also, if num_aces is 0 i.e. DACL has no ACEs,
@@ -614,10 +628,8 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 			return;
 		ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
 				GFP_KERNEL);
-		if (!ppace) {
-			cERROR(1, "DACL memory allocation error");
+		if (!ppace)
 			return;
-		}
 
 		for (i = 0; i < num_aces; ++i) {
 			ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
@@ -690,25 +702,25 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
 	/* validate that we do not go past end of ACL - sid must be at least 8
 	   bytes long (assuming no sub-auths - e.g. the null SID */
 	if (end_of_acl < (char *)psid + 8) {
-		cERROR(1, "ACL too small to parse SID %p", psid);
+		cifs_dbg(VFS, "ACL too small to parse SID %p\n", psid);
 		return -EINVAL;
 	}
 
 #ifdef CONFIG_CIFS_DEBUG2
 	if (psid->num_subauth) {
 		int i;
-		cFYI(1, "SID revision %d num_auth %d",
-			psid->revision, psid->num_subauth);
+		cifs_dbg(FYI, "SID revision %d num_auth %d\n",
+			 psid->revision, psid->num_subauth);
 
 		for (i = 0; i < psid->num_subauth; i++) {
-			cFYI(1, "SID sub_auth[%d]: 0x%x ", i,
-				le32_to_cpu(psid->sub_auth[i]));
+			cifs_dbg(FYI, "SID sub_auth[%d]: 0x%x\n",
+				 i, le32_to_cpu(psid->sub_auth[i]));
 		}
 
 		/* BB add length check to make sure that we do not have huge
 			num auths and therefore go off the end */
-		cFYI(1, "RID 0x%x",
-			le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
+		cifs_dbg(FYI, "RID 0x%x\n",
+			 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
 	}
 #endif
 
@@ -735,31 +747,33 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
 				le32_to_cpu(pntsd->gsidoffset));
 	dacloffset = le32_to_cpu(pntsd->dacloffset);
 	dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
-	cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x "
-		 "sacloffset 0x%x dacloffset 0x%x",
+	cifs_dbg(NOISY, "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n",
 		 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
 		 le32_to_cpu(pntsd->gsidoffset),
 		 le32_to_cpu(pntsd->sacloffset), dacloffset);
 /*	cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
 	rc = parse_sid(owner_sid_ptr, end_of_acl);
 	if (rc) {
-		cFYI(1, "%s: Error %d parsing Owner SID", __func__, rc);
+		cifs_dbg(FYI, "%s: Error %d parsing Owner SID\n", __func__, rc);
 		return rc;
 	}
 	rc = sid_to_id(cifs_sb, owner_sid_ptr, fattr, SIDOWNER);
 	if (rc) {
-		cFYI(1, "%s: Error %d mapping Owner SID to uid", __func__, rc);
+		cifs_dbg(FYI, "%s: Error %d mapping Owner SID to uid\n",
+			 __func__, rc);
 		return rc;
 	}
 
 	rc = parse_sid(group_sid_ptr, end_of_acl);
 	if (rc) {
-		cFYI(1, "%s: Error %d mapping Owner SID to gid", __func__, rc);
+		cifs_dbg(FYI, "%s: Error %d mapping Owner SID to gid\n",
+			 __func__, rc);
 		return rc;
 	}
 	rc = sid_to_id(cifs_sb, group_sid_ptr, fattr, SIDGROUP);
 	if (rc) {
-		cFYI(1, "%s: Error %d mapping Group SID to gid", __func__, rc);
+		cifs_dbg(FYI, "%s: Error %d mapping Group SID to gid\n",
+			 __func__, rc);
 		return rc;
 	}
 
@@ -767,14 +781,14 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
 		parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
 			   group_sid_ptr, fattr);
 	else
-		cFYI(1, "no ACL"); /* BB grant all or default perms? */
+		cifs_dbg(FYI, "no ACL\n"); /* BB grant all or default perms? */
 
 	return rc;
 }
 
 /* Convert permission bits from mode to equivalent CIFS ACL */
 static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
-	__u32 secdesclen, __u64 nmode, uid_t uid, gid_t gid, int *aclflag)
+	__u32 secdesclen, __u64 nmode, kuid_t uid, kgid_t gid, int *aclflag)
 {
 	int rc = 0;
 	__u32 dacloffset;
@@ -806,17 +820,19 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
 		*aclflag = CIFS_ACL_DACL;
 	} else {
 		memcpy(pnntsd, pntsd, secdesclen);
-		if (uid != NO_CHANGE_32) { /* chown */
+		if (uid_valid(uid)) { /* chown */
+			uid_t id;
 			owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
 					le32_to_cpu(pnntsd->osidoffset));
 			nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid),
 								GFP_KERNEL);
 			if (!nowner_sid_ptr)
 				return -ENOMEM;
-			rc = id_to_sid(uid, SIDOWNER, nowner_sid_ptr);
+			id = from_kuid(&init_user_ns, uid);
+			rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr);
 			if (rc) {
-				cFYI(1, "%s: Mapping error %d for owner id %d",
-						__func__, rc, uid);
+				cifs_dbg(FYI, "%s: Mapping error %d for owner id %d\n",
+					 __func__, rc, id);
 				kfree(nowner_sid_ptr);
 				return rc;
 			}
@@ -824,17 +840,19 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
 			kfree(nowner_sid_ptr);
 			*aclflag = CIFS_ACL_OWNER;
 		}
-		if (gid != NO_CHANGE_32) { /* chgrp */
+		if (gid_valid(gid)) { /* chgrp */
+			gid_t id;
 			group_sid_ptr = (struct cifs_sid *)((char *)pnntsd +
 					le32_to_cpu(pnntsd->gsidoffset));
 			ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid),
 								GFP_KERNEL);
 			if (!ngroup_sid_ptr)
 				return -ENOMEM;
-			rc = id_to_sid(gid, SIDGROUP, ngroup_sid_ptr);
+			id = from_kgid(&init_user_ns, gid);
+			rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr);
 			if (rc) {
-				cFYI(1, "%s: Mapping error %d for group id %d",
-						__func__, rc, gid);
+				cifs_dbg(FYI, "%s: Mapping error %d for group id %d\n",
+					 __func__, rc, id);
 				kfree(ngroup_sid_ptr);
 				return rc;
 			}
@@ -864,7 +882,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
 
 	cifs_put_tlink(tlink);
 
-	cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
+	cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen);
 	if (rc)
 		return ERR_PTR(rc);
 	return pntsd;
@@ -901,7 +919,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 	cifs_put_tlink(tlink);
 	free_xid(xid);
 
-	cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen);
+	cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen);
 	if (rc)
 		return ERR_PTR(rc);
 	return pntsd;
@@ -955,12 +973,12 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 			create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
-		cERROR(1, "Unable to open file to set ACL");
+		cifs_dbg(VFS, "Unable to open file to set ACL\n");
 		goto out;
 	}
 
 	rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag);
-	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
+	cifs_dbg(NOISY, "SetCIFSACL rc = %d\n", rc);
 
 	CIFSSMBClose(xid, tcon, fid);
 out:
@@ -978,7 +996,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
 	u32 acllen = 0;
 	int rc = 0;
 
-	cFYI(DBG2, "converting ACL to mode for %s", path);
+	cifs_dbg(NOISY, "converting ACL to mode for %s\n", path);
 
 	if (pfid)
 		pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
@@ -988,12 +1006,12 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
 	/* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
 	if (IS_ERR(pntsd)) {
 		rc = PTR_ERR(pntsd);
-		cERROR(1, "%s: error %d getting sec desc", __func__, rc);
+		cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
 	} else {
 		rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr);
 		kfree(pntsd);
 		if (rc)
-			cERROR(1, "parse sec desc failed rc = %d", rc);
+			cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc);
 	}
 
 	return rc;
@@ -1002,7 +1020,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
 /* Convert mode bits to an ACL so we can update the ACL on the server */
 int
 id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
-			uid_t uid, gid_t gid)
+			kuid_t uid, kgid_t gid)
 {
 	int rc = 0;
 	int aclflag = CIFS_ACL_DACL; /* default flag to set */
@@ -1010,13 +1028,13 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
 	struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
 	struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
 
-	cFYI(DBG2, "set ACL from mode for %s", path);
+	cifs_dbg(NOISY, "set ACL from mode for %s\n", path);
 
 	/* Get the security descriptor */
 	pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
 	if (IS_ERR(pntsd)) {
 		rc = PTR_ERR(pntsd);
-		cERROR(1, "%s: error %d getting sec desc", __func__, rc);
+		cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
 		goto out;
 	}
 
@@ -1029,7 +1047,6 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
 	secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN);
 	pnntsd = kmalloc(secdesclen, GFP_KERNEL);
 	if (!pnntsd) {
-		cERROR(1, "Unable to allocate security descriptor");
 		kfree(pntsd);
 		return -ENOMEM;
 	}
@@ -1037,12 +1054,12 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
 	rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
 				&aclflag);
 
-	cFYI(DBG2, "build_sec_desc rc: %d", rc);
+	cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc);
 
 	if (!rc) {
 		/* Set the security descriptor */
 		rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag);
-		cFYI(DBG2, "set_cifs_acl rc: %d", rc);
+		cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc);
 	}
 
 	kfree(pnntsd);
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 652f5051be0..3d8bf941d12 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -50,20 +50,20 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
 		return -EINVAL;
 
 	if (!server->secmech.sdescmd5) {
-		cERROR(1, "%s: Can't generate signature", __func__);
+		cifs_dbg(VFS, "%s: Can't generate signature\n", __func__);
 		return -1;
 	}
 
 	rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
 	if (rc) {
-		cERROR(1, "%s: Could not init md5", __func__);
+		cifs_dbg(VFS, "%s: Could not init md5\n", __func__);
 		return rc;
 	}
 
 	rc = crypto_shash_update(&server->secmech.sdescmd5->shash,
 		server->session_key.response, server->session_key.len);
 	if (rc) {
-		cERROR(1, "%s: Could not update with response", __func__);
+		cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
 		return rc;
 	}
 
@@ -71,7 +71,7 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
 		if (iov[i].iov_len == 0)
 			continue;
 		if (iov[i].iov_base == NULL) {
-			cERROR(1, "null iovec entry");
+			cifs_dbg(VFS, "null iovec entry\n");
 			return -EIO;
 		}
 		/* The first entry includes a length field (which does not get
@@ -88,8 +88,8 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
 				iov[i].iov_base, iov[i].iov_len);
 		}
 		if (rc) {
-			cERROR(1, "%s: Could not update with payload",
-							__func__);
+			cifs_dbg(VFS, "%s: Could not update with payload\n",
+				 __func__);
 			return rc;
 		}
 	}
@@ -106,7 +106,7 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
 
 	rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
 	if (rc)
-		cERROR(1, "%s: Could not generate md5 hash", __func__);
+		cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
 
 	return rc;
 }
@@ -135,8 +135,8 @@ int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
 				cpu_to_le32(server->sequence_number);
 	cifs_pdu->Signature.Sequence.Reserved = 0;
 
-	*pexpected_response_sequence_number = server->sequence_number++;
-	server->sequence_number++;
+	*pexpected_response_sequence_number = ++server->sequence_number;
+	++server->sequence_number;
 
 	rc = cifs_calc_signature(rqst, server, smb_signature);
 	if (rc)
@@ -196,8 +196,8 @@ int cifs_verify_signature(struct smb_rqst *rqst,
 
 	/* Do not need to verify session setups with signature "BSRSPYL "  */
 	if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0)
-		cFYI(1, "dummy signature received for smb command 0x%x",
-			cifs_pdu->Command);
+		cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n",
+			 cifs_pdu->Command);
 
 	/* save off the origiginal signature so we can modify the smb and check
 		its signature against what the server sent */
@@ -235,30 +235,30 @@ int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp)
 		return -EINVAL;
 
 	ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL);
-	if (!ses->auth_key.response) {
-		cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len);
+	if (!ses->auth_key.response)
 		return -ENOMEM;
-	}
+
 	ses->auth_key.len = temp_len;
 
 	rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
 			ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp);
 	if (rc) {
-		cFYI(1, "%s Can't generate NTLM response, error: %d",
-			__func__, rc);
+		cifs_dbg(FYI, "%s Can't generate NTLM response, error: %d\n",
+			 __func__, rc);
 		return rc;
 	}
 
 	rc = E_md4hash(ses->password, temp_key, nls_cp);
 	if (rc) {
-		cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
+		cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n",
+			 __func__, rc);
 		return rc;
 	}
 
 	rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
 	if (rc)
-		cFYI(1, "%s Can't generate NTLM session key, error: %d",
-			__func__, rc);
+		cifs_dbg(FYI, "%s Can't generate NTLM session key, error: %d\n",
+			 __func__, rc);
 
 	return rc;
 }
@@ -276,7 +276,6 @@ int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
 		strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
 
 	if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
-		memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
 		memcpy(lnm_session_key, password_with_pad,
 			CIFS_ENCPWD_SIZE);
 		return 0;
@@ -334,7 +333,6 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
 	ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
 	if (!ses->auth_key.response) {
 		ses->auth_key.len = 0;
-		cERROR(1, "Challenge target info allocation failure");
 		return -ENOMEM;
 	}
 
@@ -415,12 +413,12 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 	int rc = 0;
 	int len;
 	char nt_hash[CIFS_NTHASH_SIZE];
-	wchar_t *user;
+	__le16 *user;
 	wchar_t *domain;
 	wchar_t *server;
 
 	if (!ses->server->secmech.sdeschmacmd5) {
-		cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash");
+		cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
 		return -1;
 	}
 
@@ -430,27 +428,26 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 	rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash,
 				CIFS_NTHASH_SIZE);
 	if (rc) {
-		cERROR(1, "%s: Could not set NT Hash as a key", __func__);
+		cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__);
 		return rc;
 	}
 
 	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
 	if (rc) {
-		cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5");
+		cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__);
 		return rc;
 	}
 
-	/* convert ses->user_name to unicode and uppercase */
+	/* convert ses->user_name to unicode */
 	len = ses->user_name ? strlen(ses->user_name) : 0;
 	user = kmalloc(2 + (len * 2), GFP_KERNEL);
 	if (user == NULL) {
-		cERROR(1, "calc_ntlmv2_hash: user mem alloc failure");
 		rc = -ENOMEM;
 		return rc;
 	}
 
 	if (len) {
-		len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp);
+		len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp);
 		UniStrupr(user);
 	} else {
 		memset(user, '\0', 2);
@@ -460,7 +457,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 				(char *)user, 2 * len);
 	kfree(user);
 	if (rc) {
-		cERROR(1, "%s: Could not update with user", __func__);
+		cifs_dbg(VFS, "%s: Could not update with user\n", __func__);
 		return rc;
 	}
 
@@ -470,7 +467,6 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 
 		domain = kmalloc(2 + (len * 2), GFP_KERNEL);
 		if (domain == NULL) {
-			cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
 			rc = -ENOMEM;
 			return rc;
 		}
@@ -481,8 +477,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 					(char *)domain, 2 * len);
 		kfree(domain);
 		if (rc) {
-			cERROR(1, "%s: Could not update with domain",
-								__func__);
+			cifs_dbg(VFS, "%s: Could not update with domain\n",
+				 __func__);
 			return rc;
 		}
 	} else if (ses->serverName) {
@@ -490,7 +486,6 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 
 		server = kmalloc(2 + (len * 2), GFP_KERNEL);
 		if (server == NULL) {
-			cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
 			rc = -ENOMEM;
 			return rc;
 		}
@@ -501,8 +496,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 					(char *)server, 2 * len);
 		kfree(server);
 		if (rc) {
-			cERROR(1, "%s: Could not update with server",
-								__func__);
+			cifs_dbg(VFS, "%s: Could not update with server\n",
+				 __func__);
 			return rc;
 		}
 	}
@@ -510,7 +505,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
 					ntlmv2_hash);
 	if (rc)
-		cERROR(1, "%s: Could not generate md5 hash", __func__);
+		cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
 
 	return rc;
 }
@@ -522,24 +517,25 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
 	unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
 
 	if (!ses->server->secmech.sdeschmacmd5) {
-		cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash");
+		cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
 		return -1;
 	}
 
 	rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
 				ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
 	if (rc) {
-		cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__);
+		cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
+			 __func__);
 		return rc;
 	}
 
 	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
 	if (rc) {
-		cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
+		cifs_dbg(VFS, "%s: could not init hmacmd5\n", __func__);
 		return rc;
 	}
 
-	if (ses->server->secType == RawNTLMSSP)
+	if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED)
 		memcpy(ses->auth_key.response + offset,
 			ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
 	else
@@ -548,14 +544,14 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
 	rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
 		ses->auth_key.response + offset, ses->auth_key.len - offset);
 	if (rc) {
-		cERROR(1, "%s: Could not update with response", __func__);
+		cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
 		return rc;
 	}
 
 	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
 		ses->auth_key.response + CIFS_SESS_KEY_SIZE);
 	if (rc)
-		cERROR(1, "%s: Could not generate md5 hash", __func__);
+		cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
 
 	return rc;
 }
@@ -571,18 +567,19 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 	char ntlmv2_hash[16];
 	unsigned char *tiblob = NULL; /* target info blob */
 
-	if (ses->server->secType == RawNTLMSSP) {
+	if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) {
 		if (!ses->domainName) {
 			rc = find_domain_name(ses, nls_cp);
 			if (rc) {
-				cERROR(1, "error %d finding domain name", rc);
+				cifs_dbg(VFS, "error %d finding domain name\n",
+					 rc);
 				goto setup_ntlmv2_rsp_ret;
 			}
 		}
 	} else {
 		rc = build_avpair_blob(ses, nls_cp);
 		if (rc) {
-			cERROR(1, "error %d building av pair blob", rc);
+			cifs_dbg(VFS, "error %d building av pair blob\n", rc);
 			goto setup_ntlmv2_rsp_ret;
 		}
 	}
@@ -595,7 +592,6 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 	if (!ses->auth_key.response) {
 		rc = ENOMEM;
 		ses->auth_key.len = 0;
-		cERROR(1, "%s: Can't allocate auth blob", __func__);
 		goto setup_ntlmv2_rsp_ret;
 	}
 	ses->auth_key.len += baselen;
@@ -613,14 +609,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 	/* calculate ntlmv2_hash */
 	rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
 	if (rc) {
-		cERROR(1, "could not get v2 hash rc %d", rc);
+		cifs_dbg(VFS, "could not get v2 hash rc %d\n", rc);
 		goto setup_ntlmv2_rsp_ret;
 	}
 
 	/* calculate first part of the client response (CR1) */
 	rc = CalcNTLMv2_response(ses, ntlmv2_hash);
 	if (rc) {
-		cERROR(1, "Could not calculate CR1  rc: %d", rc);
+		cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc);
 		goto setup_ntlmv2_rsp_ret;
 	}
 
@@ -628,13 +624,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 	rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
 		ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
 	if (rc) {
-		cERROR(1, "%s: Could not set NTLMV2 Hash as a key", __func__);
+		cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
+			 __func__);
 		goto setup_ntlmv2_rsp_ret;
 	}
 
 	rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
 	if (rc) {
-		cERROR(1, "%s: Could not init hmacmd5", __func__);
+		cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
 		goto setup_ntlmv2_rsp_ret;
 	}
 
@@ -642,14 +639,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 		ses->auth_key.response + CIFS_SESS_KEY_SIZE,
 		CIFS_HMAC_MD5_HASH_SIZE);
 	if (rc) {
-		cERROR(1, "%s: Could not update with response", __func__);
+		cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
 		goto setup_ntlmv2_rsp_ret;
 	}
 
 	rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
 		ses->auth_key.response);
 	if (rc)
-		cERROR(1, "%s: Could not generate md5 hash", __func__);
+		cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
 
 setup_ntlmv2_rsp_ret:
 	kfree(tiblob);
@@ -671,7 +668,7 @@ calc_seckey(struct cifs_ses *ses)
 	tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(tfm_arc4)) {
 		rc = PTR_ERR(tfm_arc4);
-		cERROR(1, "could not allocate crypto API arc4");
+		cifs_dbg(VFS, "could not allocate crypto API arc4\n");
 		return rc;
 	}
 
@@ -680,7 +677,8 @@ calc_seckey(struct cifs_ses *ses)
 	rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
 					CIFS_SESS_KEY_SIZE);
 	if (rc) {
-		cERROR(1, "%s: Could not set response as a key", __func__);
+		cifs_dbg(VFS, "%s: Could not set response as a key\n",
+			 __func__);
 		return rc;
 	}
 
@@ -689,7 +687,7 @@ calc_seckey(struct cifs_ses *ses)
 
 	rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
 	if (rc) {
-		cERROR(1, "could not encrypt session key rc: %d", rc);
+		cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc);
 		crypto_free_blkcipher(tfm_arc4);
 		return rc;
 	}
@@ -707,6 +705,9 @@ calc_seckey(struct cifs_ses *ses)
 void
 cifs_crypto_shash_release(struct TCP_Server_Info *server)
 {
+	if (server->secmech.cmacaes)
+		crypto_free_shash(server->secmech.cmacaes);
+
 	if (server->secmech.hmacsha256)
 		crypto_free_shash(server->secmech.hmacsha256);
 
@@ -716,6 +717,8 @@ cifs_crypto_shash_release(struct TCP_Server_Info *server)
 	if (server->secmech.hmacmd5)
 		crypto_free_shash(server->secmech.hmacmd5);
 
+	kfree(server->secmech.sdesccmacaes);
+
 	kfree(server->secmech.sdeschmacsha256);
 
 	kfree(server->secmech.sdeschmacmd5);
@@ -731,29 +734,35 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
 
 	server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
 	if (IS_ERR(server->secmech.hmacmd5)) {
-		cERROR(1, "could not allocate crypto hmacmd5");
+		cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
 		return PTR_ERR(server->secmech.hmacmd5);
 	}
 
 	server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
 	if (IS_ERR(server->secmech.md5)) {
-		cERROR(1, "could not allocate crypto md5");
+		cifs_dbg(VFS, "could not allocate crypto md5\n");
 		rc = PTR_ERR(server->secmech.md5);
 		goto crypto_allocate_md5_fail;
 	}
 
 	server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
 	if (IS_ERR(server->secmech.hmacsha256)) {
-		cERROR(1, "could not allocate crypto hmacsha256\n");
+		cifs_dbg(VFS, "could not allocate crypto hmacsha256\n");
 		rc = PTR_ERR(server->secmech.hmacsha256);
 		goto crypto_allocate_hmacsha256_fail;
 	}
 
+	server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0);
+	if (IS_ERR(server->secmech.cmacaes)) {
+		cifs_dbg(VFS, "could not allocate crypto cmac-aes");
+		rc = PTR_ERR(server->secmech.cmacaes);
+		goto crypto_allocate_cmacaes_fail;
+	}
+
 	size = sizeof(struct shash_desc) +
 			crypto_shash_descsize(server->secmech.hmacmd5);
 	server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
 	if (!server->secmech.sdeschmacmd5) {
-		cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5");
 		rc = -ENOMEM;
 		goto crypto_allocate_hmacmd5_sdesc_fail;
 	}
@@ -764,7 +773,6 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
 			crypto_shash_descsize(server->secmech.md5);
 	server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
 	if (!server->secmech.sdescmd5) {
-		cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5");
 		rc = -ENOMEM;
 		goto crypto_allocate_md5_sdesc_fail;
 	}
@@ -775,15 +783,28 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
 			crypto_shash_descsize(server->secmech.hmacsha256);
 	server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL);
 	if (!server->secmech.sdeschmacsha256) {
-		cERROR(1, "%s: Can't alloc hmacsha256\n", __func__);
 		rc = -ENOMEM;
 		goto crypto_allocate_hmacsha256_sdesc_fail;
 	}
 	server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
 	server->secmech.sdeschmacsha256->shash.flags = 0x0;
 
+	size = sizeof(struct shash_desc) +
+			crypto_shash_descsize(server->secmech.cmacaes);
+	server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL);
+	if (!server->secmech.sdesccmacaes) {
+		cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__);
+		rc = -ENOMEM;
+		goto crypto_allocate_cmacaes_sdesc_fail;
+	}
+	server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes;
+	server->secmech.sdesccmacaes->shash.flags = 0x0;
+
 	return 0;
 
+crypto_allocate_cmacaes_sdesc_fail:
+	kfree(server->secmech.sdeschmacsha256);
+
 crypto_allocate_hmacsha256_sdesc_fail:
 	kfree(server->secmech.sdescmd5);
 
@@ -791,6 +812,9 @@ crypto_allocate_md5_sdesc_fail:
 	kfree(server->secmech.sdeschmacmd5);
 
 crypto_allocate_hmacmd5_sdesc_fail:
+	crypto_free_shash(server->secmech.cmacaes);
+
+crypto_allocate_cmacaes_fail:
 	crypto_free_shash(server->secmech.hmacsha256);
 
 crypto_allocate_hmacsha256_fail:
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index de7f9168a11..4bdd547dbf6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -91,6 +91,30 @@ struct workqueue_struct	*cifsiod_wq;
 __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE];
 #endif
 
+/*
+ * Bumps refcount for cifs super block.
+ * Note that it should be only called if a referece to VFS super block is
+ * already held, e.g. in open-type syscalls context. Otherwise it can race with
+ * atomic_dec_and_test in deactivate_locked_super.
+ */
+void
+cifs_sb_active(struct super_block *sb)
+{
+	struct cifs_sb_info *server = CIFS_SB(sb);
+
+	if (atomic_inc_return(&server->active) == 1)
+		atomic_inc(&sb->s_active);
+}
+
+void
+cifs_sb_deactive(struct super_block *sb)
+{
+	struct cifs_sb_info *server = CIFS_SB(sb);
+
+	if (atomic_dec_and_test(&server->active))
+		deactivate_super(sb);
+}
+
 static int
 cifs_read_super(struct super_block *sb)
 {
@@ -137,7 +161,7 @@ cifs_read_super(struct super_block *sb)
 
 #ifdef CONFIG_CIFS_NFSD_EXPORT
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
-		cFYI(1, "export ops supported");
+		cifs_dbg(FYI, "export ops supported\n");
 		sb->s_export_op = &cifs_export_ops;
 	}
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
@@ -145,7 +169,7 @@ cifs_read_super(struct super_block *sb)
 	return 0;
 
 out_no_root:
-	cERROR(1, "cifs_read_super: get root inode failed");
+	cifs_dbg(VFS, "%s: get root inode failed\n", __func__);
 	return rc;
 }
 
@@ -288,11 +312,14 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
 }
 
 static void
-cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
+cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
 {
+	if (ses->sectype == Unspecified)
+		return;
+
 	seq_printf(s, ",sec=");
 
-	switch (server->secType) {
+	switch (ses->sectype) {
 	case LANMAN:
 		seq_printf(s, "lanman");
 		break;
@@ -314,7 +341,7 @@ cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
 		break;
 	}
 
-	if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+	if (ses->sign)
 		seq_printf(s, "i");
 }
 
@@ -345,12 +372,9 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 	srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
 
 	seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
-	cifs_show_security(s, tcon->ses->server);
+	cifs_show_security(s, tcon->ses);
 	cifs_show_cache_flavor(s, cifs_sb);
 
-	seq_printf(s, ",unc=");
-	seq_escape(s, tcon->treeName, " \t\n\\");
-
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
 		seq_printf(s, ",multiuser");
 	else if (tcon->ses->user_name)
@@ -375,13 +399,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 				   (int)(srcaddr->sa_family));
 	}
 
-	seq_printf(s, ",uid=%u", cifs_sb->mnt_uid);
+	seq_printf(s, ",uid=%u",
+		   from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid));
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
 		seq_printf(s, ",forceuid");
 	else
 		seq_printf(s, ",noforceuid");
 
-	seq_printf(s, ",gid=%u", cifs_sb->mnt_gid);
+	seq_printf(s, ",gid=%u",
+		   from_kgid_munged(&init_user_ns, cifs_sb->mnt_gid));
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
 		seq_printf(s, ",forcegid");
 	else
@@ -436,9 +462,13 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
 		seq_printf(s, ",noperm");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID)
-		seq_printf(s, ",backupuid=%u", cifs_sb->mnt_backupuid);
+		seq_printf(s, ",backupuid=%u",
+			   from_kuid_munged(&init_user_ns,
+					    cifs_sb->mnt_backupuid));
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID)
-		seq_printf(s, ",backupgid=%u", cifs_sb->mnt_backupgid);
+		seq_printf(s, ",backupgid=%u",
+			   from_kgid_munged(&init_user_ns,
+					    cifs_sb->mnt_backupgid));
 
 	seq_printf(s, ",rsize=%u", cifs_sb->rsize);
 	seq_printf(s, ",wsize=%u", cifs_sb->wsize);
@@ -472,7 +502,7 @@ static void cifs_umount_begin(struct super_block *sb)
 	/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
 	/* cancel_notify_requests(tcon); */
 	if (tcon->ses && tcon->ses->server) {
-		cFYI(1, "wake up tasks now - umount begin not complete");
+		cifs_dbg(FYI, "wake up tasks now - umount begin not complete\n");
 		wake_up_all(&tcon->ses->server->request_q);
 		wake_up_all(&tcon->ses->server->response_q);
 		msleep(1); /* yield */
@@ -543,7 +573,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
 	if (full_path == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	cFYI(1, "Get root dentry for %s", full_path);
+	cifs_dbg(FYI, "Get root dentry for %s\n", full_path);
 
 	sep = CIFS_DIR_SEP(cifs_sb);
 	dentry = dget(sb->s_root);
@@ -558,6 +588,11 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
 			dentry = ERR_PTR(-ENOENT);
 			break;
 		}
+		if (!S_ISDIR(dir->i_mode)) {
+			dput(dentry);
+			dentry = ERR_PTR(-ENOTDIR);
+			break;
+		}
 
 		/* skip separators */
 		while (*s == sep)
@@ -597,7 +632,7 @@ cifs_do_mount(struct file_system_type *fs_type,
 	struct cifs_mnt_data mnt_data;
 	struct dentry *root;
 
-	cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
+	cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags);
 
 	volume_info = cifs_get_volume_info((char *)data, dev_name);
 	if (IS_ERR(volume_info))
@@ -620,7 +655,8 @@ cifs_do_mount(struct file_system_type *fs_type,
 	rc = cifs_mount(cifs_sb, volume_info);
 	if (rc) {
 		if (!(flags & MS_SILENT))
-			cERROR(1, "cifs_mount failed w/return code = %d", rc);
+			cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
+				 rc);
 		root = ERR_PTR(rc);
 		goto out_mountdata;
 	}
@@ -640,7 +676,7 @@ cifs_do_mount(struct file_system_type *fs_type,
 	}
 
 	if (sb->s_root) {
-		cFYI(1, "Use existing superblock");
+		cifs_dbg(FYI, "Use existing superblock\n");
 		cifs_umount(cifs_sb);
 	} else {
 		rc = cifs_read_super(sb);
@@ -656,7 +692,7 @@ cifs_do_mount(struct file_system_type *fs_type,
 	if (IS_ERR(root))
 		goto out_super;
 
-	cFYI(1, "dentry root is: %p", root);
+	cifs_dbg(FYI, "dentry root is: %p\n", root);
 	goto out;
 
 out_super:
@@ -677,7 +713,7 @@ out_nls:
 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 				   unsigned long nr_segs, loff_t pos)
 {
-	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t written;
 	int rc;
 
@@ -688,7 +724,8 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 	rc = filemap_fdatawrite(inode->i_mapping);
 	if (rc)
-		cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode);
+		cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n",
+			 rc, inode);
 
 	return written;
 }
@@ -701,7 +738,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
 	 */
 	if (whence != SEEK_SET && whence != SEEK_CUR) {
 		int rc;
-		struct inode *inode = file->f_path.dentry->d_inode;
+		struct inode *inode = file_inode(file);
 
 		/*
 		 * We need to be sure that all dirty pages are written and the
@@ -731,9 +768,9 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
 
 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 {
-	/* note that this is called by vfs setlease with lock_flocks held
+	/* note that this is called by vfs setlease with i_lock held
 	   to protect *lease from going away */
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct cifsFileInfo *cfile = file->private_data;
 
 	if (!(S_ISREG(inode->i_mode)))
@@ -766,6 +803,7 @@ struct file_system_type cifs_fs_type = {
 	.kill_sb = cifs_kill_sb,
 	/*  .fs_flags */
 };
+MODULE_ALIAS_FS("cifs");
 const struct inode_operations cifs_dir_inode_ops = {
 	.create = cifs_create,
 	.atomic_open = cifs_atomic_open,
@@ -933,7 +971,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 };
 
 const struct file_operations cifs_dir_ops = {
-	.readdir = cifs_readdir,
+	.iterate = cifs_readdir,
 	.release = cifs_closedir,
 	.read    = generic_read_dir,
 	.unlocked_ioctl  = cifs_ioctl,
@@ -994,7 +1032,10 @@ cifs_init_request_bufs(void)
 	} else {
 		CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
 	}
-/*	cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */
+/*
+	cifs_dbg(VFS, "CIFSMaxBufSize %d 0x%x\n",
+		 CIFSMaxBufSize, CIFSMaxBufSize);
+*/
 	cifs_req_cachep = kmem_cache_create("cifs_request",
 					    CIFSMaxBufSize + max_hdr_size, 0,
 					    SLAB_HWCACHE_ALIGN, NULL);
@@ -1005,7 +1046,7 @@ cifs_init_request_bufs(void)
 		cifs_min_rcv = 1;
 	else if (cifs_min_rcv > 64) {
 		cifs_min_rcv = 64;
-		cERROR(1, "cifs_min_rcv set to maximum (64)");
+		cifs_dbg(VFS, "cifs_min_rcv set to maximum (64)\n");
 	}
 
 	cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
@@ -1036,7 +1077,7 @@ cifs_init_request_bufs(void)
 		cifs_min_small = 2;
 	else if (cifs_min_small > 256) {
 		cifs_min_small = 256;
-		cFYI(1, "cifs_min_small set to maximum (256)");
+		cifs_dbg(FYI, "cifs_min_small set to maximum (256)\n");
 	}
 
 	cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
@@ -1127,10 +1168,11 @@ init_cifs(void)
 
 	if (cifs_max_pending < 2) {
 		cifs_max_pending = 2;
-		cFYI(1, "cifs_max_pending set to min of 2");
+		cifs_dbg(FYI, "cifs_max_pending set to min of 2\n");
 	} else if (cifs_max_pending > CIFS_MAX_REQ) {
 		cifs_max_pending = CIFS_MAX_REQ;
-		cFYI(1, "cifs_max_pending set to max of %u", CIFS_MAX_REQ);
+		cifs_dbg(FYI, "cifs_max_pending set to max of %u\n",
+			 CIFS_MAX_REQ);
 	}
 
 	cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
@@ -1199,7 +1241,7 @@ out_clean_proc:
 static void __exit
 exit_cifs(void)
 {
-	cFYI(DBG2, "exit_cifs");
+	cifs_dbg(NOISY, "exit_cifs\n");
 	unregister_filesystem(&cifs_fs_type);
 	cifs_dfs_release_automount_timer();
 #ifdef CONFIG_CIFS_ACL
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7163419cecd..ea723a5e822 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -41,6 +41,10 @@ extern struct file_system_type cifs_fs_type;
 extern const struct address_space_operations cifs_addr_ops;
 extern const struct address_space_operations cifs_addr_ops_smallbuf;
 
+/* Functions related to super block operations */
+extern void cifs_sb_active(struct super_block *sb);
+extern void cifs_sb_deactive(struct super_block *sb);
+
 /* Functions related to inodes */
 extern const struct inode_operations cifs_dir_inode_ops;
 extern struct inode *cifs_root_iget(struct super_block *);
@@ -97,7 +101,7 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
-extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
+extern int cifs_readdir(struct file *file, struct dir_context *ctx);
 
 /* Functions related to dir entries */
 extern const struct dentry_operations cifs_dentry_ops;
@@ -128,5 +132,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
 
-#define CIFS_VERSION   "2.0"
+#define CIFS_VERSION   "2.01"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index e6899cea1c3..e66b0888254 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -101,20 +101,14 @@ enum statusEnum {
 };
 
 enum securityEnum {
-	LANMAN = 0,			/* Legacy LANMAN auth */
+	Unspecified = 0,	/* not specified */
+	LANMAN,			/* Legacy LANMAN auth */
 	NTLM,			/* Legacy NTLM012 auth with NTLM hash */
 	NTLMv2,			/* Legacy NTLM auth with NTLMv2 hash */
 	RawNTLMSSP,		/* NTLMSSP without SPNEGO, NTLMv2 hash */
-/*	NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
 	Kerberos,		/* Kerberos via SPNEGO */
 };
 
-enum protocolEnum {
-	TCP = 0,
-	SCTP
-	/* Netbios frames protocol not supported at this time */
-};
-
 struct session_key {
 	unsigned int len;
 	char *response;
@@ -131,9 +125,11 @@ struct cifs_secmech {
 	struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
 	struct crypto_shash *md5; /* md5 hash function */
 	struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */
+	struct crypto_shash *cmacaes; /* block-cipher based MAC function */
 	struct sdesc *sdeschmacmd5;  /* ctxt to generate ntlmv2 hash, CR1 */
 	struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
 	struct sdesc *sdeschmacsha256;  /* ctxt to generate smb2 signature */
+	struct sdesc *sdesccmacaes;  /* ctxt to generate smb3 signature */
 };
 
 /* per smb session structure/fields */
@@ -181,6 +177,7 @@ enum smb_version {
 	Smb_20,
 	Smb_21,
 	Smb_30,
+	Smb_302,
 };
 
 struct mid_q_entry;
@@ -228,6 +225,7 @@ struct smb_version_operations {
 	void (*dump_detail)(void *);
 	void (*clear_stats)(struct cifs_tcon *);
 	void (*print_stats)(struct seq_file *m, struct cifs_tcon *);
+	void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *);
 	/* verify the message */
 	int (*check_message)(char *, unsigned int);
 	bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
@@ -367,6 +365,8 @@ struct smb_version_operations {
 	void (*set_lease_key)(struct inode *, struct cifs_fid *fid);
 	/* generate new lease key */
 	void (*new_lease_key)(struct cifs_fid *fid);
+	/* The next two functions will need to be changed to per smb session */
+	void (*generate_signingkey)(struct TCP_Server_Info *server);
 	int (*calc_signature)(struct smb_rqst *rqst,
 				   struct TCP_Server_Info *server);
 };
@@ -387,6 +387,8 @@ struct smb_version_values {
 	unsigned int	cap_nt_find;
 	unsigned int	cap_large_files;
 	unsigned int	oplock_read;
+	__u16		signing_enabled;
+	__u16		signing_required;
 };
 
 #define HEADER_SIZE(server) (server->vals->header_size)
@@ -400,14 +402,15 @@ struct smb_vol {
 	char *iocharset;  /* local code page for mapping to and from Unicode */
 	char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
 	char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
-	uid_t cred_uid;
-	uid_t linux_uid;
-	gid_t linux_gid;
-	uid_t backupuid;
-	gid_t backupgid;
+	kuid_t cred_uid;
+	kuid_t linux_uid;
+	kgid_t linux_gid;
+	kuid_t backupuid;
+	kgid_t backupgid;
 	umode_t file_mode;
 	umode_t dir_mode;
-	unsigned secFlg;
+	enum securityEnum sectype; /* sectype requested via mnt opts */
+	bool sign; /* was signing requested via mnt opts? */
 	bool retry:1;
 	bool intr:1;
 	bool setuids:1;
@@ -441,6 +444,7 @@ struct smb_vol {
 	bool mfsymlinks:1; /* use Minshall+French Symlinks */
 	bool multiuser:1;
 	bool rwpidforward:1; /* pid forward for read/write operations */
+	bool nosharesock;
 	unsigned int rsize;
 	unsigned int wsize;
 	bool sockopt_tcp_nodelay:1;
@@ -514,6 +518,7 @@ struct TCP_Server_Info {
 	struct task_struct *tsk;
 	char server_GUID[16];
 	__u16 sec_mode;
+	bool sign; /* is signing enabled on this connection? */
 	bool session_estab; /* mark when very first sess is established */
 #ifdef CONFIG_CIFS_SMB2
 	int echo_credits;  /* echo reserved slots */
@@ -521,7 +526,6 @@ struct TCP_Server_Info {
 	bool echoes:1; /* enable echoes */
 #endif
 	u16 dialect; /* dialect index that server chose */
-	enum securityEnum secType;
 	bool oplocks:1; /* enable oplocks */
 	unsigned int maxReq;	/* Clients should submit no more */
 	/* than maxReq distinct unanswered SMBs to the server when using  */
@@ -540,12 +544,17 @@ struct TCP_Server_Info {
 	int timeAdj;  /* Adjust for difference in server time zone in sec */
 	__u64 CurrentMid;         /* multiplex id - rotating counter */
 	char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
+	char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
 	/* 16th byte of RFC1001 workstation name is always null */
 	char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
 	__u32 sequence_number; /* for signing, protected by srv_mutex */
 	struct session_key session_key;
 	unsigned long lstrp; /* when we got last response from this server */
 	struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
+#define	CIFS_NEGFLAVOR_LANMAN	0	/* wct == 13, LANMAN */
+#define	CIFS_NEGFLAVOR_UNENCAP	1	/* wct == 17, but no ext_sec */
+#define	CIFS_NEGFLAVOR_EXTENDED	2	/* wct == 17, ext_sec bit set */
+	char	negflavor;	/* NEGOTIATE response flavor */
 	/* extended security flavors that server supports */
 	bool	sec_ntlmssp;		/* supports NTLMSSP */
 	bool	sec_kerberosu2u;	/* supports U2U Kerberos */
@@ -697,14 +706,13 @@ struct cifs_ses {
 	enum statusEnum status;
 	unsigned overrideSecFlg;  /* if non-zero override global sec flags */
 	__u16 ipc_tid;		/* special tid for connection to IPC share */
-	__u16 flags;
 	__u16 vcnum;
 	char *serverOS;		/* name of operating system underlying server */
 	char *serverNOS;	/* name of network operating system of server */
 	char *serverDomain;	/* security realm of server */
 	__u64 Suid;		/* remote smb uid  */
-	uid_t linux_uid;        /* overriding owner of files on the mount */
-	uid_t cred_uid;		/* owner of credentials */
+	kuid_t linux_uid;	/* overriding owner of files on the mount */
+	kuid_t cred_uid;	/* owner of credentials */
 	unsigned int capabilities;
 	char serverName[SERVER_NAME_LEN_WITH_NULL * 2];	/* BB make bigger for
 				TCP names - will ipv6 and sctp addresses fit? */
@@ -714,21 +722,14 @@ struct cifs_ses {
 	char *password;
 	struct session_key auth_key;
 	struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
+	enum securityEnum sectype; /* what security flavor was specified? */
+	bool sign;		/* is signing required? */
 	bool need_reconnect:1; /* connection reset, uid now invalid */
 #ifdef CONFIG_CIFS_SMB2
 	__u16 session_flags;
 #endif /* CONFIG_CIFS_SMB2 */
 };
 
-/* no more than one of the following three session flags may be set */
-#define CIFS_SES_NT4 1
-#define CIFS_SES_OS2 2
-#define CIFS_SES_W9X 4
-/* following flag is set for old servers such as OS2 (and Win95?)
-   which do not negotiate NTLM or POSIX dialects, but instead
-   negotiate one of the older LANMAN dialects */
-#define CIFS_SES_LANMAN 8
-
 static inline bool
 cap_unix(struct cifs_ses *ses)
 {
@@ -816,7 +817,7 @@ struct cifs_tcon {
 #ifdef CONFIG_CIFS_SMB2
 	bool print:1;		/* set if connection to printer share */
 	bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
-	__u32 capabilities;
+	__le32 capabilities;
 	__u32 share_flags;
 	__u32 maximal_access;
 	__u32 vol_serial_number;
@@ -838,7 +839,7 @@ struct cifs_tcon {
  */
 struct tcon_link {
 	struct rb_node		tl_rbnode;
-	uid_t			tl_uid;
+	kuid_t			tl_uid;
 	unsigned long		tl_flags;
 #define TCON_LINK_MASTER	0
 #define TCON_LINK_PENDING	1
@@ -931,7 +932,7 @@ struct cifsFileInfo {
 	struct list_head tlist;	/* pointer to next fid owned by tcon */
 	struct list_head flist;	/* next fid (file instance) for this inode */
 	struct cifs_fid_locks *llist;	/* brlocks held by this fid */
-	unsigned int uid;	/* allows finding which FileInfo structure */
+	kuid_t uid;		/* allows finding which FileInfo structure */
 	__u32 pid;		/* process id who opened file */
 	struct cifs_fid fid;	/* file id from remote */
 	/* BB add lock scope info here if needed */ ;
@@ -1245,8 +1246,8 @@ struct cifs_fattr {
 	u64		cf_eof;
 	u64		cf_bytes;
 	u64		cf_createtime;
-	uid_t		cf_uid;
-	gid_t		cf_gid;
+	kuid_t		cf_uid;
+	kgid_t		cf_gid;
 	umode_t		cf_mode;
 	dev_t		cf_rdev;
 	unsigned int	cf_nlink;
@@ -1348,7 +1349,7 @@ require use of the stronger protocol */
 #define   CIFSSEC_MUST_SEAL	0x40040 /* not supported yet */
 #define   CIFSSEC_MUST_NTLMSSP	0x80080 /* raw ntlmssp with ntlmv2 */
 
-#define   CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP)
+#define   CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP)
 #define   CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
 #define   CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
 /*
@@ -1494,4 +1495,7 @@ extern struct smb_version_values smb21_values;
 #define SMB30_VERSION_STRING	"3.0"
 extern struct smb_version_operations smb30_operations;
 extern struct smb_version_values smb30_values;
+#define SMB302_VERSION_STRING	"3.02"
+/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */
+extern struct smb_version_values smb302_values;
 #endif	/* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b9d59a948a2..11ca24a8e05 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -142,6 +142,11 @@
  */
 #define CIFS_SESS_KEY_SIZE (16)
 
+/*
+ * Size of the smb3 signing key
+ */
+#define SMB3_SIGN_KEY_SIZE (16)
+
 #define CIFS_CLIENT_CHALLENGE_SIZE (8)
 #define CIFS_SERVER_CHALLENGE_SIZE (8)
 #define CIFS_HMAC_MD5_HASH_SIZE (16)
@@ -277,7 +282,6 @@
 #define CIFS_NO_HANDLE        0xFFFF
 
 #define NO_CHANGE_64          0xFFFFFFFFFFFFFFFFULL
-#define NO_CHANGE_32          0xFFFFFFFFUL
 
 /* IPC$ in ASCII */
 #define CIFS_IPC_RESOURCE "\x49\x50\x43\x24"
@@ -532,7 +536,7 @@ typedef struct lanman_neg_rsp {
 #define READ_RAW_ENABLE 1
 #define WRITE_RAW_ENABLE 2
 #define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE)
-
+#define SMB1_CLIENT_GUID_SIZE (16)
 typedef struct negotiate_rsp {
 	struct smb_hdr hdr;	/* wct = 17 */
 	__le16 DialectIndex; /* 0xFFFF = no dialect acceptable */
@@ -554,7 +558,7 @@ typedef struct negotiate_rsp {
 		/* followed by 16 bytes of server GUID */
 		/* then security blob if cap_extended_security negotiated */
 		struct {
-			unsigned char GUID[16];
+			unsigned char GUID[SMB1_CLIENT_GUID_SIZE];
 			unsigned char SecurityBlob[1];
 		} __attribute__((packed)) extended_response;
 	} __attribute__((packed)) u;
@@ -1316,6 +1320,14 @@ typedef struct smb_com_ntransact_rsp {
 	/* parms and data follow */
 } __attribute__((packed)) NTRANSACT_RSP;
 
+/* See MS-SMB 2.2.7.2.1.1 */
+struct srv_copychunk {
+	__le64 SourceOffset;
+	__le64 DestinationOffset;
+	__le32 CopyLength;
+	__u32  Reserved;
+} __packed;
+
 typedef struct smb_com_transaction_ioctl_req {
 	struct smb_hdr hdr;	/* wct = 23 */
 	__u8 MaxSetupCount;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1988c1baa22..c8ff018fae6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -45,16 +45,17 @@ extern void _free_xid(unsigned int);
 #define get_xid()						\
 ({								\
 	unsigned int __xid = _get_xid();				\
-	cFYI(1, "CIFS VFS: in %s as Xid: %u with uid: %d",	\
-	     __func__, __xid, current_fsuid());			\
+	cifs_dbg(FYI, "CIFS VFS: in %s as Xid: %u with uid: %d\n",	\
+		 __func__, __xid,					\
+		 from_kuid(&init_user_ns, current_fsuid()));		\
 	__xid;							\
 })
 
 #define free_xid(curr_xid)					\
 do {								\
 	_free_xid(curr_xid);					\
-	cFYI(1, "CIFS VFS: leaving %s (xid = %u) rc = %d",	\
-	     __func__, curr_xid, (int)rc);			\
+	cifs_dbg(FYI, "CIFS VFS: leaving %s (xid = %u) rc = %d\n",	\
+		 __func__, curr_xid, (int)rc);				\
 } while (0)
 extern int init_cifs_idmap(void);
 extern void exit_cifs_idmap(void);
@@ -117,6 +118,8 @@ extern void header_assemble(struct smb_hdr *, char /* command */ ,
 extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
 				struct cifs_ses *ses,
 				void **request_buf);
+extern enum securityEnum select_sectype(struct TCP_Server_Info *server,
+				enum securityEnum requested);
 extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
 			  const struct nls_table *nls_cp);
 extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -161,7 +164,7 @@ extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
 			      struct cifs_fattr *fattr, struct inode *inode,
 			      const char *path, const __u16 *pfid);
 extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64,
-					uid_t, gid_t);
+					kuid_t, kgid_t);
 extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
 					const char *, u32 *);
 extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
@@ -211,6 +214,7 @@ extern int cifs_negotiate_protocol(const unsigned int xid,
 				   struct cifs_ses *ses);
 extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
 			      struct nls_table *nls_info);
+extern int cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required);
 extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses);
 
 extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
@@ -304,8 +308,8 @@ struct cifs_unix_set_info_args {
 	__u64	atime;
 	__u64	mtime;
 	__u64	mode;
-	__u64	uid;
-	__u64	gid;
+	kuid_t	uid;
+	kgid_t	gid;
 	dev_t	device;
 };
 
@@ -432,6 +436,7 @@ extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
 extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
 extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
 extern int calc_seckey(struct cifs_ses *);
+extern void generate_smb3signingkey(struct TCP_Server_Info *);
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern int calc_lanman_hash(const char *password, const char *cryptkey,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 76d0d299885..a89c4cb4e6c 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -139,8 +139,8 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
 		if (smb_command != SMB_COM_WRITE_ANDX &&
 		    smb_command != SMB_COM_OPEN_ANDX &&
 		    smb_command != SMB_COM_TREE_DISCONNECT) {
-			cFYI(1, "can not send cmd %d while umounting",
-				smb_command);
+			cifs_dbg(FYI, "can not send cmd %d while umounting\n",
+				 smb_command);
 			return -ENODEV;
 		}
 	}
@@ -163,7 +163,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
 		 * back on-line
 		 */
 		if (!tcon->retry) {
-			cFYI(1, "gave up waiting on reconnect in smb_init");
+			cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
 			return -EHOSTDOWN;
 		}
 	}
@@ -191,7 +191,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
 	cifs_mark_open_files_invalid(tcon);
 	rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
 	mutex_unlock(&ses->session_mutex);
-	cFYI(1, "reconnect tcon rc = %d", rc);
+	cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
 
 	if (rc)
 		goto out;
@@ -367,6 +367,185 @@ vt2_err:
 	return -EINVAL;
 }
 
+static int
+decode_ext_sec_blob(struct cifs_ses *ses, NEGOTIATE_RSP *pSMBr)
+{
+	int	rc = 0;
+	u16	count;
+	char	*guid = pSMBr->u.extended_response.GUID;
+	struct TCP_Server_Info *server = ses->server;
+
+	count = get_bcc(&pSMBr->hdr);
+	if (count < SMB1_CLIENT_GUID_SIZE)
+		return -EIO;
+
+	spin_lock(&cifs_tcp_ses_lock);
+	if (server->srv_count > 1) {
+		spin_unlock(&cifs_tcp_ses_lock);
+		if (memcmp(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE) != 0) {
+			cifs_dbg(FYI, "server UID changed\n");
+			memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
+		}
+	} else {
+		spin_unlock(&cifs_tcp_ses_lock);
+		memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
+	}
+
+	if (count == SMB1_CLIENT_GUID_SIZE) {
+		server->sec_ntlmssp = true;
+	} else {
+		count -= SMB1_CLIENT_GUID_SIZE;
+		rc = decode_negTokenInit(
+			pSMBr->u.extended_response.SecurityBlob, count, server);
+		if (rc != 1)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+int
+cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
+{
+	bool srv_sign_required = server->sec_mode & server->vals->signing_required;
+	bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled;
+	bool mnt_sign_enabled = global_secflags & CIFSSEC_MAY_SIGN;
+
+	/*
+	 * Is signing required by mnt options? If not then check
+	 * global_secflags to see if it is there.
+	 */
+	if (!mnt_sign_required)
+		mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) ==
+						CIFSSEC_MUST_SIGN);
+
+	/*
+	 * If signing is required then it's automatically enabled too,
+	 * otherwise, check to see if the secflags allow it.
+	 */
+	mnt_sign_enabled = mnt_sign_required ? mnt_sign_required :
+				(global_secflags & CIFSSEC_MAY_SIGN);
+
+	/* If server requires signing, does client allow it? */
+	if (srv_sign_required) {
+		if (!mnt_sign_enabled) {
+			cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!");
+			return -ENOTSUPP;
+		}
+		server->sign = true;
+	}
+
+	/* If client requires signing, does server allow it? */
+	if (mnt_sign_required) {
+		if (!srv_sign_enabled) {
+			cifs_dbg(VFS, "Server does not support signing!");
+			return -ENOTSUPP;
+		}
+		server->sign = true;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+static int
+decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
+{
+	__s16 tmp;
+	struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
+
+	if (server->dialect != LANMAN_PROT && server->dialect != LANMAN2_PROT)
+		return -EOPNOTSUPP;
+
+	server->sec_mode = le16_to_cpu(rsp->SecurityMode);
+	server->maxReq = min_t(unsigned int,
+			       le16_to_cpu(rsp->MaxMpxCount),
+			       cifs_max_pending);
+	set_credits(server, server->maxReq);
+	server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
+	server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
+	/* even though we do not use raw we might as well set this
+	accurately, in case we ever find a need for it */
+	if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
+		server->max_rw = 0xFF00;
+		server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
+	} else {
+		server->max_rw = 0;/* do not need to use raw anyway */
+		server->capabilities = CAP_MPX_MODE;
+	}
+	tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
+	if (tmp == -1) {
+		/* OS/2 often does not set timezone therefore
+		 * we must use server time to calc time zone.
+		 * Could deviate slightly from the right zone.
+		 * Smallest defined timezone difference is 15 minutes
+		 * (i.e. Nepal).  Rounding up/down is done to match
+		 * this requirement.
+		 */
+		int val, seconds, remain, result;
+		struct timespec ts, utc;
+		utc = CURRENT_TIME;
+		ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
+				    rsp->SrvTime.Time, 0);
+		cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
+			 (int)ts.tv_sec, (int)utc.tv_sec,
+			 (int)(utc.tv_sec - ts.tv_sec));
+		val = (int)(utc.tv_sec - ts.tv_sec);
+		seconds = abs(val);
+		result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
+		remain = seconds % MIN_TZ_ADJ;
+		if (remain >= (MIN_TZ_ADJ / 2))
+			result += MIN_TZ_ADJ;
+		if (val < 0)
+			result = -result;
+		server->timeAdj = result;
+	} else {
+		server->timeAdj = (int)tmp;
+		server->timeAdj *= 60; /* also in seconds */
+	}
+	cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj);
+
+
+	/* BB get server time for time conversions and add
+	code to use it and timezone since this is not UTC */
+
+	if (rsp->EncryptionKeyLength ==
+			cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
+		memcpy(server->cryptkey, rsp->EncryptionKey,
+			CIFS_CRYPTO_KEY_SIZE);
+	} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
+		return -EIO; /* need cryptkey unless plain text */
+	}
+
+	cifs_dbg(FYI, "LANMAN negotiated\n");
+	return 0;
+}
+#else
+static inline int
+decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
+{
+	cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n");
+	return -EOPNOTSUPP;
+}
+#endif
+
+static bool
+should_set_ext_sec_flag(enum securityEnum sectype)
+{
+	switch (sectype) {
+	case RawNTLMSSP:
+	case Kerberos:
+		return true;
+	case Unspecified:
+		if (global_secflags &
+		    (CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP))
+			return true;
+		/* Fallthrough */
+	default:
+		return false;
+	}
+}
+
 int
 CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 {
@@ -375,41 +554,24 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 	int rc = 0;
 	int bytes_returned;
 	int i;
-	struct TCP_Server_Info *server;
+	struct TCP_Server_Info *server = ses->server;
 	u16 count;
-	unsigned int secFlags;
 
-	if (ses->server)
-		server = ses->server;
-	else {
-		rc = -EIO;
-		return rc;
+	if (!server) {
+		WARN(1, "%s: server is NULL!\n", __func__);
+		return -EIO;
 	}
+
 	rc = smb_init(SMB_COM_NEGOTIATE, 0, NULL /* no tcon yet */ ,
 		      (void **) &pSMB, (void **) &pSMBr);
 	if (rc)
 		return rc;
 
-	/* if any of auth flags (ie not sign or seal) are overriden use them */
-	if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
-		secFlags = ses->overrideSecFlg;  /* BB FIXME fix sign flags? */
-	else /* if override flags set only sign/seal OR them with global auth */
-		secFlags = global_secflags | ses->overrideSecFlg;
-
-	cFYI(1, "secFlags 0x%x", secFlags);
-
 	pSMB->hdr.Mid = get_next_mid(server);
 	pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
 
-	if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
-		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
-		cFYI(1, "Kerberos only mechanism, enable extended security");
-		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	} else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
-		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
-		cFYI(1, "NTLMSSP only mechanism, enable extended security");
+	if (should_set_ext_sec_flag(ses->sectype)) {
+		cifs_dbg(FYI, "Requesting extended security.");
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	}
 
@@ -428,7 +590,7 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 		goto neg_err_exit;
 
 	server->dialect = le16_to_cpu(pSMBr->DialectIndex);
-	cFYI(1, "Dialect: %d", server->dialect);
+	cifs_dbg(FYI, "Dialect: %d\n", server->dialect);
 	/* Check wct = 1 error case */
 	if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) {
 		/* core returns wct = 1, but we do not ask for core - otherwise
@@ -436,129 +598,20 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 		could not negotiate a common dialect */
 		rc = -EOPNOTSUPP;
 		goto neg_err_exit;
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-	} else if ((pSMBr->hdr.WordCount == 13)
-			&& ((server->dialect == LANMAN_PROT)
-				|| (server->dialect == LANMAN2_PROT))) {
-		__s16 tmp;
-		struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
-
-		if ((secFlags & CIFSSEC_MAY_LANMAN) ||
-			(secFlags & CIFSSEC_MAY_PLNTXT))
-			server->secType = LANMAN;
-		else {
-			cERROR(1, "mount failed weak security disabled"
-				   " in /proc/fs/cifs/SecurityFlags");
-			rc = -EOPNOTSUPP;
-			goto neg_err_exit;
-		}
-		server->sec_mode = le16_to_cpu(rsp->SecurityMode);
-		server->maxReq = min_t(unsigned int,
-				       le16_to_cpu(rsp->MaxMpxCount),
-				       cifs_max_pending);
-		set_credits(server, server->maxReq);
-		server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
-		server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
-		/* even though we do not use raw we might as well set this
-		accurately, in case we ever find a need for it */
-		if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
-			server->max_rw = 0xFF00;
-			server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
-		} else {
-			server->max_rw = 0;/* do not need to use raw anyway */
-			server->capabilities = CAP_MPX_MODE;
-		}
-		tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
-		if (tmp == -1) {
-			/* OS/2 often does not set timezone therefore
-			 * we must use server time to calc time zone.
-			 * Could deviate slightly from the right zone.
-			 * Smallest defined timezone difference is 15 minutes
-			 * (i.e. Nepal).  Rounding up/down is done to match
-			 * this requirement.
-			 */
-			int val, seconds, remain, result;
-			struct timespec ts, utc;
-			utc = CURRENT_TIME;
-			ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
-					    rsp->SrvTime.Time, 0);
-			cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d",
-				(int)ts.tv_sec, (int)utc.tv_sec,
-				(int)(utc.tv_sec - ts.tv_sec));
-			val = (int)(utc.tv_sec - ts.tv_sec);
-			seconds = abs(val);
-			result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
-			remain = seconds % MIN_TZ_ADJ;
-			if (remain >= (MIN_TZ_ADJ / 2))
-				result += MIN_TZ_ADJ;
-			if (val < 0)
-				result = -result;
-			server->timeAdj = result;
-		} else {
-			server->timeAdj = (int)tmp;
-			server->timeAdj *= 60; /* also in seconds */
-		}
-		cFYI(1, "server->timeAdj: %d seconds", server->timeAdj);
-
-
-		/* BB get server time for time conversions and add
-		code to use it and timezone since this is not UTC */
-
-		if (rsp->EncryptionKeyLength ==
-				cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
-			memcpy(ses->server->cryptkey, rsp->EncryptionKey,
-				CIFS_CRYPTO_KEY_SIZE);
-		} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
-			rc = -EIO; /* need cryptkey unless plain text */
-			goto neg_err_exit;
-		}
-
-		cFYI(1, "LANMAN negotiated");
-		/* we will not end up setting signing flags - as no signing
-		was in LANMAN and server did not return the flags on */
-		goto signing_check;
-#else /* weak security disabled */
 	} else if (pSMBr->hdr.WordCount == 13) {
-		cERROR(1, "mount failed, cifs module not built "
-			  "with CIFS_WEAK_PW_HASH support");
-		rc = -EOPNOTSUPP;
-#endif /* WEAK_PW_HASH */
-		goto neg_err_exit;
+		server->negflavor = CIFS_NEGFLAVOR_LANMAN;
+		rc = decode_lanman_negprot_rsp(server, pSMBr);
+		goto signing_check;
 	} else if (pSMBr->hdr.WordCount != 17) {
 		/* unknown wct */
 		rc = -EOPNOTSUPP;
 		goto neg_err_exit;
 	}
-	/* else wct == 17 NTLM */
+	/* else wct == 17, NTLM or better */
+
 	server->sec_mode = pSMBr->SecurityMode;
 	if ((server->sec_mode & SECMODE_USER) == 0)
-		cFYI(1, "share mode security");
-
-	if ((server->sec_mode & SECMODE_PW_ENCRYPT) == 0)
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-		if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
-#endif /* CIFS_WEAK_PW_HASH */
-			cERROR(1, "Server requests plain text password"
-				  " but client support disabled");
-
-	if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
-		server->secType = NTLMv2;
-	else if (secFlags & CIFSSEC_MAY_NTLM)
-		server->secType = NTLM;
-	else if (secFlags & CIFSSEC_MAY_NTLMV2)
-		server->secType = NTLMv2;
-	else if (secFlags & CIFSSEC_MAY_KRB5)
-		server->secType = Kerberos;
-	else if (secFlags & CIFSSEC_MAY_NTLMSSP)
-		server->secType = RawNTLMSSP;
-	else if (secFlags & CIFSSEC_MAY_LANMAN)
-		server->secType = LANMAN;
-	else {
-		rc = -EOPNOTSUPP;
-		cERROR(1, "Invalid security type");
-		goto neg_err_exit;
-	}
-	/* else ... any others ...? */
+		cifs_dbg(FYI, "share mode security\n");
 
 	/* one byte, so no need to convert this or EncryptionKeyLen from
 	   little endian */
@@ -568,100 +621,34 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 	/* probably no need to store and check maxvcs */
 	server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize);
 	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
-	cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
+	cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf);
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
 	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
 	server->timeAdj *= 60;
+
 	if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
+		server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
 		memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
 		       CIFS_CRYPTO_KEY_SIZE);
 	} else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC ||
 			server->capabilities & CAP_EXTENDED_SECURITY) &&
 				(pSMBr->EncryptionKeyLength == 0)) {
-		/* decode security blob */
-		count = get_bcc(&pSMBr->hdr);
-		if (count < 16) {
-			rc = -EIO;
-			goto neg_err_exit;
-		}
-		spin_lock(&cifs_tcp_ses_lock);
-		if (server->srv_count > 1) {
-			spin_unlock(&cifs_tcp_ses_lock);
-			if (memcmp(server->server_GUID,
-				   pSMBr->u.extended_response.
-				   GUID, 16) != 0) {
-				cFYI(1, "server UID changed");
-				memcpy(server->server_GUID,
-					pSMBr->u.extended_response.GUID,
-					16);
-			}
-		} else {
-			spin_unlock(&cifs_tcp_ses_lock);
-			memcpy(server->server_GUID,
-			       pSMBr->u.extended_response.GUID, 16);
-		}
-
-		if (count == 16) {
-			server->secType = RawNTLMSSP;
-		} else {
-			rc = decode_negTokenInit(pSMBr->u.extended_response.
-						 SecurityBlob, count - 16,
-						 server);
-			if (rc == 1)
-				rc = 0;
-			else
-				rc = -EINVAL;
-			if (server->secType == Kerberos) {
-				if (!server->sec_kerberos &&
-						!server->sec_mskerberos)
-					rc = -EOPNOTSUPP;
-			} else if (server->secType == RawNTLMSSP) {
-				if (!server->sec_ntlmssp)
-					rc = -EOPNOTSUPP;
-			} else
-					rc = -EOPNOTSUPP;
-		}
+		server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
+		rc = decode_ext_sec_blob(ses, pSMBr);
 	} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
 		rc = -EIO; /* no crypt key only if plain text pwd */
-		goto neg_err_exit;
-	} else
-		server->capabilities &= ~CAP_EXTENDED_SECURITY;
-
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-signing_check:
-#endif
-	if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
-		/* MUST_SIGN already includes the MAY_SIGN FLAG
-		   so if this is zero it means that signing is disabled */
-		cFYI(1, "Signing disabled");
-		if (server->sec_mode & SECMODE_SIGN_REQUIRED) {
-			cERROR(1, "Server requires "
-				   "packet signing to be enabled in "
-				   "/proc/fs/cifs/SecurityFlags.");
-			rc = -EOPNOTSUPP;
-		}
-		server->sec_mode &=
-			~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
-	} else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
-		/* signing required */
-		cFYI(1, "Must sign - secFlags 0x%x", secFlags);
-		if ((server->sec_mode &
-			(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
-			cERROR(1, "signing required but server lacks support");
-			rc = -EOPNOTSUPP;
-		} else
-			server->sec_mode |= SECMODE_SIGN_REQUIRED;
 	} else {
-		/* signing optional ie CIFSSEC_MAY_SIGN */
-		if ((server->sec_mode & SECMODE_SIGN_REQUIRED) == 0)
-			server->sec_mode &=
-				~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
+		server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
+		server->capabilities &= ~CAP_EXTENDED_SECURITY;
 	}
 
+signing_check:
+	if (!rc)
+		rc = cifs_enable_signing(server, ses->sign);
 neg_err_exit:
 	cifs_buf_release(pSMB);
 
-	cFYI(1, "negprot rc %d", rc);
+	cifs_dbg(FYI, "negprot rc %d\n", rc);
 	return rc;
 }
 
@@ -671,7 +658,7 @@ CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon)
 	struct smb_hdr *smb_buffer;
 	int rc = 0;
 
-	cFYI(1, "In tree disconnect");
+	cifs_dbg(FYI, "In tree disconnect\n");
 
 	/* BB: do we need to check this? These should never be NULL. */
 	if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
@@ -693,7 +680,7 @@ CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon)
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *)smb_buffer, 0);
 	if (rc)
-		cFYI(1, "Tree disconnect failed %d", rc);
+		cifs_dbg(FYI, "Tree disconnect failed %d\n", rc);
 
 	/* No need to return error on this operation if tid invalidated and
 	   closed on server already e.g. due to tcp session crashing */
@@ -728,7 +715,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
 	struct smb_rqst rqst = { .rq_iov = &iov,
 				 .rq_nvec = 1 };
 
-	cFYI(1, "In echo request");
+	cifs_dbg(FYI, "In echo request\n");
 
 	rc = small_smb_init(SMB_COM_ECHO, 0, NULL, (void **)&smb);
 	if (rc)
@@ -747,7 +734,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
 	rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback,
 			     server, CIFS_ASYNC_OP | CIFS_ECHO_OP);
 	if (rc)
-		cFYI(1, "Echo request failed: %d", rc);
+		cifs_dbg(FYI, "Echo request failed: %d\n", rc);
 
 	cifs_small_buf_release(smb);
 
@@ -760,7 +747,7 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses)
 	LOGOFF_ANDX_REQ *pSMB;
 	int rc = 0;
 
-	cFYI(1, "In SMBLogoff for session disconnect");
+	cifs_dbg(FYI, "In SMBLogoff for session disconnect\n");
 
 	/*
 	 * BB: do we need to check validity of ses and server? They should
@@ -782,9 +769,8 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses)
 
 	pSMB->hdr.Mid = get_next_mid(ses->server);
 
-	if (ses->server->sec_mode &
-		   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-			pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+	if (ses->server->sign)
+		pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 
 	pSMB->hdr.Uid = ses->Suid;
 
@@ -814,7 +800,7 @@ CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, "In POSIX delete");
+	cifs_dbg(FYI, "In POSIX delete\n");
 PsxDelete:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -866,7 +852,7 @@ PsxDelete:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, "Posix delete returned %d", rc);
+		cifs_dbg(FYI, "Posix delete returned %d\n", rc);
 	cifs_buf_release(pSMB);
 
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_deletes);
@@ -914,7 +900,7 @@ DelFileRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_deletes);
 	if (rc)
-		cFYI(1, "Error in RMFile = %d", rc);
+		cifs_dbg(FYI, "Error in RMFile = %d\n", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -934,7 +920,7 @@ CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
 	int name_len;
 	int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
 
-	cFYI(1, "In CIFSSMBRmDir");
+	cifs_dbg(FYI, "In CIFSSMBRmDir\n");
 RmDirRetry:
 	rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -960,7 +946,7 @@ RmDirRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_rmdirs);
 	if (rc)
-		cFYI(1, "Error in RMDir = %d", rc);
+		cifs_dbg(FYI, "Error in RMDir = %d\n", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -979,7 +965,7 @@ CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
 	int name_len;
 	int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
 
-	cFYI(1, "In CIFSSMBMkDir");
+	cifs_dbg(FYI, "In CIFSSMBMkDir\n");
 MkDirRetry:
 	rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -1005,7 +991,7 @@ MkDirRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_mkdirs);
 	if (rc)
-		cFYI(1, "Error in Mkdir = %d", rc);
+		cifs_dbg(FYI, "Error in Mkdir = %d\n", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -1029,7 +1015,7 @@ CIFSPOSIXCreate(const unsigned int xid, struct cifs_tcon *tcon,
 	OPEN_PSX_REQ *pdata;
 	OPEN_PSX_RSP *psx_rsp;
 
-	cFYI(1, "In POSIX Create");
+	cifs_dbg(FYI, "In POSIX Create\n");
 PsxCreat:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -1083,11 +1069,11 @@ PsxCreat:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Posix create returned %d", rc);
+		cifs_dbg(FYI, "Posix create returned %d\n", rc);
 		goto psx_create_err;
 	}
 
-	cFYI(1, "copying inode info");
+	cifs_dbg(FYI, "copying inode info\n");
 	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 	if (rc || get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)) {
@@ -1109,11 +1095,11 @@ PsxCreat:
 	/* check to make sure response data is there */
 	if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) {
 		pRetData->Type = cpu_to_le32(-1); /* unknown */
-		cFYI(DBG2, "unknown type");
+		cifs_dbg(NOISY, "unknown type\n");
 	} else {
 		if (get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)
 					+ sizeof(FILE_UNIX_BASIC_INFO)) {
-			cERROR(1, "Open response data too small");
+			cifs_dbg(VFS, "Open response data too small\n");
 			pRetData->Type = cpu_to_le32(-1);
 			goto psx_create_err;
 		}
@@ -1160,7 +1146,7 @@ static __u16 convert_disposition(int disposition)
 			ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
 			break;
 		default:
-			cFYI(1, "unknown disposition %d", disposition);
+			cifs_dbg(FYI, "unknown disposition %d\n", disposition);
 			ofun =  SMBOPEN_OAPPEND; /* regular open */
 	}
 	return ofun;
@@ -1251,7 +1237,7 @@ OldOpenRetry:
 			(struct smb_hdr *)pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_opens);
 	if (rc) {
-		cFYI(1, "Error in Open = %d", rc);
+		cifs_dbg(FYI, "Error in Open = %d\n", rc);
 	} else {
 	/* BB verify if wct == 15 */
 
@@ -1364,7 +1350,7 @@ openRetry:
 			(struct smb_hdr *)pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_opens);
 	if (rc) {
-		cFYI(1, "Error in Open = %d", rc);
+		cifs_dbg(FYI, "Error in Open = %d\n", rc);
 	} else {
 		*pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */
 		*netfid = pSMBr->Fid;	/* cifs fid stays in le */
@@ -1425,8 +1411,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	char *buf = server->smallbuf;
 	unsigned int buflen = get_rfc1002_length(buf) + 4;
 
-	cFYI(1, "%s: mid=%llu offset=%llu bytes=%u", __func__,
-		mid->mid, rdata->offset, rdata->bytes);
+	cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n",
+		 __func__, mid->mid, rdata->offset, rdata->bytes);
 
 	/*
 	 * read the rest of READ_RSP header (sans Data array), or whatever we
@@ -1447,16 +1433,16 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	/* Was the SMB read successful? */
 	rdata->result = server->ops->map_error(buf, false);
 	if (rdata->result != 0) {
-		cFYI(1, "%s: server returned error %d", __func__,
-			rdata->result);
+		cifs_dbg(FYI, "%s: server returned error %d\n",
+			 __func__, rdata->result);
 		return cifs_readv_discard(server, mid);
 	}
 
 	/* Is there enough to get to the rest of the READ_RSP header? */
 	if (server->total_read < server->vals->read_rsp_size) {
-		cFYI(1, "%s: server returned short header. got=%u expected=%zu",
-			__func__, server->total_read,
-			server->vals->read_rsp_size);
+		cifs_dbg(FYI, "%s: server returned short header. got=%u expected=%zu\n",
+			 __func__, server->total_read,
+			 server->vals->read_rsp_size);
 		rdata->result = -EIO;
 		return cifs_readv_discard(server, mid);
 	}
@@ -1468,19 +1454,19 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 		 * is beyond the EOF. Treat it as if the data starts just after
 		 * the header.
 		 */
-		cFYI(1, "%s: data offset (%u) inside read response header",
-			__func__, data_offset);
+		cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n",
+			 __func__, data_offset);
 		data_offset = server->total_read;
 	} else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
 		/* data_offset is beyond the end of smallbuf */
-		cFYI(1, "%s: data offset (%u) beyond end of smallbuf",
-			__func__, data_offset);
+		cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n",
+			 __func__, data_offset);
 		rdata->result = -EIO;
 		return cifs_readv_discard(server, mid);
 	}
 
-	cFYI(1, "%s: total_read=%u data_offset=%u", __func__,
-		server->total_read, data_offset);
+	cifs_dbg(FYI, "%s: total_read=%u data_offset=%u\n",
+		 __func__, server->total_read, data_offset);
 
 	len = data_offset - server->total_read;
 	if (len > 0) {
@@ -1496,8 +1482,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	/* set up first iov for signature check */
 	rdata->iov.iov_base = buf;
 	rdata->iov.iov_len = server->total_read;
-	cFYI(1, "0: iov_base=%p iov_len=%zu",
-		rdata->iov.iov_base, rdata->iov.iov_len);
+	cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+		 rdata->iov.iov_base, rdata->iov.iov_len);
 
 	/* how much data is in the response? */
 	data_len = server->ops->read_data_length(buf);
@@ -1514,8 +1500,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	server->total_read += length;
 	rdata->bytes = length;
 
-	cFYI(1, "total_read=%u buflen=%u remaining=%u", server->total_read,
-		buflen, data_len);
+	cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
+		 server->total_read, buflen, data_len);
 
 	/* discard anything left over */
 	if (server->total_read < buflen)
@@ -1538,21 +1524,21 @@ cifs_readv_callback(struct mid_q_entry *mid)
 				 .rq_pagesz = rdata->pagesz,
 				 .rq_tailsz = rdata->tailsz };
 
-	cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__,
-		mid->mid, mid->mid_state, rdata->result, rdata->bytes);
+	cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
+		 __func__, mid->mid, mid->mid_state, rdata->result,
+		 rdata->bytes);
 
 	switch (mid->mid_state) {
 	case MID_RESPONSE_RECEIVED:
 		/* result already set, check signature */
-		if (server->sec_mode &
-		    (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+		if (server->sign) {
 			int rc = 0;
 
 			rc = cifs_verify_signature(&rqst, server,
-						  mid->sequence_number + 1);
+						  mid->sequence_number);
 			if (rc)
-				cERROR(1, "SMB signature verification returned "
-				       "error = %d", rc);
+				cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+					 rc);
 		}
 		/* FIXME: should this be counted toward the initiating task? */
 		task_io_account_read(rdata->bytes);
@@ -1582,8 +1568,8 @@ cifs_async_readv(struct cifs_readdata *rdata)
 	struct smb_rqst rqst = { .rq_iov = &rdata->iov,
 				 .rq_nvec = 1 };
 
-	cFYI(1, "%s: offset=%llu bytes=%u", __func__,
-		rdata->offset, rdata->bytes);
+	cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
+		 __func__, rdata->offset, rdata->bytes);
 
 	if (tcon->ses->capabilities & CAP_LARGE_FILES)
 		wct = 12;
@@ -1653,7 +1639,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
 	struct cifs_tcon *tcon = io_parms->tcon;
 	unsigned int count = io_parms->length;
 
-	cFYI(1, "Reading %d bytes on fid %d", count, netfid);
+	cifs_dbg(FYI, "Reading %d bytes on fid %d\n", count, netfid);
 	if (tcon->ses->capabilities & CAP_LARGE_FILES)
 		wct = 12;
 	else {
@@ -1701,7 +1687,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
 	pSMBr = (READ_RSP *)iov[0].iov_base;
 	if (rc) {
-		cERROR(1, "Send error in read = %d", rc);
+		cifs_dbg(VFS, "Send error in read = %d\n", rc);
 	} else {
 		int data_length = le16_to_cpu(pSMBr->DataLengthHigh);
 		data_length = data_length << 16;
@@ -1711,7 +1697,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
 		/*check that DataLength would not go beyond end of SMB */
 		if ((data_length > CIFSMaxBufSize)
 				|| (data_length > count)) {
-			cFYI(1, "bad length %d for count %d",
+			cifs_dbg(FYI, "bad length %d for count %d\n",
 				 data_length, count);
 			rc = -EIO;
 			*nbytes = 0;
@@ -1719,7 +1705,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
 			pReadData = (char *) (&pSMBr->hdr.Protocol) +
 					le16_to_cpu(pSMBr->DataOffset);
 /*			if (rc = copy_to_user(buf, pReadData, data_length)) {
-				cERROR(1, "Faulting on read rc = %d",rc);
+				cifs_dbg(VFS, "Faulting on read rc = %d\n",rc);
 				rc = -EFAULT;
 			}*/ /* can not use copy_to_user when using page cache*/
 			if (*buf)
@@ -1767,7 +1753,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
 
 	*nbytes = 0;
 
-	/* cFYI(1, "write at %lld %d bytes", offset, count);*/
+	/* cifs_dbg(FYI, "write at %lld %d bytes\n", offset, count);*/
 	if (tcon->ses == NULL)
 		return -ECONNABORTED;
 
@@ -1852,7 +1838,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, long_op);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
 	if (rc) {
-		cFYI(1, "Send error in write = %d", rc);
+		cifs_dbg(FYI, "Send error in write = %d\n", rc);
 	} else {
 		*nbytes = le16_to_cpu(pSMBr->CountHigh);
 		*nbytes = (*nbytes) << 16;
@@ -1909,9 +1895,12 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
 	} while (rc == -EAGAIN);
 
 	for (i = 0; i < wdata->nr_pages; i++) {
-		if (rc != 0)
-			SetPageError(wdata->pages[i]);
 		unlock_page(wdata->pages[i]);
+		if (rc != 0) {
+			SetPageError(wdata->pages[i]);
+			end_page_writeback(wdata->pages[i]);
+			page_cache_release(wdata->pages[i]);
+		}
 	}
 
 	mapping_set_error(inode->i_mapping, rc);
@@ -1956,7 +1945,7 @@ cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
 
 	/* this would overflow */
 	if (nr_pages == 0) {
-		cERROR(1, "%s: called with nr_pages == 0!", __func__);
+		cifs_dbg(VFS, "%s: called with nr_pages == 0!\n", __func__);
 		return NULL;
 	}
 
@@ -2072,7 +2061,8 @@ cifs_async_writev(struct cifs_writedata *wdata)
 	rqst.rq_pagesz = wdata->pagesz;
 	rqst.rq_tailsz = wdata->tailsz;
 
-	cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes);
+	cifs_dbg(FYI, "async write at %llu %u bytes\n",
+		 wdata->offset, wdata->bytes);
 
 	smb->DataLengthLow = cpu_to_le16(wdata->bytes & 0xFFFF);
 	smb->DataLengthHigh = cpu_to_le16(wdata->bytes >> 16);
@@ -2120,7 +2110,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
 
 	*nbytes = 0;
 
-	cFYI(1, "write2 at %lld %d bytes", (long long)offset, count);
+	cifs_dbg(FYI, "write2 at %lld %d bytes\n", (long long)offset, count);
 
 	if (tcon->ses->capabilities & CAP_LARGE_FILES) {
 		wct = 14;
@@ -2179,7 +2169,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
 	rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
 	if (rc) {
-		cFYI(1, "Send error Write2 = %d", rc);
+		cifs_dbg(FYI, "Send error Write2 = %d\n", rc);
 	} else if (resp_buf_type == 0) {
 		/* presumably this can not happen, but best to be safe */
 		rc = -EIO;
@@ -2220,7 +2210,8 @@ int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 	int resp_buf_type;
 	__u16 count;
 
-	cFYI(1, "cifs_lockv num lock %d num unlock %d", num_lock, num_unlock);
+	cifs_dbg(FYI, "cifs_lockv num lock %d num unlock %d\n",
+		 num_lock, num_unlock);
 
 	rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
 	if (rc)
@@ -2246,7 +2237,7 @@ int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
 	rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
 	if (rc)
-		cFYI(1, "Send error in cifs_lockv = %d", rc);
+		cifs_dbg(FYI, "Send error in cifs_lockv = %d\n", rc);
 
 	return rc;
 }
@@ -2265,7 +2256,8 @@ CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon,
 	int flags = 0;
 	__u16 count;
 
-	cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock);
+	cifs_dbg(FYI, "CIFSSMBLock timeout %d numLock %d\n",
+		 (int)waitFlag, numLock);
 	rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -2314,7 +2306,7 @@ CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon,
 	}
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
 	if (rc)
-		cFYI(1, "Send error in Lock = %d", rc);
+		cifs_dbg(FYI, "Send error in Lock = %d\n", rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 	since file handle passed in no longer valid */
@@ -2338,7 +2330,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
 	__u16 params, param_offset, offset, byte_count, count;
 	struct kvec iov[1];
 
-	cFYI(1, "Posix Lock");
+	cifs_dbg(FYI, "Posix Lock\n");
 
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
@@ -2405,7 +2397,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
 	}
 
 	if (rc) {
-		cFYI(1, "Send error in Posix Lock = %d", rc);
+		cifs_dbg(FYI, "Send error in Posix Lock = %d\n", rc);
 	} else if (pLockData) {
 		/* lock structure can be returned on get */
 		__u16 data_offset;
@@ -2462,7 +2454,7 @@ CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
 {
 	int rc = 0;
 	CLOSE_REQ *pSMB = NULL;
-	cFYI(1, "In CIFSSMBClose");
+	cifs_dbg(FYI, "In CIFSSMBClose\n");
 
 /* do not retry on dead session on close */
 	rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB);
@@ -2479,7 +2471,7 @@ CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
 	if (rc) {
 		if (rc != -EINTR) {
 			/* EINTR is expected when user ctl-c to kill app */
-			cERROR(1, "Send error in Close = %d", rc);
+			cifs_dbg(VFS, "Send error in Close = %d\n", rc);
 		}
 	}
 
@@ -2495,7 +2487,7 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
 {
 	int rc = 0;
 	FLUSH_REQ *pSMB = NULL;
-	cFYI(1, "In CIFSSMBFlush");
+	cifs_dbg(FYI, "In CIFSSMBFlush\n");
 
 	rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
 	if (rc)
@@ -2506,7 +2498,7 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_flushes);
 	if (rc)
-		cERROR(1, "Send error in Flush = %d", rc);
+		cifs_dbg(VFS, "Send error in Flush = %d\n", rc);
 
 	return rc;
 }
@@ -2524,7 +2516,7 @@ CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
 	__u16 count;
 	int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
 
-	cFYI(1, "In CIFSSMBRename");
+	cifs_dbg(FYI, "In CIFSSMBRename\n");
 renameRetry:
 	rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2571,7 +2563,7 @@ renameRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_renames);
 	if (rc)
-		cFYI(1, "Send error in rename = %d", rc);
+		cifs_dbg(FYI, "Send error in rename = %d\n", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -2595,7 +2587,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
 	int len_of_str;
 	__u16 params, param_offset, offset, count, byte_count;
 
-	cFYI(1, "Rename to File by handle");
+	cifs_dbg(FYI, "Rename to File by handle\n");
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB,
 			(void **) &pSMBr);
 	if (rc)
@@ -2652,7 +2644,8 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&pTcon->stats.cifs_stats.num_t2renames);
 	if (rc)
-		cFYI(1, "Send error in Rename (by file handle) = %d", rc);
+		cifs_dbg(FYI, "Send error in Rename (by file handle) = %d\n",
+			 rc);
 
 	cifs_buf_release(pSMB);
 
@@ -2674,7 +2667,7 @@ CIFSSMBCopy(const unsigned int xid, struct cifs_tcon *tcon,
 	int name_len, name_len2;
 	__u16 count;
 
-	cFYI(1, "In CIFSSMBCopy");
+	cifs_dbg(FYI, "In CIFSSMBCopy\n");
 copyRetry:
 	rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB,
 			(void **) &pSMBr);
@@ -2719,8 +2712,8 @@ copyRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Send error in copy = %d with %d files copied",
-			rc, le16_to_cpu(pSMBr->CopyCount));
+		cifs_dbg(FYI, "Send error in copy = %d with %d files copied\n",
+			 rc, le16_to_cpu(pSMBr->CopyCount));
 	}
 	cifs_buf_release(pSMB);
 
@@ -2744,7 +2737,7 @@ CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, "In Symlink Unix style");
+	cifs_dbg(FYI, "In Symlink Unix style\n");
 createSymLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2809,7 +2802,8 @@ createSymLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_symlinks);
 	if (rc)
-		cFYI(1, "Send error in SetPathInfo create symlink = %d", rc);
+		cifs_dbg(FYI, "Send error in SetPathInfo create symlink = %d\n",
+			 rc);
 
 	cifs_buf_release(pSMB);
 
@@ -2833,7 +2827,7 @@ CIFSUnixCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, "In Create Hard link Unix style");
+	cifs_dbg(FYI, "In Create Hard link Unix style\n");
 createHardLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2895,7 +2889,8 @@ createHardLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_hardlinks);
 	if (rc)
-		cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc);
+		cifs_dbg(FYI, "Send error in SetPathInfo (hard link) = %d\n",
+			 rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -2917,7 +2912,7 @@ CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
 	__u16 count;
 	int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
 
-	cFYI(1, "In CIFSCreateHardLink");
+	cifs_dbg(FYI, "In CIFSCreateHardLink\n");
 winCreateHardLinkRetry:
 
 	rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB,
@@ -2969,7 +2964,7 @@ winCreateHardLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_hardlinks);
 	if (rc)
-		cFYI(1, "Send error in hard link (NT rename) = %d", rc);
+		cifs_dbg(FYI, "Send error in hard link (NT rename) = %d\n", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -2992,7 +2987,7 @@ CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
 	__u16 params, byte_count;
 	char *data_start;
 
-	cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName);
+	cifs_dbg(FYI, "In QPathSymLinkInfo (Unix) for path %s\n", searchName);
 
 querySymLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -3039,7 +3034,7 @@ querySymLinkRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Send error in QuerySymLinkInfo = %d", rc);
+		cifs_dbg(FYI, "Send error in QuerySymLinkInfo = %d\n", rc);
 	} else {
 		/* decode response */
 
@@ -3094,7 +3089,8 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon,
 	struct smb_com_transaction_ioctl_req *pSMB;
 	struct smb_com_transaction_ioctl_rsp *pSMBr;
 
-	cFYI(1, "In Windows reparse style QueryLink for path %s", searchName);
+	cifs_dbg(FYI, "In Windows reparse style QueryLink for path %s\n",
+		 searchName);
 	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
 	if (rc)
@@ -3122,7 +3118,7 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon,
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc);
+		cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc);
 	} else {		/* decode response */
 		__u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
 		__u32 data_count = le32_to_cpu(pSMBr->DataCount);
@@ -3146,7 +3142,7 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon,
 			if ((reparse_buf->LinkNamesBuf +
 				reparse_buf->TargetNameOffset +
 				reparse_buf->TargetNameLen) > end_of_smb) {
-				cFYI(1, "reparse buf beyond SMB");
+				cifs_dbg(FYI, "reparse buf beyond SMB\n");
 				rc = -EIO;
 				goto qreparse_out;
 			}
@@ -3167,12 +3163,11 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon,
 			}
 		} else {
 			rc = -EIO;
-			cFYI(1, "Invalid return data count on "
-				 "get reparse info ioctl");
+			cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n");
 		}
 		symlinkinfo[buflen] = 0; /* just in case so the caller
 					does not go off the end of the buffer */
-		cFYI(1, "readlink result - %s", symlinkinfo);
+		cifs_dbg(FYI, "readlink result - %s\n", symlinkinfo);
 	}
 
 qreparse_out:
@@ -3195,7 +3190,10 @@ static void cifs_convert_ace(posix_acl_xattr_entry *ace,
 	ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
 	ace->e_tag  = cpu_to_le16(cifs_ace->cifs_e_tag);
 	ace->e_id   = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
-	/* cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */
+/*
+	cifs_dbg(FYI, "perm %d tag %d id %d\n",
+		 ace->e_perm, ace->e_tag, ace->e_id);
+*/
 
 	return;
 }
@@ -3221,8 +3219,8 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
 		size += sizeof(struct cifs_posix_ace) * count;
 		/* check if we would go beyond end of SMB */
 		if (size_of_data_area < size) {
-			cFYI(1, "bad CIFS POSIX ACL size %d vs. %d",
-				size_of_data_area, size);
+			cifs_dbg(FYI, "bad CIFS POSIX ACL size %d vs. %d\n",
+				 size_of_data_area, size);
 			return -EINVAL;
 		}
 	} else if (acl_type & ACL_TYPE_DEFAULT) {
@@ -3269,7 +3267,10 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
 		cifs_ace->cifs_uid = cpu_to_le64(-1);
 	} else
 		cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
-	/*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/
+/*
+	cifs_dbg(FYI, "perm %d tag %d id %d\n",
+		 ace->e_perm, ace->e_tag, ace->e_id);
+*/
 	return rc;
 }
 
@@ -3287,12 +3288,11 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
 		return 0;
 
 	count = posix_acl_xattr_count((size_t)buflen);
-	cFYI(1, "setting acl with %d entries from buf of length %d and "
-		"version of %d",
-		count, buflen, le32_to_cpu(local_acl->a_version));
+	cifs_dbg(FYI, "setting acl with %d entries from buf of length %d and version of %d\n",
+		 count, buflen, le32_to_cpu(local_acl->a_version));
 	if (le32_to_cpu(local_acl->a_version) != 2) {
-		cFYI(1, "unknown POSIX ACL version %d",
-		     le32_to_cpu(local_acl->a_version));
+		cifs_dbg(FYI, "unknown POSIX ACL version %d\n",
+			 le32_to_cpu(local_acl->a_version));
 		return 0;
 	}
 	cifs_acl->version = cpu_to_le16(1);
@@ -3301,7 +3301,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
 	else if (acl_type == ACL_TYPE_DEFAULT)
 		cifs_acl->default_entry_count = cpu_to_le16(count);
 	else {
-		cFYI(1, "unknown ACL type %d", acl_type);
+		cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
 		return 0;
 	}
 	for (i = 0; i < count; i++) {
@@ -3334,7 +3334,7 @@ CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, "In GetPosixACL (Unix) for path %s", searchName);
+	cifs_dbg(FYI, "In GetPosixACL (Unix) for path %s\n", searchName);
 
 queryAclRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -3387,7 +3387,7 @@ queryAclRetry:
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get);
 	if (rc) {
-		cFYI(1, "Send error in Query POSIX ACL = %d", rc);
+		cifs_dbg(FYI, "Send error in Query POSIX ACL = %d\n", rc);
 	} else {
 		/* decode response */
 
@@ -3424,7 +3424,7 @@ CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
 	int bytes_returned = 0;
 	__u16 params, byte_count, data_count, param_offset, offset;
 
-	cFYI(1, "In SetPosixACL (Unix) for path %s", fileName);
+	cifs_dbg(FYI, "In SetPosixACL (Unix) for path %s\n", fileName);
 setAclRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -3479,7 +3479,7 @@ setAclRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, "Set POSIX ACL returned %d", rc);
+		cifs_dbg(FYI, "Set POSIX ACL returned %d\n", rc);
 
 setACLerrorExit:
 	cifs_buf_release(pSMB);
@@ -3499,7 +3499,7 @@ CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
 	int bytes_returned;
 	__u16 params, byte_count;
 
-	cFYI(1, "In GetExtAttr");
+	cifs_dbg(FYI, "In GetExtAttr\n");
 	if (tcon == NULL)
 		return -ENODEV;
 
@@ -3538,7 +3538,7 @@ GetExtAttrRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "error %d in GetExtAttr", rc);
+		cifs_dbg(FYI, "error %d in GetExtAttr\n", rc);
 	} else {
 		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3553,7 +3553,7 @@ GetExtAttrRetry:
 			struct file_chattr_info *pfinfo;
 			/* BB Do we need a cast or hash here ? */
 			if (count != 16) {
-				cFYI(1, "Illegal size ret in GetExtAttr");
+				cifs_dbg(FYI, "Illegal size ret in GetExtAttr\n");
 				rc = -EIO;
 				goto GetExtAttrOut;
 			}
@@ -3641,21 +3641,21 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
 
 	/* should we also check that parm and data areas do not overlap? */
 	if (*ppparm > end_of_smb) {
-		cFYI(1, "parms start after end of smb");
+		cifs_dbg(FYI, "parms start after end of smb\n");
 		return -EINVAL;
 	} else if (parm_count + *ppparm > end_of_smb) {
-		cFYI(1, "parm end after end of smb");
+		cifs_dbg(FYI, "parm end after end of smb\n");
 		return -EINVAL;
 	} else if (*ppdata > end_of_smb) {
-		cFYI(1, "data starts after end of smb");
+		cifs_dbg(FYI, "data starts after end of smb\n");
 		return -EINVAL;
 	} else if (data_count + *ppdata > end_of_smb) {
-		cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
-			*ppdata, data_count, (data_count + *ppdata),
-			end_of_smb, pSMBr);
+		cifs_dbg(FYI, "data %p + count %d (%p) past smb end %p start %p\n",
+			 *ppdata, data_count, (data_count + *ppdata),
+			 end_of_smb, pSMBr);
 		return -EINVAL;
 	} else if (parm_count + data_count > bcc) {
-		cFYI(1, "parm count and data count larger than SMB");
+		cifs_dbg(FYI, "parm count and data count larger than SMB\n");
 		return -EINVAL;
 	}
 	*pdatalen = data_count;
@@ -3673,7 +3673,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
 	QUERY_SEC_DESC_REQ *pSMB;
 	struct kvec iov[1];
 
-	cFYI(1, "GetCifsACL");
+	cifs_dbg(FYI, "GetCifsACL\n");
 
 	*pbuflen = 0;
 	*acl_inf = NULL;
@@ -3698,7 +3698,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
 			 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get);
 	if (rc) {
-		cFYI(1, "Send error in QuerySecDesc = %d", rc);
+		cifs_dbg(FYI, "Send error in QuerySecDesc = %d\n", rc);
 	} else {                /* decode response */
 		__le32 *parm;
 		__u32 parm_len;
@@ -3713,7 +3713,8 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
 			goto qsec_out;
 		pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
 
-		cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf);
+		cifs_dbg(FYI, "smb %p parm %p data %p\n",
+			 pSMBr, parm, *acl_inf);
 
 		if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
 			rc = -EIO;      /* bad smb */
@@ -3725,8 +3726,8 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
 
 		acl_len = le32_to_cpu(*parm);
 		if (acl_len != *pbuflen) {
-			cERROR(1, "acl length %d does not match %d",
-				   acl_len, *pbuflen);
+			cifs_dbg(VFS, "acl length %d does not match %d\n",
+				 acl_len, *pbuflen);
 			if (*pbuflen > acl_len)
 				*pbuflen = acl_len;
 		}
@@ -3735,16 +3736,15 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
 		   header followed by the smallest SID */
 		if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
 		    (*pbuflen >= 64 * 1024)) {
-			cERROR(1, "bad acl length %d", *pbuflen);
+			cifs_dbg(VFS, "bad acl length %d\n", *pbuflen);
 			rc = -EINVAL;
 			*pbuflen = 0;
 		} else {
-			*acl_inf = kmalloc(*pbuflen, GFP_KERNEL);
+			*acl_inf = kmemdup(pdata, *pbuflen, GFP_KERNEL);
 			if (*acl_inf == NULL) {
 				*pbuflen = 0;
 				rc = -ENOMEM;
 			}
-			memcpy(*acl_inf, pdata, *pbuflen);
 		}
 	}
 qsec_out:
@@ -3806,9 +3806,10 @@ setCifsAclRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 
-	cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc);
+	cifs_dbg(FYI, "SetCIFSACL bytes_returned: %d, rc: %d\n",
+		 bytes_returned, rc);
 	if (rc)
-		cFYI(1, "Set CIFS ACL returned %d", rc);
+		cifs_dbg(FYI, "Set CIFS ACL returned %d\n", rc);
 	cifs_buf_release(pSMB);
 
 	if (rc == -EAGAIN)
@@ -3832,7 +3833,7 @@ SMBQueryInformation(const unsigned int xid, struct cifs_tcon *tcon,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, "In SMBQPath path %s", search_name);
+	cifs_dbg(FYI, "In SMBQPath path %s\n", search_name);
 QInfRetry:
 	rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -3859,7 +3860,7 @@ QInfRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Send error in QueryInfo = %d", rc);
+		cifs_dbg(FYI, "Send error in QueryInfo = %d\n", rc);
 	} else if (data) {
 		struct timespec ts;
 		__u32 time = le32_to_cpu(pSMBr->last_write_time);
@@ -3929,11 +3930,12 @@ QFileInfoRetry:
 	pSMB->Pad = 0;
 	pSMB->Fid = netfid;
 	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->t2.ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Send error in QPathInfo = %d", rc);
+		cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -3970,7 +3972,7 @@ CIFSSMBQPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-	/* cFYI(1, "In QPathInfo path %s", search_name); */
+	/* cifs_dbg(FYI, "In QPathInfo path %s\n", search_name); */
 QPathInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4020,7 +4022,7 @@ QPathInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Send error in QPathInfo = %d", rc);
+		cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4097,18 +4099,17 @@ UnixQFileInfoRetry:
 	pSMB->Pad = 0;
 	pSMB->Fid = netfid;
 	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->t2.ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Send error in QPathInfo = %d", rc);
+		cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 		if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) {
-			cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response. "
-				   "Unix Extensions can be disabled on mount "
-				   "by specifying the nosfu mount option.");
+			cifs_dbg(VFS, "Malformed FILE_UNIX_BASIC_INFO response. Unix Extensions can be disabled on mount by specifying the nosfu mount option.\n");
 			rc = -EIO;	/* bad smb */
 		} else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4140,7 +4141,7 @@ CIFSSMBUnixQPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, "In QPathInfo (Unix) the path %s", searchName);
+	cifs_dbg(FYI, "In QPathInfo (Unix) the path %s\n", searchName);
 UnixQPathInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4187,14 +4188,12 @@ UnixQPathInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Send error in QPathInfo = %d", rc);
+		cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 		if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) {
-			cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response. "
-				   "Unix Extensions can be disabled on mount "
-				   "by specifying the nosfu mount option.");
+			cifs_dbg(VFS, "Malformed FILE_UNIX_BASIC_INFO response. Unix Extensions can be disabled on mount by specifying the nosfu mount option.\n");
 			rc = -EIO;	/* bad smb */
 		} else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4228,7 +4227,7 @@ CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon,
 	__u16 params, byte_count;
 	struct nls_table *nls_codepage;
 
-	cFYI(1, "In FindFirst for %s", searchName);
+	cifs_dbg(FYI, "In FindFirst for %s\n", searchName);
 
 findFirstRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -4311,7 +4310,7 @@ findFirstRetry:
 	if (rc) {/* BB add logic to retry regular search if Unix search
 			rejected unexpectedly by server */
 		/* BB Add code to handle unsupported level rc */
-		cFYI(1, "Error in FindFirst = %d", rc);
+		cifs_dbg(FYI, "Error in FindFirst = %d\n", rc);
 
 		cifs_buf_release(pSMB);
 
@@ -4349,7 +4348,7 @@ findFirstRetry:
 				psrch_inf->entries_in_buffer;
 			lnoff = le16_to_cpu(parms->LastNameOffset);
 			if (CIFSMaxBufSize < lnoff) {
-				cERROR(1, "ignoring corrupt resume name");
+				cifs_dbg(VFS, "ignoring corrupt resume name\n");
 				psrch_inf->last_entry = NULL;
 				return rc;
 			}
@@ -4380,7 +4379,7 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon,
 	unsigned int name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, "In FindNext");
+	cifs_dbg(FYI, "In FindNext\n");
 
 	if (psrch_inf->endOfSearch)
 		return -ENOENT;
@@ -4441,7 +4440,7 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon,
 			cifs_buf_release(pSMB);
 			rc = 0; /* search probably was closed at end of search*/
 		} else
-			cFYI(1, "FindNext returned = %d", rc);
+			cifs_dbg(FYI, "FindNext returned = %d\n", rc);
 	} else {                /* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4476,15 +4475,15 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon,
 				psrch_inf->entries_in_buffer;
 			lnoff = le16_to_cpu(parms->LastNameOffset);
 			if (CIFSMaxBufSize < lnoff) {
-				cERROR(1, "ignoring corrupt resume name");
+				cifs_dbg(VFS, "ignoring corrupt resume name\n");
 				psrch_inf->last_entry = NULL;
 				return rc;
 			} else
 				psrch_inf->last_entry =
 					psrch_inf->srch_entries_start + lnoff;
 
-/*  cFYI(1, "fnxt2 entries in buf %d index_of_last %d",
-	    psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
+/*  cifs_dbg(FYI, "fnxt2 entries in buf %d index_of_last %d\n",
+    psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
 
 			/* BB fixme add unlock here */
 		}
@@ -4509,7 +4508,7 @@ CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc = 0;
 	FINDCLOSE_REQ *pSMB = NULL;
 
-	cFYI(1, "In CIFSSMBFindClose");
+	cifs_dbg(FYI, "In CIFSSMBFindClose\n");
 	rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB);
 
 	/* no sense returning error if session restarted
@@ -4523,7 +4522,7 @@ CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon,
 	pSMB->ByteCount = 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
 	if (rc)
-		cERROR(1, "Send error in FindClose = %d", rc);
+		cifs_dbg(VFS, "Send error in FindClose = %d\n", rc);
 
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_fclose);
 
@@ -4545,7 +4544,7 @@ CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon,
 	int name_len, bytes_returned;
 	__u16 params, byte_count;
 
-	cFYI(1, "In GetSrvInodeNum for %s", search_name);
+	cifs_dbg(FYI, "In GetSrvInodeNum for %s\n", search_name);
 	if (tcon == NULL)
 		return -ENODEV;
 
@@ -4596,7 +4595,7 @@ GetInodeNumberRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "error %d in QueryInternalInfo", rc);
+		cifs_dbg(FYI, "error %d in QueryInternalInfo\n", rc);
 	} else {
 		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4611,7 +4610,7 @@ GetInodeNumberRetry:
 			struct file_internal_info *pfinfo;
 			/* BB Do we need a cast or hash here ? */
 			if (count < 8) {
-				cFYI(1, "Illegal size ret in QryIntrnlInf");
+				cifs_dbg(FYI, "Illegal size ret in QryIntrnlInf\n");
 				rc = -EIO;
 				goto GetInodeNumOut;
 			}
@@ -4652,16 +4651,16 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 	*num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
 
 	if (*num_of_nodes < 1) {
-		cERROR(1, "num_referrals: must be at least > 0,"
-			"but we get num_referrals = %d", *num_of_nodes);
+		cifs_dbg(VFS, "num_referrals: must be at least > 0, but we get num_referrals = %d\n",
+			 *num_of_nodes);
 		rc = -EINVAL;
 		goto parse_DFS_referrals_exit;
 	}
 
 	ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals);
 	if (ref->VersionNumber != cpu_to_le16(3)) {
-		cERROR(1, "Referrals of V%d version are not supported,"
-			"should be V3", le16_to_cpu(ref->VersionNumber));
+		cifs_dbg(VFS, "Referrals of V%d version are not supported, should be V3\n",
+			 le16_to_cpu(ref->VersionNumber));
 		rc = -EINVAL;
 		goto parse_DFS_referrals_exit;
 	}
@@ -4670,14 +4669,12 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 	data_end = (char *)(&(pSMBr->PathConsumed)) +
 				le16_to_cpu(pSMBr->t2.DataCount);
 
-	cFYI(1, "num_referrals: %d dfs flags: 0x%x ...",
-			*num_of_nodes,
-			le32_to_cpu(pSMBr->DFSFlags));
+	cifs_dbg(FYI, "num_referrals: %d dfs flags: 0x%x ...\n",
+		 *num_of_nodes, le32_to_cpu(pSMBr->DFSFlags));
 
-	*target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
-			*num_of_nodes, GFP_KERNEL);
+	*target_nodes = kcalloc(*num_of_nodes, sizeof(struct dfs_info3_param),
+				GFP_KERNEL);
 	if (*target_nodes == NULL) {
-		cERROR(1, "Failed to allocate buffer for target_nodes");
 		rc = -ENOMEM;
 		goto parse_DFS_referrals_exit;
 	}
@@ -4756,7 +4753,7 @@ CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses,
 	*num_of_nodes = 0;
 	*target_nodes = NULL;
 
-	cFYI(1, "In GetDFSRefer the path %s", search_name);
+	cifs_dbg(FYI, "In GetDFSRefer the path %s\n", search_name);
 	if (ses == NULL)
 		return -ENODEV;
 getDFSRetry:
@@ -4789,11 +4786,8 @@ getDFSRetry:
 		strncpy(pSMB->RequestFileName, search_name, name_len);
 	}
 
-	if (ses->server) {
-		if (ses->server->sec_mode &
-		   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-			pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-	}
+	if (ses->server && ses->server->sign)
+		pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 
 	pSMB->hdr.Uid = ses->Suid;
 
@@ -4824,7 +4818,7 @@ getDFSRetry:
 	rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Send error in GetDFSRefer = %d", rc);
+		cifs_dbg(FYI, "Send error in GetDFSRefer = %d\n", rc);
 		goto GetDFSRefExit;
 	}
 	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4835,9 +4829,8 @@ getDFSRetry:
 		goto GetDFSRefExit;
 	}
 
-	cFYI(1, "Decoding GetDFSRefer response BCC: %d  Offset %d",
-				get_bcc(&pSMBr->hdr),
-				le16_to_cpu(pSMBr->t2.DataOffset));
+	cifs_dbg(FYI, "Decoding GetDFSRefer response BCC: %d  Offset %d\n",
+		 get_bcc(&pSMBr->hdr), le16_to_cpu(pSMBr->t2.DataOffset));
 
 	/* parse returned result into more usable form */
 	rc = parse_DFS_referrals(pSMBr, num_of_nodes,
@@ -4866,7 +4859,7 @@ SMBOldQFSInfo(const unsigned int xid, struct cifs_tcon *tcon,
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, "OldQFSInfo");
+	cifs_dbg(FYI, "OldQFSInfo\n");
 oldQFSInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		(void **) &pSMBr);
@@ -4899,7 +4892,7 @@ oldQFSInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Send error in QFSInfo = %d", rc);
+		cifs_dbg(FYI, "Send error in QFSInfo = %d\n", rc);
 	} else {                /* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4907,7 +4900,7 @@ oldQFSInfoRetry:
 			rc = -EIO;      /* bad smb */
 		else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
-			cFYI(1, "qfsinf resp BCC: %d  Offset %d",
+			cifs_dbg(FYI, "qfsinf resp BCC: %d  Offset %d\n",
 				 get_bcc(&pSMBr->hdr), data_offset);
 
 			response_data = (FILE_SYSTEM_ALLOC_INFO *)
@@ -4920,10 +4913,10 @@ oldQFSInfoRetry:
 			       le32_to_cpu(response_data->TotalAllocationUnits);
 			FSData->f_bfree = FSData->f_bavail =
 				le32_to_cpu(response_data->FreeAllocationUnits);
-			cFYI(1, "Blocks: %lld  Free: %lld Block size %ld",
-			     (unsigned long long)FSData->f_blocks,
-			     (unsigned long long)FSData->f_bfree,
-			     FSData->f_bsize);
+			cifs_dbg(FYI, "Blocks: %lld  Free: %lld Block size %ld\n",
+				 (unsigned long long)FSData->f_blocks,
+				 (unsigned long long)FSData->f_bfree,
+				 FSData->f_bsize);
 		}
 	}
 	cifs_buf_release(pSMB);
@@ -4946,7 +4939,7 @@ CIFSSMBQFSInfo(const unsigned int xid, struct cifs_tcon *tcon,
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, "In QFSInfo");
+	cifs_dbg(FYI, "In QFSInfo\n");
 QFSInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4979,7 +4972,7 @@ QFSInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Send error in QFSInfo = %d", rc);
+		cifs_dbg(FYI, "Send error in QFSInfo = %d\n", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -5000,10 +4993,10 @@ QFSInfoRetry:
 			    le64_to_cpu(response_data->TotalAllocationUnits);
 			FSData->f_bfree = FSData->f_bavail =
 			    le64_to_cpu(response_data->FreeAllocationUnits);
-			cFYI(1, "Blocks: %lld  Free: %lld Block size %ld",
-			     (unsigned long long)FSData->f_blocks,
-			     (unsigned long long)FSData->f_bfree,
-			     FSData->f_bsize);
+			cifs_dbg(FYI, "Blocks: %lld  Free: %lld Block size %ld\n",
+				 (unsigned long long)FSData->f_blocks,
+				 (unsigned long long)FSData->f_bfree,
+				 FSData->f_bsize);
 		}
 	}
 	cifs_buf_release(pSMB);
@@ -5025,7 +5018,7 @@ CIFSSMBQFSAttributeInfo(const unsigned int xid, struct cifs_tcon *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, "In QFSAttributeInfo");
+	cifs_dbg(FYI, "In QFSAttributeInfo\n");
 QFSAttributeRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5059,7 +5052,7 @@ QFSAttributeRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, "Send error in QFSAttributeInfo = %d", rc);
+		cifs_dbg(VFS, "Send error in QFSAttributeInfo = %d\n", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -5095,7 +5088,7 @@ CIFSSMBQFSDeviceInfo(const unsigned int xid, struct cifs_tcon *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, "In QFSDeviceInfo");
+	cifs_dbg(FYI, "In QFSDeviceInfo\n");
 QFSDeviceRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5130,7 +5123,7 @@ QFSDeviceRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Send error in QFSDeviceInfo = %d", rc);
+		cifs_dbg(FYI, "Send error in QFSDeviceInfo = %d\n", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -5166,7 +5159,7 @@ CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, "In QFSUnixInfo");
+	cifs_dbg(FYI, "In QFSUnixInfo\n");
 QFSUnixRetry:
 	rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
 				   (void **) &pSMB, (void **) &pSMBr);
@@ -5200,7 +5193,7 @@ QFSUnixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, "Send error in QFSUnixInfo = %d", rc);
+		cifs_dbg(VFS, "Send error in QFSUnixInfo = %d\n", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -5235,7 +5228,7 @@ CIFSSMBSetFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon, __u64 cap)
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, "In SETFSUnixInfo");
+	cifs_dbg(FYI, "In SETFSUnixInfo\n");
 SETFSUnixRetry:
 	/* BB switch to small buf init to save memory */
 	rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
@@ -5283,7 +5276,7 @@ SETFSUnixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, "Send error in SETFSUnixInfo = %d", rc);
+		cifs_dbg(VFS, "Send error in SETFSUnixInfo = %d\n", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 		if (rc)
@@ -5311,7 +5304,7 @@ CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon,
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, "In QFSPosixInfo");
+	cifs_dbg(FYI, "In QFSPosixInfo\n");
 QFSPosixRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5345,7 +5338,7 @@ QFSPosixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Send error in QFSUnixInfo = %d", rc);
+		cifs_dbg(FYI, "Send error in QFSUnixInfo = %d\n", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -5407,7 +5400,7 @@ CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
 
 	__u16 params, byte_count, data_count, param_offset, offset;
 
-	cFYI(1, "In SetEOF");
+	cifs_dbg(FYI, "In SetEOF\n");
 SetEOFRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5473,7 +5466,7 @@ SetEOFRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, "SetPathInfo (file size) returned %d", rc);
+		cifs_dbg(FYI, "SetPathInfo (file size) returned %d\n", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -5492,8 +5485,8 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, "SetFileSize (via SetFileInfo) %lld",
-			(long long)size);
+	cifs_dbg(FYI, "SetFileSize (via SetFileInfo) %lld\n",
+		 (long long)size);
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -5550,7 +5543,8 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
 	if (rc) {
-		cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc);
+		cifs_dbg(FYI, "Send error in SetFileInfo (SetFileSize) = %d\n",
+			 rc);
 	}
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
@@ -5574,7 +5568,7 @@ CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, "Set Times (via SetFileInfo)");
+	cifs_dbg(FYI, "Set Times (via SetFileInfo)\n");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -5620,7 +5614,8 @@ CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
 	memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
 	if (rc)
-		cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
+		cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n",
+			 rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 		since file handle passed in no longer valid */
@@ -5637,7 +5632,7 @@ CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, "Set File Disposition (via SetFileInfo)");
+	cifs_dbg(FYI, "Set File Disposition (via SetFileInfo)\n");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -5679,7 +5674,7 @@ CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon,
 	*data_offset = delete_file ? 1 : 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
 	if (rc)
-		cFYI(1, "Send error in SetFileDisposition = %d", rc);
+		cifs_dbg(FYI, "Send error in SetFileDisposition = %d\n", rc);
 
 	return rc;
 }
@@ -5697,7 +5692,7 @@ CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
 	char *data_offset;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, "In SetTimes");
+	cifs_dbg(FYI, "In SetTimes\n");
 
 SetTimesRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -5753,7 +5748,7 @@ SetTimesRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, "SetPathInfo (times) returned %d", rc);
+		cifs_dbg(FYI, "SetPathInfo (times) returned %d\n", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -5778,7 +5773,7 @@ CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, char *fileName,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, "In SetAttrLegacy");
+	cifs_dbg(FYI, "In SetAttrLegacy\n");
 
 SetAttrLgcyRetry:
 	rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB,
@@ -5804,7 +5799,7 @@ SetAttrLgcyRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, "Error in LegacySetAttr = %d", rc);
+		cifs_dbg(FYI, "Error in LegacySetAttr = %d\n", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -5819,8 +5814,14 @@ static void
 cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
 			const struct cifs_unix_set_info_args *args)
 {
+	u64 uid = NO_CHANGE_64, gid = NO_CHANGE_64;
 	u64 mode = args->mode;
 
+	if (uid_valid(args->uid))
+		uid = from_kuid(&init_user_ns, args->uid);
+	if (gid_valid(args->gid))
+		gid = from_kgid(&init_user_ns, args->gid);
+
 	/*
 	 * Samba server ignores set of file size to zero due to bugs in some
 	 * older clients, but we should be precise - we use SetFileSize to
@@ -5833,8 +5834,8 @@ cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
 	data_offset->LastStatusChange = cpu_to_le64(args->ctime);
 	data_offset->LastAccessTime = cpu_to_le64(args->atime);
 	data_offset->LastModificationTime = cpu_to_le64(args->mtime);
-	data_offset->Uid = cpu_to_le64(args->uid);
-	data_offset->Gid = cpu_to_le64(args->gid);
+	data_offset->Uid = cpu_to_le64(uid);
+	data_offset->Gid = cpu_to_le64(gid);
 	/* better to leave device as zero when it is  */
 	data_offset->DevMajor = cpu_to_le64(MAJOR(args->device));
 	data_offset->DevMinor = cpu_to_le64(MINOR(args->device));
@@ -5866,7 +5867,7 @@ CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc = 0;
 	u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, "Set Unix Info (via SetFileInfo)");
+	cifs_dbg(FYI, "Set Unix Info (via SetFileInfo)\n");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -5912,7 +5913,8 @@ CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
 	if (rc)
-		cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
+		cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n",
+			 rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 		since file handle passed in no longer valid */
@@ -5934,7 +5936,7 @@ CIFSSMBUnixSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
 	FILE_UNIX_BASIC_INFO *data_offset;
 	__u16 params, param_offset, offset, count, byte_count;
 
-	cFYI(1, "In SetUID/GID/Mode");
+	cifs_dbg(FYI, "In SetUID/GID/Mode\n");
 setPermsRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5990,7 +5992,7 @@ setPermsRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, "SetPathInfo (perms) returned %d", rc);
+		cifs_dbg(FYI, "SetPathInfo (perms) returned %d\n", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -6027,7 +6029,7 @@ CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon,
 	__u16 params, byte_count, data_offset;
 	unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0;
 
-	cFYI(1, "In Query All EAs path %s", searchName);
+	cifs_dbg(FYI, "In Query All EAs path %s\n", searchName);
 QAllEAsRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -6074,7 +6076,7 @@ QAllEAsRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, "Send error in QueryAllEAs = %d", rc);
+		cifs_dbg(FYI, "Send error in QueryAllEAs = %d\n", rc);
 		goto QAllEAsOut;
 	}
 
@@ -6102,16 +6104,16 @@ QAllEAsRetry:
 				(((char *) &pSMBr->hdr.Protocol) + data_offset);
 
 	list_len = le32_to_cpu(ea_response_data->list_len);
-	cFYI(1, "ea length %d", list_len);
+	cifs_dbg(FYI, "ea length %d\n", list_len);
 	if (list_len <= 8) {
-		cFYI(1, "empty EA list returned from server");
+		cifs_dbg(FYI, "empty EA list returned from server\n");
 		goto QAllEAsOut;
 	}
 
 	/* make sure list_len doesn't go past end of SMB */
 	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + get_bcc(&pSMBr->hdr);
 	if ((char *)ea_response_data + list_len > end_of_smb) {
-		cFYI(1, "EA list appears to go beyond SMB");
+		cifs_dbg(FYI, "EA list appears to go beyond SMB\n");
 		rc = -EIO;
 		goto QAllEAsOut;
 	}
@@ -6128,7 +6130,7 @@ QAllEAsRetry:
 		temp_ptr += 4;
 		/* make sure we can read name_len and value_len */
 		if (list_len < 0) {
-			cFYI(1, "EA entry goes beyond length of list");
+			cifs_dbg(FYI, "EA entry goes beyond length of list\n");
 			rc = -EIO;
 			goto QAllEAsOut;
 		}
@@ -6137,7 +6139,7 @@ QAllEAsRetry:
 		value_len = le16_to_cpu(temp_fea->value_len);
 		list_len -= name_len + 1 + value_len;
 		if (list_len < 0) {
-			cFYI(1, "EA entry goes beyond length of list");
+			cifs_dbg(FYI, "EA entry goes beyond length of list\n");
 			rc = -EIO;
 			goto QAllEAsOut;
 		}
@@ -6205,7 +6207,7 @@ CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon,
 	int bytes_returned = 0;
 	__u16 params, param_offset, byte_count, offset, count;
 
-	cFYI(1, "In SetEA");
+	cifs_dbg(FYI, "In SetEA\n");
 SetEARetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -6287,7 +6289,7 @@ SetEARetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, "SetPathInfo (EA) returned %d", rc);
+		cifs_dbg(FYI, "SetPathInfo (EA) returned %d\n", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -6330,7 +6332,7 @@ int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon,
 	struct dir_notify_req *dnotify_req;
 	int bytes_returned;
 
-	cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
+	cifs_dbg(FYI, "In CIFSSMBNotify for file handle %d\n", (int)netfid);
 	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
 	if (rc)
@@ -6359,7 +6361,7 @@ int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon,
 			 (struct smb_hdr *)pSMBr, &bytes_returned,
 			 CIFS_ASYNC_OP);
 	if (rc) {
-		cFYI(1, "Error in Notify = %d", rc);
+		cifs_dbg(FYI, "Error in Notify = %d\n", rc);
 	} else {
 		/* Add file to outstanding requests */
 		/* BB change to kmem cache alloc */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 12b3da39733..afcb8a1a33b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -85,7 +85,7 @@ enum {
 	Opt_acl, Opt_noacl, Opt_locallease,
 	Opt_sign, Opt_seal, Opt_noac,
 	Opt_fsc, Opt_mfsymlinks,
-	Opt_multiuser, Opt_sloppy,
+	Opt_multiuser, Opt_sloppy, Opt_nosharesock,
 
 	/* Mount options which take numeric value */
 	Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -95,9 +95,7 @@ enum {
 
 	/* Mount options which take string value */
 	Opt_user, Opt_pass, Opt_ip,
-	Opt_unc, Opt_domain,
-	Opt_srcaddr, Opt_prefixpath,
-	Opt_iocharset, Opt_sockopt,
+	Opt_domain, Opt_srcaddr, Opt_iocharset,
 	Opt_netbiosname, Opt_servern,
 	Opt_ver, Opt_vers, Opt_sec, Opt_cache,
 
@@ -167,6 +165,7 @@ static const match_table_t cifs_mount_option_tokens = {
 	{ Opt_mfsymlinks, "mfsymlinks" },
 	{ Opt_multiuser, "multiuser" },
 	{ Opt_sloppy, "sloppy" },
+	{ Opt_nosharesock, "nosharesock" },
 
 	{ Opt_backupuid, "backupuid=%s" },
 	{ Opt_backupgid, "backupgid=%s" },
@@ -193,16 +192,15 @@ static const match_table_t cifs_mount_option_tokens = {
 	{ Opt_blank_ip, "addr=" },
 	{ Opt_ip, "ip=%s" },
 	{ Opt_ip, "addr=%s" },
-	{ Opt_unc, "unc=%s" },
-	{ Opt_unc, "target=%s" },
-	{ Opt_unc, "path=%s" },
+	{ Opt_ignore, "unc=%s" },
+	{ Opt_ignore, "target=%s" },
+	{ Opt_ignore, "path=%s" },
 	{ Opt_domain, "dom=%s" },
 	{ Opt_domain, "domain=%s" },
 	{ Opt_domain, "workgroup=%s" },
 	{ Opt_srcaddr, "srcaddr=%s" },
-	{ Opt_prefixpath, "prefixpath=%s" },
+	{ Opt_ignore, "prefixpath=%s" },
 	{ Opt_iocharset, "iocharset=%s" },
-	{ Opt_sockopt, "sockopt=%s" },
 	{ Opt_netbiosname, "netbiosname=%s" },
 	{ Opt_servern, "servern=%s" },
 	{ Opt_ver, "ver=%s" },
@@ -278,6 +276,7 @@ static const match_table_t cifs_smb_version_tokens = {
 	{ Smb_20, SMB20_VERSION_STRING},
 	{ Smb_21, SMB21_VERSION_STRING },
 	{ Smb_30, SMB30_VERSION_STRING },
+	{ Smb_302, SMB302_VERSION_STRING },
 };
 
 static int ip_connect(struct TCP_Server_Info *server);
@@ -319,11 +318,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	server->max_read = 0;
 #endif
 
-	cFYI(1, "Reconnecting tcp session");
+	cifs_dbg(FYI, "Reconnecting tcp session\n");
 
 	/* before reconnecting the tcp session, mark the smb session (uid)
 		and the tid bad so they are not used until reconnected */
-	cFYI(1, "%s: marking sessions and tcons for reconnect", __func__);
+	cifs_dbg(FYI, "%s: marking sessions and tcons for reconnect\n",
+		 __func__);
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &server->smb_ses_list) {
 		ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
@@ -337,15 +337,14 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	spin_unlock(&cifs_tcp_ses_lock);
 
 	/* do not want to be sending data on a socket we are freeing */
-	cFYI(1, "%s: tearing down socket", __func__);
+	cifs_dbg(FYI, "%s: tearing down socket\n", __func__);
 	mutex_lock(&server->srv_mutex);
 	if (server->ssocket) {
-		cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
-			server->ssocket->flags);
+		cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n",
+			 server->ssocket->state, server->ssocket->flags);
 		kernel_sock_shutdown(server->ssocket, SHUT_WR);
-		cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx",
-			server->ssocket->state,
-			server->ssocket->flags);
+		cifs_dbg(FYI, "Post shutdown state: 0x%x Flags: 0x%lx\n",
+			 server->ssocket->state, server->ssocket->flags);
 		sock_release(server->ssocket);
 		server->ssocket = NULL;
 	}
@@ -359,7 +358,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 
 	/* mark submitted MIDs for retry and issue callback */
 	INIT_LIST_HEAD(&retry_list);
-	cFYI(1, "%s: moving mids to private list", __func__);
+	cifs_dbg(FYI, "%s: moving mids to private list\n", __func__);
 	spin_lock(&GlobalMid_Lock);
 	list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
 		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
@@ -369,7 +368,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	}
 	spin_unlock(&GlobalMid_Lock);
 
-	cFYI(1, "%s: issuing mid callbacks", __func__);
+	cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
 	list_for_each_safe(tmp, tmp2, &retry_list) {
 		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
 		list_del_init(&mid_entry->qhead);
@@ -382,7 +381,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		/* we should try only the port we connected to before */
 		rc = generic_ip_connect(server);
 		if (rc) {
-			cFYI(1, "reconnect error %d", rc);
+			cifs_dbg(FYI, "reconnect error %d\n", rc);
 			msleep(3000);
 		} else {
 			atomic_inc(&tcpSesReconnectCount);
@@ -416,8 +415,8 @@ cifs_echo_request(struct work_struct *work)
 
 	rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS;
 	if (rc)
-		cFYI(1, "Unable to send echo request to server: %s",
-			server->hostname);
+		cifs_dbg(FYI, "Unable to send echo request to server: %s\n",
+			 server->hostname);
 
 requeue_echo:
 	queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL);
@@ -429,7 +428,7 @@ allocate_buffers(struct TCP_Server_Info *server)
 	if (!server->bigbuf) {
 		server->bigbuf = (char *)cifs_buf_get();
 		if (!server->bigbuf) {
-			cERROR(1, "No memory for large SMB response");
+			cifs_dbg(VFS, "No memory for large SMB response\n");
 			msleep(3000);
 			/* retry will check if exiting */
 			return false;
@@ -442,7 +441,7 @@ allocate_buffers(struct TCP_Server_Info *server)
 	if (!server->smallbuf) {
 		server->smallbuf = (char *)cifs_small_buf_get();
 		if (!server->smallbuf) {
-			cERROR(1, "No memory for SMB response");
+			cifs_dbg(VFS, "No memory for SMB response\n");
 			msleep(1000);
 			/* retry will check if exiting */
 			return false;
@@ -472,9 +471,8 @@ server_unresponsive(struct TCP_Server_Info *server)
 	 */
 	if (server->tcpStatus == CifsGood &&
 	    time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) {
-		cERROR(1, "Server %s has not responded in %d seconds. "
-			  "Reconnecting...", server->hostname,
-			  (2 * SMB_ECHO_INTERVAL) / HZ);
+		cifs_dbg(VFS, "Server %s has not responded in %d seconds. Reconnecting...\n",
+			 server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ);
 		cifs_reconnect(server);
 		wake_up(&server->response_q);
 		return true;
@@ -585,8 +583,8 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
 			length = 0;
 			continue;
 		} else if (length <= 0) {
-			cFYI(1, "Received no data or error: expecting %d "
-				"got %d", to_read, length);
+			cifs_dbg(FYI, "Received no data or error: expecting %d\n"
+				 "got %d", to_read, length);
 			cifs_reconnect(server);
 			total_read = -EAGAIN;
 			break;
@@ -620,17 +618,17 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
 		/* Regular SMB response */
 		return true;
 	case RFC1002_SESSION_KEEP_ALIVE:
-		cFYI(1, "RFC 1002 session keep alive");
+		cifs_dbg(FYI, "RFC 1002 session keep alive\n");
 		break;
 	case RFC1002_POSITIVE_SESSION_RESPONSE:
-		cFYI(1, "RFC 1002 positive session response");
+		cifs_dbg(FYI, "RFC 1002 positive session response\n");
 		break;
 	case RFC1002_NEGATIVE_SESSION_RESPONSE:
 		/*
 		 * We get this from Windows 98 instead of an error on
 		 * SMB negprot response.
 		 */
-		cFYI(1, "RFC 1002 negative session response");
+		cifs_dbg(FYI, "RFC 1002 negative session response\n");
 		/* give server a second to clean up */
 		msleep(1000);
 		/*
@@ -644,7 +642,7 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
 		wake_up(&server->response_q);
 		break;
 	default:
-		cERROR(1, "RFC 1002 unknown response type 0x%x", type);
+		cifs_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", type);
 		cifs_reconnect(server);
 	}
 
@@ -730,7 +728,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
 		spin_lock(&GlobalMid_Lock);
 		list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
 			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
-			cFYI(1, "Clearing mid 0x%llx", mid_entry->mid);
+			cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid);
 			mid_entry->mid_state = MID_SHUTDOWN;
 			list_move(&mid_entry->qhead, &dispose_list);
 		}
@@ -739,7 +737,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
 		/* now walk dispose list and issue callbacks */
 		list_for_each_safe(tmp, tmp2, &dispose_list) {
 			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
-			cFYI(1, "Callback mid 0x%llx", mid_entry->mid);
+			cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid);
 			list_del_init(&mid_entry->qhead);
 			mid_entry->callback(mid_entry);
 		}
@@ -756,7 +754,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
 		 * least 45 seconds before giving up on a request getting a
 		 * response and going ahead and killing cifsd.
 		 */
-		cFYI(1, "Wait for exit from demultiplex thread");
+		cifs_dbg(FYI, "Wait for exit from demultiplex thread\n");
 		msleep(46000);
 		/*
 		 * If threads still have not exited they are probably never
@@ -783,8 +781,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 
 	/* make sure this will fit in a large buffer */
 	if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server) - 4) {
-		cERROR(1, "SMB response too long (%u bytes)",
-			pdu_length);
+		cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
 		cifs_reconnect(server);
 		wake_up(&server->response_q);
 		return -EAGAIN;
@@ -842,7 +839,7 @@ cifs_demultiplex_thread(void *p)
 	struct mid_q_entry *mid_entry;
 
 	current->flags |= PF_MEMALLOC;
-	cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
+	cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current));
 
 	length = atomic_inc_return(&tcpSesAllocCount);
 	if (length > 1)
@@ -872,14 +869,14 @@ cifs_demultiplex_thread(void *p)
 		 */
 		pdu_length = get_rfc1002_length(buf);
 
-		cFYI(1, "RFC1002 header 0x%x", pdu_length);
+		cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length);
 		if (!is_smb_response(server, buf[0]))
 			continue;
 
 		/* make sure we have enough to get to the MID */
 		if (pdu_length < HEADER_SIZE(server) - 1 - 4) {
-			cERROR(1, "SMB response too short (%u bytes)",
-				pdu_length);
+			cifs_dbg(VFS, "SMB response too short (%u bytes)\n",
+				 pdu_length);
 			cifs_reconnect(server);
 			wake_up(&server->response_q);
 			continue;
@@ -911,8 +908,8 @@ cifs_demultiplex_thread(void *p)
 				mid_entry->callback(mid_entry);
 		} else if (!server->ops->is_oplock_break ||
 			   !server->ops->is_oplock_break(buf, server)) {
-			cERROR(1, "No task to wake, unknown frame received! "
-				   "NumMids %d", atomic_read(&midCount));
+			cifs_dbg(VFS, "No task to wake, unknown frame received! NumMids %d\n",
+				 atomic_read(&midCount));
 			cifs_dump_mem("Received Data is: ", buf,
 				      HEADER_SIZE(server));
 #ifdef CONFIG_CIFS_DEBUG2
@@ -987,6 +984,41 @@ static int get_option_ul(substring_t args[], unsigned long *option)
 	return rc;
 }
 
+static int get_option_uid(substring_t args[], kuid_t *result)
+{
+	unsigned long value;
+	kuid_t uid;
+	int rc;
+
+	rc = get_option_ul(args, &value);
+	if (rc)
+		return rc;
+
+	uid = make_kuid(current_user_ns(), value);
+	if (!uid_valid(uid))
+		return -EINVAL;
+
+	*result = uid;
+	return 0;
+}
+
+static int get_option_gid(substring_t args[], kgid_t *result)
+{
+	unsigned long value;
+	kgid_t gid;
+	int rc;
+
+	rc = get_option_ul(args, &value);
+	if (rc)
+		return rc;
+
+	gid = make_kgid(current_user_ns(), value);
+	if (!gid_valid(gid))
+		return -EINVAL;
+
+	*result = gid;
+	return 0;
+}
 
 static int cifs_parse_security_flavors(char *value,
 				       struct smb_vol *vol)
@@ -994,46 +1026,51 @@ static int cifs_parse_security_flavors(char *value,
 
 	substring_t args[MAX_OPT_ARGS];
 
+	/*
+	 * With mount options, the last one should win. Reset any existing
+	 * settings back to default.
+	 */
+	vol->sectype = Unspecified;
+	vol->sign = false;
+
 	switch (match_token(value, cifs_secflavor_tokens, args)) {
-	case Opt_sec_krb5:
-		vol->secFlg |= CIFSSEC_MAY_KRB5;
-		break;
-	case Opt_sec_krb5i:
-		vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN;
-		break;
 	case Opt_sec_krb5p:
-		/* vol->secFlg |= CIFSSEC_MUST_SEAL | CIFSSEC_MAY_KRB5; */
-		cERROR(1, "Krb5 cifs privacy not supported");
-		break;
-	case Opt_sec_ntlmssp:
-		vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
+		cifs_dbg(VFS, "sec=krb5p is not supported!\n");
+		return 1;
+	case Opt_sec_krb5i:
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_sec_krb5:
+		vol->sectype = Kerberos;
 		break;
 	case Opt_sec_ntlmsspi:
-		vol->secFlg |= CIFSSEC_MAY_NTLMSSP | CIFSSEC_MUST_SIGN;
-		break;
-	case Opt_ntlm:
-		/* ntlm is default so can be turned off too */
-		vol->secFlg |= CIFSSEC_MAY_NTLM;
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_sec_ntlmssp:
+		vol->sectype = RawNTLMSSP;
 		break;
 	case Opt_sec_ntlmi:
-		vol->secFlg |= CIFSSEC_MAY_NTLM | CIFSSEC_MUST_SIGN;
-		break;
-	case Opt_sec_ntlmv2:
-		vol->secFlg |= CIFSSEC_MAY_NTLMV2;
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_ntlm:
+		vol->sectype = NTLM;
 		break;
 	case Opt_sec_ntlmv2i:
-		vol->secFlg |= CIFSSEC_MAY_NTLMV2 | CIFSSEC_MUST_SIGN;
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_sec_ntlmv2:
+		vol->sectype = NTLMv2;
 		break;
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	case Opt_sec_lanman:
-		vol->secFlg |= CIFSSEC_MAY_LANMAN;
+		vol->sectype = LANMAN;
 		break;
 #endif
 	case Opt_sec_none:
 		vol->nullauth = 1;
 		break;
 	default:
-		cERROR(1, "bad security option: %s", value);
+		cifs_dbg(VFS, "bad security option: %s\n", value);
 		return 1;
 	}
 
@@ -1059,7 +1096,7 @@ cifs_parse_cache_flavor(char *value, struct smb_vol *vol)
 		vol->strict_io = false;
 		break;
 	default:
-		cERROR(1, "bad cache= option: %s", value);
+		cifs_dbg(VFS, "bad cache= option: %s\n", value);
 		return 1;
 	}
 	return 0;
@@ -1088,9 +1125,13 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
 		vol->ops = &smb30_operations;
 		vol->vals = &smb30_values;
 		break;
+	case Smb_302:
+		vol->ops = &smb30_operations; /* currently identical with 3.0 */
+		vol->vals = &smb302_values;
+		break;
 #endif
 	default:
-		cERROR(1, "Unknown vers= option specified: %s", value);
+		cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value);
 		return 1;
 	}
 	return 0;
@@ -1221,20 +1262,24 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			separator[0] = options[4];
 			options += 5;
 		} else {
-			cFYI(1, "Null separator not allowed");
+			cifs_dbg(FYI, "Null separator not allowed\n");
 		}
 	}
 	vol->backupuid_specified = false; /* no backup intent for a user */
 	vol->backupgid_specified = false; /* no backup intent for a group */
 
-	/*
-	 * For now, we ignore -EINVAL errors under the assumption that the
-	 * unc= and prefixpath= options will be usable.
-	 */
-	if (cifs_parse_devname(devname, vol) == -ENOMEM) {
-		printk(KERN_ERR "CIFS: Unable to allocate memory to parse "
-				"device string.\n");
-		goto out_nomem;
+	switch (cifs_parse_devname(devname, vol)) {
+	case 0:
+		break;
+	case -ENOMEM:
+		cifs_dbg(VFS, "Unable to allocate memory for devname.\n");
+		goto cifs_parse_mount_err;
+	case -EINVAL:
+		cifs_dbg(VFS, "Malformed UNC in devname.\n");
+		goto cifs_parse_mount_err;
+	default:
+		cifs_dbg(VFS, "Unknown error parsing devname.\n");
+		goto cifs_parse_mount_err;
 	}
 
 	while ((data = strsep(&options, separator)) != NULL) {
@@ -1389,7 +1434,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			vol->local_lease = 1;
 			break;
 		case Opt_sign:
-			vol->secFlg |= CIFSSEC_MUST_SIGN;
+			vol->sign = true;
 			break;
 		case Opt_seal:
 			/* we do not do the following in secFlags because seal
@@ -1406,8 +1451,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			break;
 		case Opt_fsc:
 #ifndef CONFIG_CIFS_FSCACHE
-			cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE "
-				  "kernel config option set");
+			cifs_dbg(VFS, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n");
 			goto cifs_parse_mount_err;
 #endif
 			vol->fsc = true;
@@ -1421,64 +1465,62 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 		case Opt_sloppy:
 			sloppy = true;
 			break;
+		case Opt_nosharesock:
+			vol->nosharesock = true;
+			break;
 
 		/* Numeric Values */
 		case Opt_backupuid:
-			if (get_option_ul(args, &option)) {
-				cERROR(1, "%s: Invalid backupuid value",
-					__func__);
+			if (get_option_uid(args, &vol->backupuid)) {
+				cifs_dbg(VFS, "%s: Invalid backupuid value\n",
+					 __func__);
 				goto cifs_parse_mount_err;
 			}
-			vol->backupuid = option;
 			vol->backupuid_specified = true;
 			break;
 		case Opt_backupgid:
-			if (get_option_ul(args, &option)) {
-				cERROR(1, "%s: Invalid backupgid value",
-					__func__);
+			if (get_option_gid(args, &vol->backupgid)) {
+				cifs_dbg(VFS, "%s: Invalid backupgid value\n",
+					 __func__);
 				goto cifs_parse_mount_err;
 			}
-			vol->backupgid = option;
 			vol->backupgid_specified = true;
 			break;
 		case Opt_uid:
-			if (get_option_ul(args, &option)) {
-				cERROR(1, "%s: Invalid uid value",
-					__func__);
+			if (get_option_uid(args, &vol->linux_uid)) {
+				cifs_dbg(VFS, "%s: Invalid uid value\n",
+					 __func__);
 				goto cifs_parse_mount_err;
 			}
-			vol->linux_uid = option;
 			uid_specified = true;
 			break;
 		case Opt_cruid:
-			if (get_option_ul(args, &option)) {
-				cERROR(1, "%s: Invalid cruid value",
-					__func__);
+			if (get_option_uid(args, &vol->cred_uid)) {
+				cifs_dbg(VFS, "%s: Invalid cruid value\n",
+					 __func__);
 				goto cifs_parse_mount_err;
 			}
-			vol->cred_uid = option;
 			break;
 		case Opt_gid:
-			if (get_option_ul(args, &option)) {
-				cERROR(1, "%s: Invalid gid value",
-						__func__);
+			if (get_option_gid(args, &vol->linux_gid)) {
+				cifs_dbg(VFS, "%s: Invalid gid value\n",
+					 __func__);
 				goto cifs_parse_mount_err;
 			}
-			vol->linux_gid = option;
 			gid_specified = true;
 			break;
 		case Opt_file_mode:
 			if (get_option_ul(args, &option)) {
-				cERROR(1, "%s: Invalid file_mode value",
-					__func__);
+				cifs_dbg(VFS, "%s: Invalid file_mode value\n",
+					 __func__);
 				goto cifs_parse_mount_err;
 			}
 			vol->file_mode = option;
 			break;
 		case Opt_dirmode:
 			if (get_option_ul(args, &option)) {
-				cERROR(1, "%s: Invalid dir_mode value",
-					__func__);
+				cifs_dbg(VFS, "%s: Invalid dir_mode value\n",
+					 __func__);
 				goto cifs_parse_mount_err;
 			}
 			vol->dir_mode = option;
@@ -1486,37 +1528,37 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 		case Opt_port:
 			if (get_option_ul(args, &option) ||
 			    option > USHRT_MAX) {
-				cERROR(1, "%s: Invalid port value", __func__);
+				cifs_dbg(VFS, "%s: Invalid port value\n",
+					 __func__);
 				goto cifs_parse_mount_err;
 			}
 			port = (unsigned short)option;
 			break;
 		case Opt_rsize:
 			if (get_option_ul(args, &option)) {
-				cERROR(1, "%s: Invalid rsize value",
-					__func__);
+				cifs_dbg(VFS, "%s: Invalid rsize value\n",
+					 __func__);
 				goto cifs_parse_mount_err;
 			}
 			vol->rsize = option;
 			break;
 		case Opt_wsize:
 			if (get_option_ul(args, &option)) {
-				cERROR(1, "%s: Invalid wsize value",
-					__func__);
+				cifs_dbg(VFS, "%s: Invalid wsize value\n",
+					 __func__);
 				goto cifs_parse_mount_err;
 			}
 			vol->wsize = option;
 			break;
 		case Opt_actimeo:
 			if (get_option_ul(args, &option)) {
-				cERROR(1, "%s: Invalid actimeo value",
-					__func__);
+				cifs_dbg(VFS, "%s: Invalid actimeo value\n",
+					 __func__);
 				goto cifs_parse_mount_err;
 			}
 			vol->actimeo = HZ * option;
 			if (vol->actimeo > CIFS_MAX_ACTIMEO) {
-				cERROR(1, "CIFS: attribute cache"
-					  "timeout too large");
+				cifs_dbg(VFS, "attribute cache timeout too large\n");
 				goto cifs_parse_mount_err;
 			}
 			break;
@@ -1539,21 +1581,28 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 				goto cifs_parse_mount_err;
 			}
 			vol->username = kstrdup(string, GFP_KERNEL);
-			if (!vol->username) {
-				printk(KERN_WARNING "CIFS: no memory "
-						    "for username\n");
+			if (!vol->username)
 				goto cifs_parse_mount_err;
-			}
 			break;
 		case Opt_blank_pass:
-			vol->password = NULL;
-			break;
-		case Opt_pass:
 			/* passwords have to be handled differently
 			 * to allow the character used for deliminator
 			 * to be passed within them
 			 */
 
+			/*
+			 * Check if this is a case where the  password
+			 * starts with a delimiter
+			 */
+			tmp_end = strchr(data, '=');
+			tmp_end++;
+			if (!(tmp_end < end && tmp_end[1] == delim)) {
+				/* No it is not. Set the password to NULL */
+				vol->password = NULL;
+				break;
+			}
+			/* Yes it is. Drop down to Opt_pass below.*/
+		case Opt_pass:
 			/* Obtain the value string */
 			value = strchr(data, '=');
 			value++;
@@ -1621,30 +1670,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			}
 			got_ip = true;
 			break;
-		case Opt_unc:
-			string = vol->UNC;
-			vol->UNC = match_strdup(args);
-			if (vol->UNC == NULL)
-				goto out_nomem;
-
-			convert_delimiter(vol->UNC, '\\');
-			if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') {
-				printk(KERN_ERR "CIFS: UNC Path does not "
-						"begin with // or \\\\\n");
-				goto cifs_parse_mount_err;
-			}
-
-			/* Compare old unc= option to new one */
-			if (!string || strcmp(string, vol->UNC))
-				printk(KERN_WARNING "CIFS: the value of the "
-					"unc= mount option does not match the "
-					"device string. Using the unc= option "
-					"for now. In 3.10, that option will "
-					"be ignored and the contents of the "
-					"device string will be used "
-					"instead. (%s != %s)\n", string,
-					vol->UNC);
-			break;
 		case Opt_domain:
 			string = match_strdup(args);
 			if (string == NULL)
@@ -1662,7 +1687,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 						    "for domainname\n");
 				goto cifs_parse_mount_err;
 			}
-			cFYI(1, "Domain name set");
+			cifs_dbg(FYI, "Domain name set\n");
 			break;
 		case Opt_srcaddr:
 			string = match_strdup(args);
@@ -1677,26 +1702,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 				goto cifs_parse_mount_err;
 			}
 			break;
-		case Opt_prefixpath:
-			/* skip over any leading delimiter */
-			if (*args[0].from == '/' || *args[0].from == '\\')
-				args[0].from++;
-
-			string = vol->prepath;
-			vol->prepath = match_strdup(args);
-			if (vol->prepath == NULL)
-				goto out_nomem;
-			/* Compare old prefixpath= option to new one */
-			if (!string || strcmp(string, vol->prepath))
-				printk(KERN_WARNING "CIFS: the value of the "
-					"prefixpath= mount option does not "
-					"match the device string. Using the "
-					"prefixpath= option for now. In 3.10, "
-					"that option will be ignored and the "
-					"contents of the device string will be "
-					"used instead.(%s != %s)\n", string,
-					vol->prepath);
-			break;
 		case Opt_iocharset:
 			string = match_strdup(args);
 			if (string == NULL)
@@ -1720,20 +1725,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			/* if iocharset not set then load_nls_default
 			 * is used by caller
 			 */
-			cFYI(1, "iocharset set to %s", string);
-			break;
-		case Opt_sockopt:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-
-			if (strnicmp(string, "TCP_NODELAY", 11) == 0) {
-				printk(KERN_WARNING "CIFS: the "
-					"sockopt=TCP_NODELAY option has been "
-					"deprecated and will be removed "
-					"in 3.9\n");
-				vol->sockopt_tcp_nodelay = 1;
-			}
+			 cifs_dbg(FYI, "iocharset set to %s\n", string);
 			break;
 		case Opt_netbiosname:
 			string = match_strdup(args);
@@ -1847,20 +1839,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 #ifndef CONFIG_KEYS
 	/* Muliuser mounts require CONFIG_KEYS support */
 	if (vol->multiuser) {
-		cERROR(1, "Multiuser mounts require kernels with "
-			  "CONFIG_KEYS enabled.");
+		cifs_dbg(VFS, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n");
 		goto cifs_parse_mount_err;
 	}
 #endif
 	if (!vol->UNC) {
-		cERROR(1, "CIFS mount error: No usable UNC path provided in "
-			  "device string or in unc= option!");
+		cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n");
 		goto cifs_parse_mount_err;
 	}
 
 	/* make sure UNC has a share name */
 	if (!strchr(vol->UNC + 3, '\\')) {
-		cERROR(1, "Malformed UNC. Unable to find share name.");
+		cifs_dbg(VFS, "Malformed UNC. Unable to find share name.\n");
 		goto cifs_parse_mount_err;
 	}
 
@@ -2001,47 +1991,21 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
 static bool
 match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
 {
-	unsigned int secFlags;
-
-	if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
-		secFlags = vol->secFlg;
-	else
-		secFlags = global_secflags | vol->secFlg;
-
-	switch (server->secType) {
-	case LANMAN:
-		if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT)))
-			return false;
-		break;
-	case NTLMv2:
-		if (!(secFlags & CIFSSEC_MAY_NTLMV2))
-			return false;
-		break;
-	case NTLM:
-		if (!(secFlags & CIFSSEC_MAY_NTLM))
-			return false;
-		break;
-	case Kerberos:
-		if (!(secFlags & CIFSSEC_MAY_KRB5))
-			return false;
-		break;
-	case RawNTLMSSP:
-		if (!(secFlags & CIFSSEC_MAY_NTLMSSP))
-			return false;
-		break;
-	default:
-		/* shouldn't happen */
+	/*
+	 * The select_sectype function should either return the vol->sectype
+	 * that was specified, or "Unspecified" if that sectype was not
+	 * compatible with the given NEGOTIATE request.
+	 */
+	if (select_sectype(server, vol->sectype) == Unspecified)
 		return false;
-	}
 
-	/* now check if signing mode is acceptable */
-	if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
-	    (server->sec_mode & SECMODE_SIGN_REQUIRED))
-			return false;
-	else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) &&
-		 (server->sec_mode &
-		  (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0)
-			return false;
+	/*
+	 * Now check if signing mode is acceptable. No need to check
+	 * global_secflags at this point since if MUST_SIGN is set then
+	 * the server->sign had better be too.
+	 */
+	if (vol->sign && !server->sign)
+		return false;
 
 	return true;
 }
@@ -2050,6 +2014,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
 {
 	struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr;
 
+	if (vol->nosharesock)
+		return 0;
+
 	if ((server->vals != vol->vals) || (server->ops != vol->ops))
 		return 0;
 
@@ -2081,7 +2048,7 @@ cifs_find_tcp_session(struct smb_vol *vol)
 
 		++server->srv_count;
 		spin_unlock(&cifs_tcp_ses_lock);
-		cFYI(1, "Existing tcp session with server found");
+		cifs_dbg(FYI, "Existing tcp session with server found\n");
 		return server;
 	}
 	spin_unlock(&cifs_tcp_ses_lock);
@@ -2128,7 +2095,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	struct TCP_Server_Info *tcp_ses = NULL;
 	int rc;
 
-	cFYI(1, "UNC: %s", volume_info->UNC);
+	cifs_dbg(FYI, "UNC: %s\n", volume_info->UNC);
 
 	/* see if we already have a matching tcp_ses */
 	tcp_ses = cifs_find_tcp_session(volume_info);
@@ -2143,7 +2110,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 
 	rc = cifs_crypto_shash_allocate(tcp_ses);
 	if (rc) {
-		cERROR(1, "could not setup hash structures rc %d", rc);
+		cifs_dbg(VFS, "could not setup hash structures rc %d\n", rc);
 		goto out_err;
 	}
 
@@ -2190,7 +2157,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 
 	rc = ip_connect(tcp_ses);
 	if (rc < 0) {
-		cERROR(1, "Error connecting to socket. Aborting operation");
+		cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n");
 		goto out_err_crypto_release;
 	}
 
@@ -2203,7 +2170,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 				  tcp_ses, "cifsd");
 	if (IS_ERR(tcp_ses->tsk)) {
 		rc = PTR_ERR(tcp_ses->tsk);
-		cERROR(1, "error %d create cifsd thread", rc);
+		cifs_dbg(VFS, "error %d create cifsd thread\n", rc);
 		module_put(THIS_MODULE);
 		goto out_err_crypto_release;
 	}
@@ -2239,9 +2206,13 @@ out_err:
 
 static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
 {
-	switch (ses->server->secType) {
+	if (vol->sectype != Unspecified &&
+	    vol->sectype != ses->sectype)
+		return 0;
+
+	switch (ses->sectype) {
 	case Kerberos:
-		if (vol->cred_uid != ses->cred_uid)
+		if (!uid_eq(vol->cred_uid, ses->cred_uid))
 			return 0;
 		break;
 	default:
@@ -2290,7 +2261,7 @@ cifs_put_smb_ses(struct cifs_ses *ses)
 	unsigned int xid;
 	struct TCP_Server_Info *server = ses->server;
 
-	cFYI(1, "%s: ses_count=%d", __func__, ses->ses_count);
+	cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
 	spin_lock(&cifs_tcp_ses_lock);
 	if (--ses->ses_count > 0) {
 		spin_unlock(&cifs_tcp_ses_lock);
@@ -2342,23 +2313,24 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 		sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr);
 		break;
 	default:
-		cFYI(1, "Bad ss_family (%hu)", server->dstaddr.ss_family);
+		cifs_dbg(FYI, "Bad ss_family (%hu)\n",
+			 server->dstaddr.ss_family);
 		rc = -EINVAL;
 		goto out_err;
 	}
 
-	cFYI(1, "%s: desc=%s", __func__, desc);
+	cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc);
 	key = request_key(&key_type_logon, desc, "");
 	if (IS_ERR(key)) {
 		if (!ses->domainName) {
-			cFYI(1, "domainName is NULL");
+			cifs_dbg(FYI, "domainName is NULL\n");
 			rc = PTR_ERR(key);
 			goto out_err;
 		}
 
 		/* didn't work, try to find a domain key */
 		sprintf(desc, "cifs:d:%s", ses->domainName);
-		cFYI(1, "%s: desc=%s", __func__, desc);
+		cifs_dbg(FYI, "%s: desc=%s\n", __func__, desc);
 		key = request_key(&key_type_logon, desc, "");
 		if (IS_ERR(key)) {
 			rc = PTR_ERR(key);
@@ -2376,32 +2348,34 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 	/* find first : in payload */
 	payload = (char *)upayload->data;
 	delim = strnchr(payload, upayload->datalen, ':');
-	cFYI(1, "payload=%s", payload);
+	cifs_dbg(FYI, "payload=%s\n", payload);
 	if (!delim) {
-		cFYI(1, "Unable to find ':' in payload (datalen=%d)",
-				upayload->datalen);
+		cifs_dbg(FYI, "Unable to find ':' in payload (datalen=%d)\n",
+			 upayload->datalen);
 		rc = -EINVAL;
 		goto out_key_put;
 	}
 
 	len = delim - payload;
 	if (len > MAX_USERNAME_SIZE || len <= 0) {
-		cFYI(1, "Bad value from username search (len=%zd)", len);
+		cifs_dbg(FYI, "Bad value from username search (len=%zd)\n",
+			 len);
 		rc = -EINVAL;
 		goto out_key_put;
 	}
 
 	vol->username = kstrndup(payload, len, GFP_KERNEL);
 	if (!vol->username) {
-		cFYI(1, "Unable to allocate %zd bytes for username", len);
+		cifs_dbg(FYI, "Unable to allocate %zd bytes for username\n",
+			 len);
 		rc = -ENOMEM;
 		goto out_key_put;
 	}
-	cFYI(1, "%s: username=%s", __func__, vol->username);
+	cifs_dbg(FYI, "%s: username=%s\n", __func__, vol->username);
 
 	len = key->datalen - (len + 1);
 	if (len > MAX_PASSWORD_SIZE || len <= 0) {
-		cFYI(1, "Bad len for password search (len=%zd)", len);
+		cifs_dbg(FYI, "Bad len for password search (len=%zd)\n", len);
 		rc = -EINVAL;
 		kfree(vol->username);
 		vol->username = NULL;
@@ -2411,7 +2385,8 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 	++delim;
 	vol->password = kstrndup(delim, len, GFP_KERNEL);
 	if (!vol->password) {
-		cFYI(1, "Unable to allocate %zd bytes for password", len);
+		cifs_dbg(FYI, "Unable to allocate %zd bytes for password\n",
+			 len);
 		rc = -ENOMEM;
 		kfree(vol->username);
 		vol->username = NULL;
@@ -2423,7 +2398,7 @@ out_key_put:
 	key_put(key);
 out_err:
 	kfree(desc);
-	cFYI(1, "%s: returning %d", __func__, rc);
+	cifs_dbg(FYI, "%s: returning %d\n", __func__, rc);
 	return rc;
 }
 #else /* ! CONFIG_KEYS */
@@ -2448,7 +2423,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 
 	ses = cifs_find_smb_ses(server, volume_info);
 	if (ses) {
-		cFYI(1, "Existing smb sess found (status=%d)", ses->status);
+		cifs_dbg(FYI, "Existing smb sess found (status=%d)\n",
+			 ses->status);
 
 		mutex_lock(&ses->session_mutex);
 		rc = cifs_negotiate_protocol(xid, ses);
@@ -2460,7 +2436,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 			return ERR_PTR(rc);
 		}
 		if (ses->need_reconnect) {
-			cFYI(1, "Session needs reconnect");
+			cifs_dbg(FYI, "Session needs reconnect\n");
 			rc = cifs_setup_session(xid, ses,
 						volume_info->local_nls);
 			if (rc) {
@@ -2479,7 +2455,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 		return ses;
 	}
 
-	cFYI(1, "Existing smb sess not found");
+	cifs_dbg(FYI, "Existing smb sess not found\n");
 	ses = sesInfoAlloc();
 	if (ses == NULL)
 		goto get_ses_fail;
@@ -2511,7 +2487,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 	ses->cred_uid = volume_info->cred_uid;
 	ses->linux_uid = volume_info->linux_uid;
 
-	ses->overrideSecFlg = volume_info->secFlg;
+	ses->sectype = volume_info->sectype;
+	ses->sign = volume_info->sign;
 
 	mutex_lock(&ses->session_mutex);
 	rc = cifs_negotiate_protocol(xid, ses);
@@ -2569,7 +2546,7 @@ cifs_put_tcon(struct cifs_tcon *tcon)
 	unsigned int xid;
 	struct cifs_ses *ses = tcon->ses;
 
-	cFYI(1, "%s: tc_count=%d", __func__, tcon->tc_count);
+	cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
 	spin_lock(&cifs_tcp_ses_lock);
 	if (--tcon->tc_count > 0) {
 		spin_unlock(&cifs_tcp_ses_lock);
@@ -2597,12 +2574,11 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
 
 	tcon = cifs_find_tcon(ses, volume_info->UNC);
 	if (tcon) {
-		cFYI(1, "Found match on UNC path");
+		cifs_dbg(FYI, "Found match on UNC path\n");
 		/* existing tcon already has a reference */
 		cifs_put_smb_ses(ses);
 		if (tcon->seal != volume_info->seal)
-			cERROR(1, "transport encryption setting "
-				   "conflicts with existing tid");
+			cifs_dbg(VFS, "transport encryption setting conflicts with existing tid\n");
 		return tcon;
 	}
 
@@ -2634,13 +2610,13 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
 	rc = ses->server->ops->tree_connect(xid, ses, volume_info->UNC, tcon,
 					    volume_info->local_nls);
 	free_xid(xid);
-	cFYI(1, "Tcon rc = %d", rc);
+	cifs_dbg(FYI, "Tcon rc = %d\n", rc);
 	if (rc)
 		goto out_fail;
 
 	if (volume_info->nodfs) {
 		tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
-		cFYI(1, "DFS disabled (%d)", tcon->Flags);
+		cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags);
 	}
 	tcon->seal = volume_info->seal;
 	/*
@@ -2713,7 +2689,7 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
 	if (new->rsize && new->rsize < old->rsize)
 		return 0;
 
-	if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid)
+	if (!uid_eq(old->mnt_uid, new->mnt_uid) || !gid_eq(old->mnt_gid, new->mnt_gid))
 		return 0;
 
 	if (old->mnt_file_mode != new->mnt_file_mode ||
@@ -2794,7 +2770,7 @@ get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path,
 		strcpy(temp_unc + 2 + strlen(ses->serverName), "\\IPC$");
 		rc = ses->server->ops->tree_connect(xid, ses, temp_unc, NULL,
 						    nls_codepage);
-		cFYI(1, "Tcon rc = %d ipc_tid = %d", rc, ses->ipc_tid);
+		cifs_dbg(FYI, "Tcon rc = %d ipc_tid = %d\n", rc, ses->ipc_tid);
 		kfree(temp_unc);
 	}
 	if (rc == 0)
@@ -2872,13 +2848,11 @@ bind_socket(struct TCP_Server_Info *server)
 			saddr4 = (struct sockaddr_in *)&server->srcaddr;
 			saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
 			if (saddr6->sin6_family == AF_INET6)
-				cERROR(1, "cifs: "
-				       "Failed to bind to: %pI6c, error: %d",
-				       &saddr6->sin6_addr, rc);
+				cifs_dbg(VFS, "Failed to bind to: %pI6c, error: %d\n",
+					 &saddr6->sin6_addr, rc);
 			else
-				cERROR(1, "cifs: "
-				       "Failed to bind to: %pI4, error: %d",
-				       &saddr4->sin_addr.s_addr, rc);
+				cifs_dbg(VFS, "Failed to bind to: %pI4, error: %d\n",
+					 &saddr4->sin_addr.s_addr, rc);
 		}
 	}
 	return rc;
@@ -2983,13 +2957,13 @@ generic_ip_connect(struct TCP_Server_Info *server)
 		rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
 				   IPPROTO_TCP, &socket, 1);
 		if (rc < 0) {
-			cERROR(1, "Error %d creating socket", rc);
+			cifs_dbg(VFS, "Error %d creating socket\n", rc);
 			server->ssocket = NULL;
 			return rc;
 		}
 
 		/* BB other socket options to set KEEPALIVE, NODELAY? */
-		cFYI(1, "Socket created");
+		cifs_dbg(FYI, "Socket created\n");
 		server->ssocket = socket;
 		socket->sk->sk_allocation = GFP_NOFS;
 		if (sfamily == AF_INET6)
@@ -3023,16 +2997,17 @@ generic_ip_connect(struct TCP_Server_Info *server)
 		rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
 				(char *)&val, sizeof(val));
 		if (rc)
-			cFYI(1, "set TCP_NODELAY socket option error %d", rc);
+			cifs_dbg(FYI, "set TCP_NODELAY socket option error %d\n",
+				 rc);
 	}
 
-	 cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
+	cifs_dbg(FYI, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx\n",
 		 socket->sk->sk_sndbuf,
 		 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
 
 	rc = socket->ops->connect(socket, saddr, slen, 0);
 	if (rc < 0) {
-		cFYI(1, "Error %d connecting to server", rc);
+		cifs_dbg(FYI, "Error %d connecting to server\n", rc);
 		sock_release(socket);
 		server->ssocket = NULL;
 		return rc;
@@ -3090,19 +3065,19 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
 	if (vol_info && vol_info->no_linux_ext) {
 		tcon->fsUnixInfo.Capability = 0;
 		tcon->unix_ext = 0; /* Unix Extensions disabled */
-		cFYI(1, "Linux protocol extensions disabled");
+		cifs_dbg(FYI, "Linux protocol extensions disabled\n");
 		return;
 	} else if (vol_info)
 		tcon->unix_ext = 1; /* Unix Extensions supported */
 
 	if (tcon->unix_ext == 0) {
-		cFYI(1, "Unix extensions disabled so not set on reconnect");
+		cifs_dbg(FYI, "Unix extensions disabled so not set on reconnect\n");
 		return;
 	}
 
 	if (!CIFSSMBQFSUnixInfo(xid, tcon)) {
 		__u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
-		cFYI(1, "unix caps which server supports %lld", cap);
+		cifs_dbg(FYI, "unix caps which server supports %lld\n", cap);
 		/* check for reconnect case in which we do not
 		   want to change the mount behavior if we can avoid it */
 		if (vol_info == NULL) {
@@ -3112,22 +3087,22 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
 				cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
 			if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
 				if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
-					cERROR(1, "POSIXPATH support change");
+					cifs_dbg(VFS, "POSIXPATH support change\n");
 				cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
 			} else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
-				cERROR(1, "possible reconnect error");
-				cERROR(1, "server disabled POSIX path support");
+				cifs_dbg(VFS, "possible reconnect error\n");
+				cifs_dbg(VFS, "server disabled POSIX path support\n");
 			}
 		}
 
 		if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)
-			cERROR(1, "per-share encryption not supported yet");
+			cifs_dbg(VFS, "per-share encryption not supported yet\n");
 
 		cap &= CIFS_UNIX_CAP_MASK;
 		if (vol_info && vol_info->no_psx_acl)
 			cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
 		else if (CIFS_UNIX_POSIX_ACL_CAP & cap) {
-			cFYI(1, "negotiated posix acl support");
+			cifs_dbg(FYI, "negotiated posix acl support\n");
 			if (cifs_sb)
 				cifs_sb->mnt_cifs_flags |=
 					CIFS_MOUNT_POSIXACL;
@@ -3136,43 +3111,38 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
 		if (vol_info && vol_info->posix_paths == 0)
 			cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
 		else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) {
-			cFYI(1, "negotiate posix pathnames");
+			cifs_dbg(FYI, "negotiate posix pathnames\n");
 			if (cifs_sb)
 				cifs_sb->mnt_cifs_flags |=
 					CIFS_MOUNT_POSIX_PATHS;
 		}
 
-		cFYI(1, "Negotiate caps 0x%x", (int)cap);
+		cifs_dbg(FYI, "Negotiate caps 0x%x\n", (int)cap);
 #ifdef CONFIG_CIFS_DEBUG2
 		if (cap & CIFS_UNIX_FCNTL_CAP)
-			cFYI(1, "FCNTL cap");
+			cifs_dbg(FYI, "FCNTL cap\n");
 		if (cap & CIFS_UNIX_EXTATTR_CAP)
-			cFYI(1, "EXTATTR cap");
+			cifs_dbg(FYI, "EXTATTR cap\n");
 		if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
-			cFYI(1, "POSIX path cap");
+			cifs_dbg(FYI, "POSIX path cap\n");
 		if (cap & CIFS_UNIX_XATTR_CAP)
-			cFYI(1, "XATTR cap");
+			cifs_dbg(FYI, "XATTR cap\n");
 		if (cap & CIFS_UNIX_POSIX_ACL_CAP)
-			cFYI(1, "POSIX ACL cap");
+			cifs_dbg(FYI, "POSIX ACL cap\n");
 		if (cap & CIFS_UNIX_LARGE_READ_CAP)
-			cFYI(1, "very large read cap");
+			cifs_dbg(FYI, "very large read cap\n");
 		if (cap & CIFS_UNIX_LARGE_WRITE_CAP)
-			cFYI(1, "very large write cap");
+			cifs_dbg(FYI, "very large write cap\n");
 		if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP)
-			cFYI(1, "transport encryption cap");
+			cifs_dbg(FYI, "transport encryption cap\n");
 		if (cap & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)
-			cFYI(1, "mandatory transport encryption cap");
+			cifs_dbg(FYI, "mandatory transport encryption cap\n");
 #endif /* CIFS_DEBUG2 */
 		if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
 			if (vol_info == NULL) {
-				cFYI(1, "resetting capabilities failed");
+				cifs_dbg(FYI, "resetting capabilities failed\n");
 			} else
-				cERROR(1, "Negotiating Unix capabilities "
-					   "with the server failed. Consider "
-					   "mounting with the Unix Extensions "
-					   "disabled if problems are found "
-					   "by specifying the nounix mount "
-					   "option.");
+				cifs_dbg(VFS, "Negotiating Unix capabilities with the server failed. Consider mounting with the Unix Extensions disabled if problems are found by specifying the nounix mount option.\n");
 
 		}
 	}
@@ -3197,8 +3167,8 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 	cifs_sb->mnt_gid = pvolume_info->linux_gid;
 	cifs_sb->mnt_file_mode = pvolume_info->file_mode;
 	cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
-	cFYI(1, "file mode: 0x%hx  dir mode: 0x%hx",
-		cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
+	cifs_dbg(FYI, "file mode: 0x%hx  dir mode: 0x%hx\n",
+		 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
 
 	cifs_sb->actimeo = pvolume_info->actimeo;
 	cifs_sb->local_nls = pvolume_info->local_nls;
@@ -3247,21 +3217,19 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 	if (pvolume_info->strict_io)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO;
 	if (pvolume_info->direct_io) {
-		cFYI(1, "mounting share using direct i/o");
+		cifs_dbg(FYI, "mounting share using direct i/o\n");
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
 	}
 	if (pvolume_info->mfsymlinks) {
 		if (pvolume_info->sfu_emul) {
-			cERROR(1,  "mount option mfsymlinks ignored if sfu "
-				   "mount option is used");
+			cifs_dbg(VFS, "mount option mfsymlinks ignored if sfu mount option is used\n");
 		} else {
 			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
 		}
 	}
 
 	if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
-		cERROR(1, "mount option dynperm ignored if cifsacl "
-			   "mount option supported");
+		cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n");
 }
 
 static void
@@ -3306,14 +3274,14 @@ build_unc_path_to_root(const struct smb_vol *vol,
 	pos = full_path + unc_len;
 
 	if (pplen) {
-		*pos++ = CIFS_DIR_SEP(cifs_sb);
-		strncpy(pos, vol->prepath, pplen);
+		*pos = CIFS_DIR_SEP(cifs_sb);
+		strncpy(pos + 1, vol->prepath, pplen);
 		pos += pplen;
 	}
 
 	*pos = '\0'; /* add trailing null */
 	convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
-	cFYI(1, "%s: full_path=%s", __func__, full_path);
+	cifs_dbg(FYI, "%s: full_path=%s\n", __func__, full_path);
 	return full_path;
 }
 
@@ -3384,14 +3352,14 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
 		return -EINVAL;
 
 	if (volume_info->nullauth) {
-		cFYI(1, "Anonymous login");
+		cifs_dbg(FYI, "Anonymous login\n");
 		kfree(volume_info->username);
 		volume_info->username = NULL;
 	} else if (volume_info->username) {
 		/* BB fixme parse for domain name here */
-		cFYI(1, "Username: %s", volume_info->username);
+		cifs_dbg(FYI, "Username: %s\n", volume_info->username);
 	} else {
-		cifserror("No username specified");
+		cifs_dbg(VFS, "No username specified\n");
 	/* In userspace mount helper we can get user name from alternate
 	   locations such as env variables and files on disk */
 		return -EINVAL;
@@ -3404,7 +3372,7 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
 	} else {
 		volume_info->local_nls = load_nls(volume_info->iocharset);
 		if (volume_info->local_nls == NULL) {
-			cERROR(1, "CIFS mount error: iocharset %s not found",
+			cifs_dbg(VFS, "CIFS mount error: iocharset %s not found\n",
 				 volume_info->iocharset);
 			return -ELIBACC;
 		}
@@ -3683,7 +3651,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
 		   NTLMv2 password here) */
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
-		    (ses->server->secType == LANMAN))
+		    (ses->sectype == LANMAN))
 			calc_lanman_hash(tcon->password, ses->server->cryptkey,
 					 ses->server->sec_mode &
 					    SECMODE_PW_ENCRYPT ? true : false,
@@ -3701,8 +3669,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
 		}
 	}
 
-	if (ses->server->sec_mode &
-			(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+	if (ses->server->sign)
 		smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 
 	if (ses->capabilities & CAP_STATUS32) {
@@ -3754,18 +3721,18 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
 		if (length == 3) {
 			if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
 			    (bcc_ptr[2] == 'C')) {
-				cFYI(1, "IPC connection");
+				cifs_dbg(FYI, "IPC connection\n");
 				tcon->ipc = 1;
 			}
 		} else if (length == 2) {
 			if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) {
 				/* the most common case */
-				cFYI(1, "disk share connection");
+				cifs_dbg(FYI, "disk share connection\n");
 			}
 		}
 		bcc_ptr += length + 1;
 		bytes_left -= (length + 1);
-		strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
+		strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
 
 		/* mostly informational -- no need to fail on error here */
 		kfree(tcon->nativeFileSystem);
@@ -3773,7 +3740,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
 						      bytes_left, is_unicode,
 						      nls_codepage);
 
-		cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem);
+		cifs_dbg(FYI, "nativeFileSystem=%s\n", tcon->nativeFileSystem);
 
 		if ((smb_buffer_response->WordCount == 3) ||
 			 (smb_buffer_response->WordCount == 7))
@@ -3781,7 +3748,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
 			tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
 		else
 			tcon->Flags = 0;
-		cFYI(1, "Tcon flags: 0x%x ", tcon->Flags);
+		cifs_dbg(FYI, "Tcon flags: 0x%x\n", tcon->Flags);
 	} else if ((rc == 0) && tcon == NULL) {
 		/* all we need to save for IPC$ connection */
 		ses->ipc_tid = smb_buffer_response->Tid;
@@ -3854,31 +3821,32 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
 	int rc = -ENOSYS;
 	struct TCP_Server_Info *server = ses->server;
 
-	ses->flags = 0;
 	ses->capabilities = server->capabilities;
 	if (linuxExtEnabled == 0)
 		ses->capabilities &= (~server->vals->cap_unix);
 
-	cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
+	cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n",
 		 server->sec_mode, server->capabilities, server->timeAdj);
 
 	if (server->ops->sess_setup)
 		rc = server->ops->sess_setup(xid, ses, nls_info);
 
 	if (rc) {
-		cERROR(1, "Send error in SessSetup = %d", rc);
+		cifs_dbg(VFS, "Send error in SessSetup = %d\n", rc);
 	} else {
-		mutex_lock(&ses->server->srv_mutex);
+		mutex_lock(&server->srv_mutex);
 		if (!server->session_estab) {
 			server->session_key.response = ses->auth_key.response;
 			server->session_key.len = ses->auth_key.len;
 			server->sequence_number = 0x2;
 			server->session_estab = true;
 			ses->auth_key.response = NULL;
+			if (server->ops->generate_signingkey)
+				server->ops->generate_signingkey(server);
 		}
 		mutex_unlock(&server->srv_mutex);
 
-		cFYI(1, "CIFS Session Established successfully");
+		cifs_dbg(FYI, "CIFS Session Established successfully\n");
 		spin_lock(&GlobalMid_Lock);
 		ses->status = CifsGood;
 		ses->need_reconnect = false;
@@ -3897,29 +3865,17 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
 static int
 cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
 {
-	switch (ses->server->secType) {
-	case Kerberos:
-		vol->secFlg = CIFSSEC_MUST_KRB5;
+	vol->sectype = ses->sectype;
+
+	/* krb5 is special, since we don't need username or pw */
+	if (vol->sectype == Kerberos)
 		return 0;
-	case NTLMv2:
-		vol->secFlg = CIFSSEC_MUST_NTLMV2;
-		break;
-	case NTLM:
-		vol->secFlg = CIFSSEC_MUST_NTLM;
-		break;
-	case RawNTLMSSP:
-		vol->secFlg = CIFSSEC_MUST_NTLMSSP;
-		break;
-	case LANMAN:
-		vol->secFlg = CIFSSEC_MUST_LANMAN;
-		break;
-	}
 
 	return cifs_set_cifscreds(vol, ses);
 }
 
 static struct cifs_tcon *
-cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
+cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
 {
 	int rc;
 	struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
@@ -3939,6 +3895,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 	vol_info->nocase = master_tcon->nocase;
 	vol_info->local_lease = master_tcon->local_lease;
 	vol_info->no_linux_ext = !master_tcon->unix_ext;
+	vol_info->sectype = master_tcon->ses->sectype;
+	vol_info->sign = master_tcon->ses->sign;
 
 	rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
 	if (rc) {
@@ -3989,7 +3947,7 @@ cifs_sb_tcon_pending_wait(void *unused)
 
 /* find and return a tlink with given uid */
 static struct tcon_link *
-tlink_rb_search(struct rb_root *root, uid_t uid)
+tlink_rb_search(struct rb_root *root, kuid_t uid)
 {
 	struct rb_node *node = root->rb_node;
 	struct tcon_link *tlink;
@@ -3997,9 +3955,9 @@ tlink_rb_search(struct rb_root *root, uid_t uid)
 	while (node) {
 		tlink = rb_entry(node, struct tcon_link, tl_rbnode);
 
-		if (tlink->tl_uid > uid)
+		if (uid_gt(tlink->tl_uid, uid))
 			node = node->rb_left;
-		else if (tlink->tl_uid < uid)
+		else if (uid_lt(tlink->tl_uid, uid))
 			node = node->rb_right;
 		else
 			return tlink;
@@ -4018,7 +3976,7 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink)
 		tlink = rb_entry(*new, struct tcon_link, tl_rbnode);
 		parent = *new;
 
-		if (tlink->tl_uid > new_tlink->tl_uid)
+		if (uid_gt(tlink->tl_uid, new_tlink->tl_uid))
 			new = &((*new)->rb_left);
 		else
 			new = &((*new)->rb_right);
@@ -4048,7 +4006,7 @@ struct tcon_link *
 cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
 {
 	int ret;
-	uid_t fsuid = current_fsuid();
+	kuid_t fsuid = current_fsuid();
 	struct tcon_link *tlink, *newtlink;
 
 	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 8719bbe0dcc..5175aebf673 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -102,7 +102,7 @@ cifs_bp_rename_retry:
 		namelen += (1 + temp->d_name.len);
 		temp = temp->d_parent;
 		if (temp == NULL) {
-			cERROR(1, "corrupt dentry");
+			cifs_dbg(VFS, "corrupt dentry\n");
 			rcu_read_unlock();
 			return NULL;
 		}
@@ -124,12 +124,12 @@ cifs_bp_rename_retry:
 			full_path[namelen] = dirsep;
 			strncpy(full_path + namelen + 1, temp->d_name.name,
 				temp->d_name.len);
-			cFYI(0, "name: %s", full_path + namelen);
+			cifs_dbg(FYI, "name: %s\n", full_path + namelen);
 		}
 		spin_unlock(&temp->d_lock);
 		temp = temp->d_parent;
 		if (temp == NULL) {
-			cERROR(1, "corrupt dentry");
+			cifs_dbg(VFS, "corrupt dentry\n");
 			rcu_read_unlock();
 			kfree(full_path);
 			return NULL;
@@ -137,8 +137,8 @@ cifs_bp_rename_retry:
 	}
 	rcu_read_unlock();
 	if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) {
-		cFYI(1, "did not end path lookup where expected. namelen=%d "
-			"dfsplen=%d", namelen, dfsplen);
+		cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n",
+			 namelen, dfsplen);
 		/* presumably this is only possible if racing with a rename
 		of one of the parent directories  (we can not lock the dentries
 		above us to prevent this, but retrying should be harmless) */
@@ -178,7 +178,7 @@ check_name(struct dentry *direntry)
 	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) {
 		for (i = 0; i < direntry->d_name.len; i++) {
 			if (direntry->d_name.name[i] == '\\') {
-				cFYI(1, "Invalid file name");
+				cifs_dbg(FYI, "Invalid file name\n");
 				return -EINVAL;
 			}
 		}
@@ -291,7 +291,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
 	else if ((oflags & O_CREAT) == O_CREAT)
 		disposition = FILE_OPEN_IF;
 	else
-		cFYI(1, "Create flag not set in create function");
+		cifs_dbg(FYI, "Create flag not set in create function\n");
 
 	/*
 	 * BB add processing to set equivalent of mode - e.g. via CreateX with
@@ -323,7 +323,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
 			       desired_access, create_options, fid, oplock,
 			       buf, cifs_sb);
 	if (rc) {
-		cFYI(1, "cifs_create returned 0x%x", rc);
+		cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
 		goto out;
 	}
 
@@ -342,14 +342,14 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
 
 		*created |= FILE_CREATED;
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
-			args.uid = (__u64) current_fsuid();
+			args.uid = current_fsuid();
 			if (inode->i_mode & S_ISGID)
-				args.gid = (__u64) inode->i_gid;
+				args.gid = inode->i_gid;
 			else
-				args.gid = (__u64) current_fsgid();
+				args.gid = current_fsgid();
 		} else {
-			args.uid = NO_CHANGE_64;
-			args.gid = NO_CHANGE_64;
+			args.uid = INVALID_UID; /* no change */
+			args.gid = INVALID_GID; /* no change */
 		}
 		CIFSSMBUnixSetFileInfo(xid, tcon, &args, fid->netfid,
 				       current->tgid);
@@ -389,7 +389,8 @@ cifs_create_get_file_info:
 
 cifs_create_set_dentry:
 	if (rc != 0) {
-		cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
+		cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n",
+			 rc);
 		if (server->ops->close)
 			server->ops->close(xid, tcon, fid);
 		goto out;
@@ -452,12 +453,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
 
 	xid = get_xid();
 
-	cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p",
-	     inode, direntry->d_name.name, direntry);
+	cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n",
+		 inode, direntry->d_name.name, direntry);
 
 	tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
-	if (IS_ERR(tlink))
+	if (IS_ERR(tlink)) {
+		rc = PTR_ERR(tlink);
 		goto out_free_xid;
+	}
 
 	tcon = tlink_tcon(tlink);
 	server = tcon->ses->server;
@@ -518,8 +521,8 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
 	__u32 oplock;
 	int created = FILE_CREATED;
 
-	cFYI(1, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p",
-	     inode, direntry->d_name.name, direntry);
+	cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n",
+		 inode, direntry->d_name.name, direntry);
 
 	tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
 	rc = PTR_ERR(tlink);
@@ -588,11 +591,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 			.device	= device_number,
 		};
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
-			args.uid = (__u64) current_fsuid();
-			args.gid = (__u64) current_fsgid();
+			args.uid = current_fsuid();
+			args.gid = current_fsgid();
 		} else {
-			args.uid = NO_CHANGE_64;
-			args.gid = NO_CHANGE_64;
+			args.uid = INVALID_UID; /* no change */
+			args.gid = INVALID_GID; /* no change */
 		}
 		rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,
 					    cifs_sb->local_nls,
@@ -613,7 +616,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 		goto mknod_out;
 
 
-	cFYI(1, "sfu compat create special file");
+	cifs_dbg(FYI, "sfu compat create special file\n");
 
 	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
 	if (buf == NULL) {
@@ -688,8 +691,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 
 	xid = get_xid();
 
-	cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p",
-	      parent_dir_inode, direntry->d_name.name, direntry);
+	cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n",
+		 parent_dir_inode, direntry->d_name.name, direntry);
 
 	/* check whether path exists */
 
@@ -715,11 +718,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	}
 
 	if (direntry->d_inode != NULL) {
-		cFYI(1, "non-NULL inode in lookup");
+		cifs_dbg(FYI, "non-NULL inode in lookup\n");
 	} else {
-		cFYI(1, "NULL inode in lookup");
+		cifs_dbg(FYI, "NULL inode in lookup\n");
 	}
-	cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode);
+	cifs_dbg(FYI, "Full path: %s inode = 0x%p\n",
+		 full_path, direntry->d_inode);
 
 	if (pTcon->unix_ext) {
 		rc = cifs_get_inode_info_unix(&newInode, full_path,
@@ -742,7 +746,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	/*	if it was once a directory (but how can we tell?) we could do
 		shrink_dcache_parent(direntry); */
 	} else if (rc != -EACCES) {
-		cERROR(1, "Unexpected lookup error %d", rc);
+		cifs_dbg(VFS, "Unexpected lookup error %d\n", rc);
 		/* We special case check for Access Denied - since that
 		is a common return code */
 	}
@@ -807,7 +811,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
 {
 	int rc = 0;
 
-	cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name);
+	cifs_dbg(FYI, "In cifs d_delete, name = %s\n", direntry->d_name.name);
 
 	return rc;
 }     */
@@ -818,8 +822,7 @@ const struct dentry_operations cifs_dentry_ops = {
 /* d_delete:       cifs_d_delete,      */ /* not needed except for debugging */
 };
 
-static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *q)
+static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
 {
 	struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
 	unsigned long hash;
@@ -834,12 +837,10 @@ static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
 	return 0;
 }
 
-static int cifs_ci_compare(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int cifs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
-	struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls;
+	struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls;
 
 	if ((name->len == len) &&
 	    (nls_strnicmp(codepage, name->name, str, len) == 0))
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 1d2d91d9bf6..7ede7306599 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -34,7 +34,7 @@
 
 /**
  * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address.
- * @unc: UNC path specifying the server
+ * @unc: UNC path specifying the server (with '/' as delimiter)
  * @ip_addr: Where to return the IP address.
  *
  * The IP address will be returned in string form, and the caller is
@@ -55,7 +55,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 
 	len = strlen(unc);
 	if (len < 3) {
-		cFYI(1, "%s: unc is too short: %s", __func__, unc);
+		cifs_dbg(FYI, "%s: unc is too short: %s\n", __func__, unc);
 		return -EINVAL;
 	}
 
@@ -64,12 +64,12 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 	hostname = unc + 2;
 
 	/* Search for server name delimiter */
-	sep = memchr(hostname, '\\', len);
+	sep = memchr(hostname, '/', len);
 	if (sep)
 		len = sep - hostname;
 	else
-		cFYI(1, "%s: probably server name is whole unc: %s",
-		     __func__, unc);
+		cifs_dbg(FYI, "%s: probably server name is whole unc: %s\n",
+			 __func__, unc);
 
 	/* Try to interpret hostname as an IPv4 or IPv6 address */
 	rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len);
@@ -79,11 +79,11 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 	/* Perform the upcall */
 	rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
 	if (rc < 0)
-		cFYI(1, "%s: unable to resolve: %*.*s",
-			__func__, len, len, hostname);
+		cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n",
+			 __func__, len, len, hostname);
 	else
-		cFYI(1, "%s: resolved: %*.*s to %s",
-		     __func__, len, len, hostname, *ip_addr);
+		cifs_dbg(FYI, "%s: resolved: %*.*s to %s\n",
+			 __func__, len, len, hostname, *ip_addr);
 	return rc;
 
 name_is_IP_address:
@@ -92,7 +92,8 @@ name_is_IP_address:
 		return -ENOMEM;
 	memcpy(name, hostname, len);
 	name[len] = 0;
-	cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name);
+	cifs_dbg(FYI, "%s: unc is IP, skipping dns upcall: %s\n",
+		 __func__, name);
 	*ip_addr = name;
 	return 0;
 }
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 9c7ecdccf2f..ce8b7f677c5 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -49,7 +49,7 @@
 static struct dentry *cifs_get_parent(struct dentry *dentry)
 {
 	/* BB need to add code here eventually to enable export via NFSD */
-	cFYI(1, "get parent for %p", dentry);
+	cifs_dbg(FYI, "get parent for %p\n", dentry);
 	return ERR_PTR(-EACCES);
 }
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8ea6ca50a66..91d8629e69a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -43,6 +43,7 @@
 #include "cifs_fs_sb.h"
 #include "fscache.h"
 
+
 static inline int cifs_convert_flags(unsigned int flags)
 {
 	if ((flags & O_ACCMODE) == O_RDONLY)
@@ -72,10 +73,14 @@ static u32 cifs_posix_convert_flags(unsigned int flags)
 	else if ((flags & O_ACCMODE) == O_RDWR)
 		posix_flags = SMB_O_RDWR;
 
-	if (flags & O_CREAT)
+	if (flags & O_CREAT) {
 		posix_flags |= SMB_O_CREAT;
-	if (flags & O_EXCL)
-		posix_flags |= SMB_O_EXCL;
+		if (flags & O_EXCL)
+			posix_flags |= SMB_O_EXCL;
+	} else if (flags & O_EXCL)
+		cifs_dbg(FYI, "Application %s pid %d has incorrectly set O_EXCL flag but not O_CREAT on file open. Ignoring O_EXCL\n",
+			 current->comm, current->tgid);
+
 	if (flags & O_TRUNC)
 		posix_flags |= SMB_O_TRUNC;
 	/* be safe and imply O_SYNC for O_DSYNC */
@@ -117,7 +122,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	struct tcon_link *tlink;
 	struct cifs_tcon *tcon;
 
-	cFYI(1, "posix open %s", full_path);
+	cifs_dbg(FYI, "posix open %s\n", full_path);
 
 	presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
 	if (presp_data == NULL)
@@ -294,13 +299,15 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
 	INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
 	mutex_init(&cfile->fh_mutex);
 
+	cifs_sb_active(inode->i_sb);
+
 	/*
 	 * If the server returned a read oplock and we have mandatory brlocks,
 	 * set oplock level to None.
 	 */
 	if (oplock == server->vals->oplock_read &&
 						cifs_has_mand_locks(cinode)) {
-		cFYI(1, "Reset oplock val from read to None due to mand locks");
+		cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n");
 		oplock = 0;
 	}
 
@@ -343,7 +350,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 	struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
 	struct TCP_Server_Info *server = tcon->ses->server;
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
-	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct super_block *sb = inode->i_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifsLockInfo *li, *tmp;
 	struct cifs_fid fid;
 	struct cifs_pending_open open;
@@ -365,8 +373,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 	list_del(&cifs_file->tlist);
 
 	if (list_empty(&cifsi->openFileList)) {
-		cFYI(1, "closing last open instance for inode %p",
-			cifs_file->dentry->d_inode);
+		cifs_dbg(FYI, "closing last open instance for inode %p\n",
+			 cifs_file->dentry->d_inode);
 		/*
 		 * In strict cache mode we need invalidate mapping on the last
 		 * close  because it may cause a error when we open this file
@@ -408,6 +416,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 
 	cifs_put_tlink(cifs_file->tlink);
 	dput(cifs_file->dentry);
+	cifs_sb_deactive(sb);
 	kfree(cifs_file);
 }
 
@@ -444,7 +453,7 @@ int cifs_open(struct inode *inode, struct file *file)
 		goto out;
 	}
 
-	cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
+	cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n",
 		 inode, file->f_flags, full_path);
 
 	if (server->oplocks)
@@ -460,16 +469,13 @@ int cifs_open(struct inode *inode, struct file *file)
 				cifs_sb->mnt_file_mode /* ignored */,
 				file->f_flags, &oplock, &fid.netfid, xid);
 		if (rc == 0) {
-			cFYI(1, "posix open succeeded");
+			cifs_dbg(FYI, "posix open succeeded\n");
 			posix_open_ok = true;
 		} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			if (tcon->ses->serverNOS)
-				cERROR(1, "server %s of type %s returned"
-					   " unexpected error on SMB posix open"
-					   ", disabling posix open support."
-					   " Check if server update available.",
-					   tcon->ses->serverName,
-					   tcon->ses->serverNOS);
+				cifs_dbg(VFS, "server %s of type %s returned unexpected error on SMB posix open, disabling posix open support. Check if server update available.\n",
+					 tcon->ses->serverName,
+					 tcon->ses->serverNOS);
 			tcon->broken_posix_open = true;
 		} else if ((rc != -EIO) && (rc != -EREMOTE) &&
 			 (rc != -EOPNOTSUPP)) /* path not found or net err */
@@ -515,8 +521,8 @@ int cifs_open(struct inode *inode, struct file *file)
 		 */
 		struct cifs_unix_set_info_args args = {
 			.mode	= inode->i_mode,
-			.uid	= NO_CHANGE_64,
-			.gid	= NO_CHANGE_64,
+			.uid	= INVALID_UID, /* no change */
+			.gid	= INVALID_GID, /* no change */
 			.ctime	= NO_CHANGE_64,
 			.atime	= NO_CHANGE_64,
 			.mtime	= NO_CHANGE_64,
@@ -611,8 +617,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 		return rc;
 	}
 
-	cFYI(1, "inode = 0x%p file flags 0x%x for %s", inode, cfile->f_flags,
-	     full_path);
+	cifs_dbg(FYI, "inode = 0x%p file flags 0x%x for %s\n",
+		 inode, cfile->f_flags, full_path);
 
 	if (tcon->ses->server->oplocks)
 		oplock = REQ_OPLOCK;
@@ -633,7 +639,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 				     cifs_sb->mnt_file_mode /* ignored */,
 				     oflags, &oplock, &fid.netfid, xid);
 		if (rc == 0) {
-			cFYI(1, "posix reopen succeeded");
+			cifs_dbg(FYI, "posix reopen succeeded\n");
 			goto reopen_success;
 		}
 		/*
@@ -662,8 +668,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 			       NULL, cifs_sb);
 	if (rc) {
 		mutex_unlock(&cfile->fh_mutex);
-		cFYI(1, "cifs_reopen returned 0x%x", rc);
-		cFYI(1, "oplock: %d", oplock);
+		cifs_dbg(FYI, "cifs_reopen returned 0x%x\n", rc);
+		cifs_dbg(FYI, "oplock: %d\n", oplock);
 		goto reopen_error_exit;
 	}
 
@@ -719,7 +725,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 	struct TCP_Server_Info *server;
 	char *buf;
 
-	cFYI(1, "Closedir inode = 0x%p", inode);
+	cifs_dbg(FYI, "Closedir inode = 0x%p\n", inode);
 
 	if (cfile == NULL)
 		return rc;
@@ -728,7 +734,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 	tcon = tlink_tcon(cfile->tlink);
 	server = tcon->ses->server;
 
-	cFYI(1, "Freeing private data in close dir");
+	cifs_dbg(FYI, "Freeing private data in close dir\n");
 	spin_lock(&cifs_file_list_lock);
 	if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
 		cfile->invalidHandle = true;
@@ -737,7 +743,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 			rc = server->ops->close_dir(xid, tcon, &cfile->fid);
 		else
 			rc = -ENOSYS;
-		cFYI(1, "Closing uncompleted readdir with rc %d", rc);
+		cifs_dbg(FYI, "Closing uncompleted readdir with rc %d\n", rc);
 		/* not much we can do if it fails anyway, ignore rc */
 		rc = 0;
 	} else
@@ -745,7 +751,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 
 	buf = cfile->srch_inf.ntwrk_buf_start;
 	if (buf) {
-		cFYI(1, "closedir free smb buf in srch struct");
+		cifs_dbg(FYI, "closedir free smb buf in srch struct\n");
 		cfile->srch_inf.ntwrk_buf_start = NULL;
 		if (cfile->srch_inf.smallBuf)
 			cifs_small_buf_release(buf);
@@ -947,7 +953,7 @@ static int
 cifs_posix_lock_test(struct file *file, struct file_lock *flock)
 {
 	int rc = 0;
-	struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
+	struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
 	unsigned char saved_type = flock->fl_type;
 
 	if ((flock->fl_flags & FL_POSIX) == 0)
@@ -974,7 +980,7 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
 static int
 cifs_posix_lock_set(struct file *file, struct file_lock *flock)
 {
-	struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode);
+	struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
 	int rc = 1;
 
 	if ((flock->fl_flags & FL_POSIX) == 0)
@@ -993,7 +999,7 @@ try_again:
 		rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next);
 		if (!rc)
 			goto try_again;
-		locks_delete_block(flock);
+		posix_unblock_lock(flock);
 	}
 	return rc;
 }
@@ -1086,6 +1092,7 @@ struct lock_to_push {
 static int
 cifs_push_posix_locks(struct cifsFileInfo *cfile)
 {
+	struct inode *inode = cfile->dentry->d_inode;
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	struct file_lock *flock, **before;
 	unsigned int count = 0, i = 0;
@@ -1096,12 +1103,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 
 	xid = get_xid();
 
-	lock_flocks();
-	cifs_for_each_lock(cfile->dentry->d_inode, before) {
+	spin_lock(&inode->i_lock);
+	cifs_for_each_lock(inode, before) {
 		if ((*before)->fl_flags & FL_POSIX)
 			count++;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	INIT_LIST_HEAD(&locks_to_send);
 
@@ -1120,8 +1127,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 	}
 
 	el = locks_to_send.next;
-	lock_flocks();
-	cifs_for_each_lock(cfile->dentry->d_inode, before) {
+	spin_lock(&inode->i_lock);
+	cifs_for_each_lock(inode, before) {
 		flock = *before;
 		if ((flock->fl_flags & FL_POSIX) == 0)
 			continue;
@@ -1130,7 +1137,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 			 * The list ended. We don't have enough allocated
 			 * structures - something is really wrong.
 			 */
-			cERROR(1, "Can't push all brlocks!");
+			cifs_dbg(VFS, "Can't push all brlocks!\n");
 			break;
 		}
 		length = 1 + flock->fl_end - flock->fl_start;
@@ -1146,7 +1153,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 		lck->offset = flock->fl_start;
 		el = el->next;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
 		int stored_rc;
@@ -1203,47 +1210,46 @@ cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock,
 		bool *wait_flag, struct TCP_Server_Info *server)
 {
 	if (flock->fl_flags & FL_POSIX)
-		cFYI(1, "Posix");
+		cifs_dbg(FYI, "Posix\n");
 	if (flock->fl_flags & FL_FLOCK)
-		cFYI(1, "Flock");
+		cifs_dbg(FYI, "Flock\n");
 	if (flock->fl_flags & FL_SLEEP) {
-		cFYI(1, "Blocking lock");
+		cifs_dbg(FYI, "Blocking lock\n");
 		*wait_flag = true;
 	}
 	if (flock->fl_flags & FL_ACCESS)
-		cFYI(1, "Process suspended by mandatory locking - "
-			"not implemented yet");
+		cifs_dbg(FYI, "Process suspended by mandatory locking - not implemented yet\n");
 	if (flock->fl_flags & FL_LEASE)
-		cFYI(1, "Lease on file - not implemented yet");
+		cifs_dbg(FYI, "Lease on file - not implemented yet\n");
 	if (flock->fl_flags &
 	    (~(FL_POSIX | FL_FLOCK | FL_SLEEP |
 	       FL_ACCESS | FL_LEASE | FL_CLOSE)))
-		cFYI(1, "Unknown lock flags 0x%x", flock->fl_flags);
+		cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags);
 
 	*type = server->vals->large_lock_type;
 	if (flock->fl_type == F_WRLCK) {
-		cFYI(1, "F_WRLCK ");
+		cifs_dbg(FYI, "F_WRLCK\n");
 		*type |= server->vals->exclusive_lock_type;
 		*lock = 1;
 	} else if (flock->fl_type == F_UNLCK) {
-		cFYI(1, "F_UNLCK");
+		cifs_dbg(FYI, "F_UNLCK\n");
 		*type |= server->vals->unlock_lock_type;
 		*unlock = 1;
 		/* Check if unlock includes more than one lock range */
 	} else if (flock->fl_type == F_RDLCK) {
-		cFYI(1, "F_RDLCK");
+		cifs_dbg(FYI, "F_RDLCK\n");
 		*type |= server->vals->shared_lock_type;
 		*lock = 1;
 	} else if (flock->fl_type == F_EXLCK) {
-		cFYI(1, "F_EXLCK");
+		cifs_dbg(FYI, "F_EXLCK\n");
 		*type |= server->vals->exclusive_lock_type;
 		*lock = 1;
 	} else if (flock->fl_type == F_SHLCK) {
-		cFYI(1, "F_SHLCK");
+		cifs_dbg(FYI, "F_SHLCK\n");
 		*type |= server->vals->shared_lock_type;
 		*lock = 1;
 	} else
-		cFYI(1, "Unknown type of lock");
+		cifs_dbg(FYI, "Unknown type of lock\n");
 }
 
 static int
@@ -1286,8 +1292,8 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
 					    type, 0, 1, false);
 		flock->fl_type = F_UNLCK;
 		if (rc != 0)
-			cERROR(1, "Error unlocking previously locked "
-				  "range %d during test of lock", rc);
+			cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n",
+				 rc);
 		return 0;
 	}
 
@@ -1306,8 +1312,8 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
 			type | server->vals->shared_lock_type, 0, 1, false);
 		flock->fl_type = F_RDLCK;
 		if (rc != 0)
-			cERROR(1, "Error unlocking previously locked "
-				  "range %d during test of lock", rc);
+			cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n",
+				 rc);
 	} else
 		flock->fl_type = F_WRLCK;
 
@@ -1498,8 +1504,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
 		if (!CIFS_I(inode)->clientCanCacheAll &&
 					CIFS_I(inode)->clientCanCacheRead) {
 			cifs_invalidate_mapping(inode);
-			cFYI(1, "Set no oplock for inode=%p due to mand locks",
-			     inode);
+			cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n",
+				 inode);
 			CIFS_I(inode)->clientCanCacheRead = false;
 		}
 
@@ -1536,9 +1542,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
 	rc = -EACCES;
 	xid = get_xid();
 
-	cFYI(1, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld "
-		"end: %lld", cmd, flock->fl_flags, flock->fl_type,
-		flock->fl_start, flock->fl_end);
+	cifs_dbg(FYI, "Lock parm: 0x%x flockflags: 0x%x flocktype: 0x%x start: %lld end: %lld\n",
+		 cmd, flock->fl_flags, flock->fl_type,
+		 flock->fl_start, flock->fl_end);
 
 	cfile = (struct cifsFileInfo *)file->private_data;
 	tcon = tlink_tcon(cfile->tlink);
@@ -1548,7 +1554,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	netfid = cfile->fid.netfid;
-	cinode = CIFS_I(file->f_path.dentry->d_inode);
+	cinode = CIFS_I(file_inode(file));
 
 	if (cap_unix(tcon->ses) &&
 	    (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
@@ -1610,8 +1616,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
 
 	cifs_sb = CIFS_SB(dentry->d_sb);
 
-	cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
-	     *offset, dentry->d_name.name);
+	cifs_dbg(FYI, "write %zd bytes to offset %lld of %s\n",
+		 write_size, *offset, dentry->d_name.name);
 
 	tcon = tlink_tcon(open_file->tlink);
 	server = tcon->ses->server;
@@ -1693,7 +1699,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
 	   are always at the end of the list but since the first entry might
 	   have a close pending, we go through the whole list */
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-		if (fsuid_only && open_file->uid != current_fsuid())
+		if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
 			continue;
 		if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
 			if (!open_file->invalidHandle) {
@@ -1726,7 +1732,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode,
 	it being zero) during stress testcases so we need to check for it */
 
 	if (cifs_inode == NULL) {
-		cERROR(1, "Null inode passed to cifs_writeable_file");
+		cifs_dbg(VFS, "Null inode passed to cifs_writeable_file\n");
 		dump_stack();
 		return NULL;
 	}
@@ -1746,7 +1752,7 @@ refind_writable:
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
 		if (!any_available && open_file->pid != current->tgid)
 			continue;
-		if (fsuid_only && open_file->uid != current_fsuid())
+		if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
 			continue;
 		if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
 			if (!open_file->invalidHandle) {
@@ -1838,7 +1844,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 		else if (bytes_written < 0)
 			rc = bytes_written;
 	} else {
-		cFYI(1, "No writeable filehandles for inode");
+		cifs_dbg(FYI, "No writeable filehandles for inode\n");
 		rc = -EIO;
 	}
 
@@ -2005,7 +2011,7 @@ retry:
 			wdata->cfile = find_writable_file(CIFS_I(mapping->host),
 							  false);
 			if (!wdata->cfile) {
-				cERROR(1, "No writable handles for inode");
+				cifs_dbg(VFS, "No writable handles for inode\n");
 				rc = -EBADF;
 				break;
 			}
@@ -2066,7 +2072,7 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
 /* BB add check for wbc flags */
 	page_cache_get(page);
 	if (!PageUptodate(page))
-		cFYI(1, "ppw - page not up to date");
+		cifs_dbg(FYI, "ppw - page not up to date\n");
 
 	/*
 	 * Set the "writeback" flag, and clear "dirty" in the radix tree.
@@ -2117,7 +2123,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	else
 		pid = current->tgid;
 
-	cFYI(1, "write_end for page %p from pos %lld with %d bytes",
+	cifs_dbg(FYI, "write_end for page %p from pos %lld with %d bytes\n",
 		 page, pos, copied);
 
 	if (PageChecked(page)) {
@@ -2171,7 +2177,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
 	struct cifs_tcon *tcon;
 	struct TCP_Server_Info *server;
 	struct cifsFileInfo *smbfile = file->private_data;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
 	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
@@ -2181,13 +2187,13 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
 
 	xid = get_xid();
 
-	cFYI(1, "Sync file - name: %s datasync: 0x%x",
-		file->f_path.dentry->d_name.name, datasync);
+	cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n",
+		 file->f_path.dentry->d_name.name, datasync);
 
 	if (!CIFS_I(inode)->clientCanCacheRead) {
 		rc = cifs_invalidate_mapping(inode);
 		if (rc) {
-			cFYI(1, "rc: %d during invalidate phase", rc);
+			cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc);
 			rc = 0; /* don't care about it in fsync */
 		}
 	}
@@ -2223,8 +2229,8 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 
 	xid = get_xid();
 
-	cFYI(1, "Sync file - name: %s datasync: 0x%x",
-		file->f_path.dentry->d_name.name, datasync);
+	cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n",
+		 file->f_path.dentry->d_name.name, datasync);
 
 	tcon = tlink_tcon(smbfile->tlink);
 	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
@@ -2246,13 +2252,13 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
  */
 int cifs_flush(struct file *file, fl_owner_t id)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	int rc = 0;
 
 	if (file->f_mode & FMODE_WRITE)
 		rc = filemap_write_and_wait(inode->i_mapping);
 
-	cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
+	cifs_dbg(FYI, "Flush inode %p file %p rc %d\n", inode, file, rc);
 
 	return rc;
 }
@@ -2480,7 +2486,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t written;
 	struct inode *inode;
 
-	inode = iocb->ki_filp->f_path.dentry->d_inode;
+	inode = file_inode(iocb->ki_filp);
 
 	/*
 	 * BB - optimize the way when signing is disabled. We can drop this
@@ -2510,8 +2516,6 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
 
 	BUG_ON(iocb->ki_pos != pos);
 
-	sb_start_write(inode->i_sb);
-
 	/*
 	 * We need to hold the sem to be sure nobody modifies lock list
 	 * with a brlock that prevents writing.
@@ -2535,7 +2539,6 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
 	}
 
 	up_read(&cinode->lock_sem);
-	sb_end_write(inode->i_sb);
 	return rc;
 }
 
@@ -2543,7 +2546,7 @@ ssize_t
 cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 		   unsigned long nr_segs, loff_t pos)
 {
-	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(iocb->ki_filp);
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifsFileInfo *cfile = (struct cifsFileInfo *)
@@ -2572,8 +2575,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 		 * an old data.
 		 */
 		cifs_invalidate_mapping(inode);
-		cFYI(1, "Set no oplock for inode=%p after a write operation",
-		     inode);
+		cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
+			 inode);
 		cinode->clientCanCacheRead = false;
 	}
 	return written;
@@ -2749,15 +2752,15 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
 			/* enough data to fill the page */
 			iov.iov_base = kmap(page);
 			iov.iov_len = PAGE_SIZE;
-			cFYI(1, "%u: iov_base=%p iov_len=%zu",
-				i, iov.iov_base, iov.iov_len);
+			cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n",
+				 i, iov.iov_base, iov.iov_len);
 			len -= PAGE_SIZE;
 		} else if (len > 0) {
 			/* enough for partial page, fill and zero the rest */
 			iov.iov_base = kmap(page);
 			iov.iov_len = len;
-			cFYI(1, "%u: iov_base=%p iov_len=%zu",
-				i, iov.iov_base, iov.iov_len);
+			cifs_dbg(FYI, "%u: iov_base=%p iov_len=%zu\n",
+				 i, iov.iov_base, iov.iov_len);
 			memset(iov.iov_base + len, '\0', PAGE_SIZE - len);
 			rdata->tailsz = len;
 			len = 0;
@@ -2817,7 +2820,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
 		pid = current->tgid;
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
-		cFYI(1, "attempting read on write only file instance");
+		cifs_dbg(FYI, "attempting read on write only file instance\n");
 
 	do {
 		cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
@@ -2915,7 +2918,7 @@ ssize_t
 cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
 		  unsigned long nr_segs, loff_t pos)
 {
-	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(iocb->ki_filp);
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifsFileInfo *cfile = (struct cifsFileInfo *)
@@ -2996,7 +2999,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
 		pid = current->tgid;
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
-		cFYI(1, "attempting read on write only file instance");
+		cifs_dbg(FYI, "attempting read on write only file instance\n");
 
 	for (total_read = 0, cur_offset = read_data; read_size > total_read;
 	     total_read += bytes_read, cur_offset += bytes_read) {
@@ -3063,7 +3066,7 @@ static struct vm_operations_struct cifs_file_vm_ops = {
 int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	int rc, xid;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 
 	xid = get_xid();
 
@@ -3087,7 +3090,8 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	xid = get_xid();
 	rc = cifs_revalidate_file(file);
 	if (rc) {
-		cFYI(1, "Validation prior to mmap failed, error=%d", rc);
+		cifs_dbg(FYI, "Validation prior to mmap failed, error=%d\n",
+			 rc);
 		free_xid(xid);
 		return rc;
 	}
@@ -3140,7 +3144,7 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
 	/* determine the eof that the server (probably) has */
 	eof = CIFS_I(rdata->mapping->host)->server_eof;
 	eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
-	cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index);
+	cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
 
 	rdata->tailsz = PAGE_CACHE_SIZE;
 	for (i = 0; i < nr_pages; i++) {
@@ -3150,15 +3154,15 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
 			/* enough data to fill the page */
 			iov.iov_base = kmap(page);
 			iov.iov_len = PAGE_CACHE_SIZE;
-			cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
-				i, page->index, iov.iov_base, iov.iov_len);
+			cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n",
+				 i, page->index, iov.iov_base, iov.iov_len);
 			len -= PAGE_CACHE_SIZE;
 		} else if (len > 0) {
 			/* enough for partial page, fill and zero the rest */
 			iov.iov_base = kmap(page);
 			iov.iov_len = len;
-			cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
-				i, page->index, iov.iov_base, iov.iov_len);
+			cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n",
+				 i, page->index, iov.iov_base, iov.iov_len);
 			memset(iov.iov_base + len,
 				'\0', PAGE_CACHE_SIZE - len);
 			rdata->tailsz = len;
@@ -3238,8 +3242,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	rc = 0;
 	INIT_LIST_HEAD(&tmplist);
 
-	cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file,
-		mapping, num_pages);
+	cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n",
+		 __func__, file, mapping, num_pages);
 
 	/*
 	 * Start with the page at end of list and move it to private
@@ -3356,7 +3360,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 	int rc;
 
 	/* Is the page cached? */
-	rc = cifs_readpage_from_fscache(file->f_path.dentry->d_inode, page);
+	rc = cifs_readpage_from_fscache(file_inode(file), page);
 	if (rc == 0)
 		goto read_complete;
 
@@ -3369,10 +3373,10 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 	if (rc < 0)
 		goto io_error;
 	else
-		cFYI(1, "Bytes read %d", rc);
+		cifs_dbg(FYI, "Bytes read %d\n", rc);
 
-	file->f_path.dentry->d_inode->i_atime =
-		current_fs_time(file->f_path.dentry->d_inode->i_sb);
+	file_inode(file)->i_atime =
+		current_fs_time(file_inode(file)->i_sb);
 
 	if (PAGE_CACHE_SIZE > rc)
 		memset(read_data + rc, 0, PAGE_CACHE_SIZE - rc);
@@ -3381,7 +3385,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 	SetPageUptodate(page);
 
 	/* send this page to the cache */
-	cifs_readpage_to_fscache(file->f_path.dentry->d_inode, page);
+	cifs_readpage_to_fscache(file_inode(file), page);
 
 	rc = 0;
 
@@ -3407,7 +3411,7 @@ static int cifs_readpage(struct file *file, struct page *page)
 		return rc;
 	}
 
-	cFYI(1, "readpage %p at offset %d 0x%x",
+	cifs_dbg(FYI, "readpage %p at offset %d 0x%x\n",
 		 page, (int)offset, (int)offset);
 
 	rc = cifs_readpage_worker(file, page, &offset);
@@ -3474,7 +3478,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 	struct page *page;
 	int rc = 0;
 
-	cFYI(1, "write_begin from %lld len %d", (long long)pos, len);
+	cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len);
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
@@ -3543,11 +3547,12 @@ static int cifs_release_page(struct page *page, gfp_t gfp)
 	return cifs_fscache_release_page(page, gfp);
 }
 
-static void cifs_invalidate_page(struct page *page, unsigned long offset)
+static void cifs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
 
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
 }
 
@@ -3563,7 +3568,7 @@ static int cifs_launder_page(struct page *page)
 		.range_end = range_end,
 	};
 
-	cFYI(1, "Launder page: %p", page);
+	cifs_dbg(FYI, "Launder page: %p\n", page);
 
 	if (clear_page_dirty_for_io(page))
 		rc = cifs_writepage_locked(page, &wbc);
@@ -3583,8 +3588,8 @@ void cifs_oplock_break(struct work_struct *work)
 
 	if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead &&
 						cifs_has_mand_locks(cinode)) {
-		cFYI(1, "Reset oplock to None for inode=%p due to mand locks",
-		     inode);
+		cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n",
+			 inode);
 		cinode->clientCanCacheRead = false;
 	}
 
@@ -3599,12 +3604,12 @@ void cifs_oplock_break(struct work_struct *work)
 			mapping_set_error(inode->i_mapping, rc);
 			cifs_invalidate_mapping(inode);
 		}
-		cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
+		cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc);
 	}
 
 	rc = cifs_push_locks(cfile);
 	if (rc)
-		cERROR(1, "Push locks rc = %d", rc);
+		cifs_dbg(VFS, "Push locks rc = %d\n", rc);
 
 	/*
 	 * releasing stale oplock after recent reconnect of smb session using
@@ -3615,7 +3620,7 @@ void cifs_oplock_break(struct work_struct *work)
 	if (!cfile->oplock_break_cancelled) {
 		rc = tcon->ses->server->ops->oplock_response(tcon, &cfile->fid,
 							     cinode);
-		cFYI(1, "Oplock release rc = %d", rc);
+		cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
 	}
 }
 
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 42e5363b410..2f4bc5a5805 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -28,14 +28,14 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
 	server->fscache =
 		fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
 				&cifs_fscache_server_index_def, server);
-	cFYI(1, "%s: (0x%p/0x%p)", __func__, server,
-			server->fscache);
+	cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+		 __func__, server, server->fscache);
 }
 
 void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server)
 {
-	cFYI(1, "%s: (0x%p/0x%p)", __func__, server,
-			server->fscache);
+	cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+		 __func__, server, server->fscache);
 	fscache_relinquish_cookie(server->fscache, 0);
 	server->fscache = NULL;
 }
@@ -47,13 +47,13 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
 	tcon->fscache =
 		fscache_acquire_cookie(server->fscache,
 				&cifs_fscache_super_index_def, tcon);
-	cFYI(1, "%s: (0x%p/0x%p)", __func__, server->fscache,
-			tcon->fscache);
+	cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+		 __func__, server->fscache, tcon->fscache);
 }
 
 void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
 {
-	cFYI(1, "%s: (0x%p)", __func__, tcon->fscache);
+	cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache);
 	fscache_relinquish_cookie(tcon->fscache, 0);
 	tcon->fscache = NULL;
 }
@@ -70,8 +70,8 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
 		cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
 				&cifs_fscache_inode_object_def, cifsi);
-		cFYI(1, "%s: got FH cookie (0x%p/0x%p)", __func__,
-				tcon->fscache, cifsi->fscache);
+		cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n",
+			 __func__, tcon->fscache, cifsi->fscache);
 	}
 }
 
@@ -80,7 +80,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode)
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
 
 	if (cifsi->fscache) {
-		cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache);
+		cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
 		fscache_relinquish_cookie(cifsi->fscache, 0);
 		cifsi->fscache = NULL;
 	}
@@ -91,7 +91,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode)
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
 
 	if (cifsi->fscache) {
-		cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache);
+		cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache);
 		fscache_uncache_all_inode_pages(cifsi->fscache, inode);
 		fscache_relinquish_cookie(cifsi->fscache, 1);
 		cifsi->fscache = NULL;
@@ -120,8 +120,8 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
 					cifs_sb_master_tcon(cifs_sb)->fscache,
 					&cifs_fscache_inode_object_def,
 					cifsi);
-		cFYI(1, "%s: new cookie 0x%p oldcookie 0x%p",
-				__func__, cifsi->fscache, old);
+		cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n",
+			 __func__, cifsi->fscache, old);
 	}
 }
 
@@ -131,8 +131,8 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp)
 		struct inode *inode = page->mapping->host;
 		struct cifsInodeInfo *cifsi = CIFS_I(inode);
 
-		cFYI(1, "%s: (0x%p/0x%p)", __func__, page,
-				cifsi->fscache);
+		cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
+			 __func__, page, cifsi->fscache);
 		if (!fscache_maybe_release_page(cifsi->fscache, page, gfp))
 			return 0;
 	}
@@ -143,7 +143,7 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp)
 static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx,
 						int error)
 {
-	cFYI(1, "%s: (0x%p/%d)", __func__, page, error);
+	cifs_dbg(FYI, "%s: (0x%p/%d)\n", __func__, page, error);
 	if (!error)
 		SetPageUptodate(page);
 	unlock_page(page);
@@ -156,8 +156,8 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
 {
 	int ret;
 
-	cFYI(1, "%s: (fsc:%p, p:%p, i:0x%p", __func__,
-			CIFS_I(inode)->fscache, page, inode);
+	cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n",
+		 __func__, CIFS_I(inode)->fscache, page, inode);
 	ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page,
 					 cifs_readpage_from_fscache_complete,
 					 NULL,
@@ -165,15 +165,15 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
 	switch (ret) {
 
 	case 0: /* page found in fscache, read submitted */
-		cFYI(1, "%s: submitted", __func__);
+		cifs_dbg(FYI, "%s: submitted\n", __func__);
 		return ret;
 	case -ENOBUFS:	/* page won't be cached */
 	case -ENODATA:	/* page not in cache */
-		cFYI(1, "%s: %d", __func__, ret);
+		cifs_dbg(FYI, "%s: %d\n", __func__, ret);
 		return 1;
 
 	default:
-		cERROR(1, "unknown error ret = %d", ret);
+		cifs_dbg(VFS, "unknown error ret = %d\n", ret);
 	}
 	return ret;
 }
@@ -188,8 +188,8 @@ int __cifs_readpages_from_fscache(struct inode *inode,
 {
 	int ret;
 
-	cFYI(1, "%s: (0x%p/%u/0x%p)", __func__,
-			CIFS_I(inode)->fscache, *nr_pages, inode);
+	cifs_dbg(FYI, "%s: (0x%p/%u/0x%p)\n",
+		 __func__, CIFS_I(inode)->fscache, *nr_pages, inode);
 	ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping,
 					  pages, nr_pages,
 					  cifs_readpage_from_fscache_complete,
@@ -197,16 +197,16 @@ int __cifs_readpages_from_fscache(struct inode *inode,
 					  mapping_gfp_mask(mapping));
 	switch (ret) {
 	case 0:	/* read submitted to the cache for all pages */
-		cFYI(1, "%s: submitted", __func__);
+		cifs_dbg(FYI, "%s: submitted\n", __func__);
 		return ret;
 
 	case -ENOBUFS:	/* some pages are not cached and can't be */
 	case -ENODATA:	/* some pages are not cached */
-		cFYI(1, "%s: no page", __func__);
+		cifs_dbg(FYI, "%s: no page\n", __func__);
 		return 1;
 
 	default:
-		cFYI(1, "unknown error ret = %d", ret);
+		cifs_dbg(FYI, "unknown error ret = %d\n", ret);
 	}
 
 	return ret;
@@ -216,8 +216,8 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
 {
 	int ret;
 
-	cFYI(1, "%s: (fsc: %p, p: %p, i: %p)", __func__,
-			CIFS_I(inode)->fscache, page, inode);
+	cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n",
+		 __func__, CIFS_I(inode)->fscache, page, inode);
 	ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL);
 	if (ret != 0)
 		fscache_uncache_page(CIFS_I(inode)->fscache, page);
@@ -228,7 +228,7 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode)
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
 	struct fscache_cookie *cookie = cifsi->fscache;
 
-	cFYI(1, "%s: (0x%p/0x%p)", __func__, page, cookie);
+	cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie);
 	fscache_wait_on_page_write(cookie, page);
 	fscache_uncache_page(cookie, page);
 }
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index ed6208ff85a..20efd81266c 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -91,30 +91,32 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
 {
 	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 
-	cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid);
+	cifs_dbg(FYI, "%s: revalidating inode %llu\n",
+		 __func__, cifs_i->uniqueid);
 
 	if (inode->i_state & I_NEW) {
-		cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid);
+		cifs_dbg(FYI, "%s: inode %llu is new\n",
+			 __func__, cifs_i->uniqueid);
 		return;
 	}
 
 	/* don't bother with revalidation if we have an oplock */
 	if (cifs_i->clientCanCacheRead) {
-		cFYI(1, "%s: inode %llu is oplocked", __func__,
-			 cifs_i->uniqueid);
+		cifs_dbg(FYI, "%s: inode %llu is oplocked\n",
+			 __func__, cifs_i->uniqueid);
 		return;
 	}
 
 	 /* revalidate if mtime or size have changed */
 	if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
 	    cifs_i->server_eof == fattr->cf_eof) {
-		cFYI(1, "%s: inode %llu is unchanged", __func__,
-			 cifs_i->uniqueid);
+		cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
+			 __func__, cifs_i->uniqueid);
 		return;
 	}
 
-	cFYI(1, "%s: invalidating inode %llu mapping", __func__,
-		 cifs_i->uniqueid);
+	cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n",
+		 __func__, cifs_i->uniqueid);
 	cifs_i->invalid_mapping = true;
 }
 
@@ -169,7 +171,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 
 	if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
 		inode->i_flags |= S_AUTOMOUNT;
-	cifs_set_ops(inode);
+	if (inode->i_state & I_NEW)
+		cifs_set_ops(inode);
 }
 
 void
@@ -240,19 +243,29 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
 		/* safest to call it a file if we do not know */
 		fattr->cf_mode |= S_IFREG;
 		fattr->cf_dtype = DT_REG;
-		cFYI(1, "unknown type %d", le32_to_cpu(info->Type));
+		cifs_dbg(FYI, "unknown type %d\n", le32_to_cpu(info->Type));
 		break;
 	}
 
-	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
-		fattr->cf_uid = cifs_sb->mnt_uid;
-	else
-		fattr->cf_uid = le64_to_cpu(info->Uid);
-
-	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
-		fattr->cf_gid = cifs_sb->mnt_gid;
-	else
-		fattr->cf_gid = le64_to_cpu(info->Gid);
+	fattr->cf_uid = cifs_sb->mnt_uid;
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) {
+		u64 id = le64_to_cpu(info->Uid);
+		if (id < ((uid_t)-1)) {
+			kuid_t uid = make_kuid(&init_user_ns, id);
+			if (uid_valid(uid))
+				fattr->cf_uid = uid;
+		}
+	}
+	
+	fattr->cf_gid = cifs_sb->mnt_gid;
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) {
+		u64 id = le64_to_cpu(info->Gid);
+		if (id < ((gid_t)-1)) {
+			kgid_t gid = make_kgid(&init_user_ns, id);
+			if (gid_valid(gid))
+				fattr->cf_gid = gid;
+		}
+	}
 
 	fattr->cf_nlink = le64_to_cpu(info->Nlinks);
 }
@@ -269,7 +282,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	cFYI(1, "creating fake fattr for DFS referral");
+	cifs_dbg(FYI, "creating fake fattr for DFS referral\n");
 
 	memset(fattr, 0, sizeof(*fattr));
 	fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
@@ -289,7 +302,7 @@ cifs_get_file_info_unix(struct file *filp)
 	unsigned int xid;
 	FILE_UNIX_BASIC_INFO find_data;
 	struct cifs_fattr fattr;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifsFileInfo *cfile = filp->private_data;
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
@@ -319,7 +332,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 	struct tcon_link *tlink;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	cFYI(1, "Getting info on %s", full_path);
+	cifs_dbg(FYI, "Getting info on %s\n", full_path);
 
 	tlink = cifs_sb_tlink(cifs_sb);
 	if (IS_ERR(tlink))
@@ -345,7 +358,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
 		int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
 		if (tmprc)
-			cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+			cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc);
 	}
 
 	if (*pinode == NULL) {
@@ -412,7 +425,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 				 &buf_type);
 		if ((rc == 0) && (bytes_read >= 8)) {
 			if (memcmp("IntxBLK", pbuf, 8) == 0) {
-				cFYI(1, "Block device");
+				cifs_dbg(FYI, "Block device\n");
 				fattr->cf_mode |= S_IFBLK;
 				fattr->cf_dtype = DT_BLK;
 				if (bytes_read == 24) {
@@ -424,7 +437,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 					fattr->cf_rdev = MKDEV(mjr, mnr);
 				}
 			} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
-				cFYI(1, "Char device");
+				cifs_dbg(FYI, "Char device\n");
 				fattr->cf_mode |= S_IFCHR;
 				fattr->cf_dtype = DT_CHR;
 				if (bytes_read == 24) {
@@ -436,7 +449,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 					fattr->cf_rdev = MKDEV(mjr, mnr);
 				}
 			} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
-				cFYI(1, "Symlink");
+				cifs_dbg(FYI, "Symlink\n");
 				fattr->cf_mode |= S_IFLNK;
 				fattr->cf_dtype = DT_LNK;
 			} else {
@@ -487,10 +500,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	else if (rc > 3) {
 		mode = le32_to_cpu(*((__le32 *)ea_value));
 		fattr->cf_mode &= ~SFBITS_MASK;
-		cFYI(1, "special bits 0%o org mode 0%o", mode,
-			 fattr->cf_mode);
+		cifs_dbg(FYI, "special bits 0%o org mode 0%o\n",
+			 mode, fattr->cf_mode);
 		fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
-		cFYI(1, "special mode bits 0%o", mode);
+		cifs_dbg(FYI, "special mode bits 0%o\n", mode);
 	}
 
 	return 0;
@@ -558,7 +571,7 @@ cifs_get_file_info(struct file *filp)
 	unsigned int xid;
 	FILE_ALL_INFO find_data;
 	struct cifs_fattr fattr;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifsFileInfo *cfile = filp->private_data;
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
@@ -625,11 +638,11 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 	tcon = tlink_tcon(tlink);
 	server = tcon->ses->server;
 
-	cFYI(1, "Getting info on %s", full_path);
+	cifs_dbg(FYI, "Getting info on %s\n", full_path);
 
 	if ((data == NULL) && (*inode != NULL)) {
 		if (CIFS_I(*inode)->clientCanCacheRead) {
-			cFYI(1, "No need to revalidate cached inode sizes");
+			cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
 			goto cgii_exit;
 		}
 	}
@@ -704,7 +717,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 						tcon, cifs_sb, full_path,
 						&fattr.cf_uniqueid, data);
 				if (tmprc) {
-					cFYI(1, "GetSrvInodeNum rc %d", tmprc);
+					cifs_dbg(FYI, "GetSrvInodeNum rc %d\n",
+						 tmprc);
 					fattr.cf_uniqueid = iunique(sb, ROOT_I);
 					cifs_autodisable_serverino(cifs_sb);
 				}
@@ -719,7 +733,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 	    cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
 		tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
 		if (tmprc)
-			cFYI(1, "cifs_sfu_type failed: %d", tmprc);
+			cifs_dbg(FYI, "cifs_sfu_type failed: %d\n", tmprc);
 	}
 
 #ifdef CONFIG_CIFS_ACL
@@ -727,8 +741,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
 		rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, full_path, fid);
 		if (rc) {
-			cFYI(1, "%s: Getting ACL failed with error: %d",
-				__func__, rc);
+			cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n",
+				 __func__, rc);
 			goto cgii_exit;
 		}
 	}
@@ -742,7 +756,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
 		tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
 		if (tmprc)
-			cFYI(1, "CIFSCheckMFSymlink: %d", tmprc);
+			cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc);
 	}
 
 	if (!*inode) {
@@ -806,10 +820,9 @@ static bool
 inode_has_hashed_dentries(struct inode *inode)
 {
 	struct dentry *dentry;
-	struct hlist_node *p;
 
 	spin_lock(&inode->i_lock);
-	hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
+	hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
 			spin_unlock(&inode->i_lock);
 			return true;
@@ -827,7 +840,7 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
 	struct inode *inode;
 
 retry_iget5_locked:
-	cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid);
+	cifs_dbg(FYI, "looking for uniqueid=%llu\n", fattr->cf_uniqueid);
 
 	/* hash down to 32-bits on 32-bit arch */
 	hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
@@ -890,7 +903,7 @@ struct inode *cifs_root_iget(struct super_block *sb)
 #endif
 
 	if (rc && tcon->ipc) {
-		cFYI(1, "ipc connection - fake read inode");
+		cifs_dbg(FYI, "ipc connection - fake read inode\n");
 		spin_lock(&inode->i_lock);
 		inode->i_mode |= S_IFDIR;
 		set_nlink(inode, 2);
@@ -949,7 +962,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid,
 	 * server times.
 	 */
 	if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
-		cFYI(1, "CIFS - CTIME changed");
+		cifs_dbg(FYI, "CIFS - CTIME changed\n");
 		info_buf.ChangeTime =
 		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
 	} else
@@ -986,6 +999,15 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
 		return PTR_ERR(tlink);
 	tcon = tlink_tcon(tlink);
 
+	/*
+	 * We cannot rename the file if the server doesn't support
+	 * CAP_INFOLEVEL_PASSTHRU
+	 */
+	if (!(tcon->ses->capabilities & CAP_INFOLEVEL_PASSTHRU)) {
+		rc = -EBUSY;
+		goto out;
+	}
+
 	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
 			 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
 			 &netfid, &oplock, NULL, cifs_sb->local_nls,
@@ -1014,7 +1036,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
 					current->tgid);
 		/* although we would like to mark the file hidden
  		   if that fails we will still try to rename it */
-		if (rc != 0)
+		if (!rc)
 			cifsInode->cifsAttrs = dosattr;
 		else
 			dosattr = origattr; /* since not able to change them */
@@ -1025,7 +1047,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
 				   cifs_sb->mnt_cifs_flags &
 					    CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc != 0) {
-		rc = -ETXTBSY;
+		rc = -EBUSY;
 		goto undo_setattr;
 	}
 
@@ -1044,7 +1066,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
 		if (rc == -ENOENT)
 			rc = 0;
 		else if (rc != 0) {
-			rc = -ETXTBSY;
+			rc = -EBUSY;
 			goto undo_rename;
 		}
 		cifsInode->delete_pending = true;
@@ -1109,7 +1131,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	struct iattr *attrs = NULL;
 	__u32 dosattr = 0, origattr = 0;
 
-	cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
+	cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry);
 
 	tlink = cifs_sb_tlink(cifs_sb);
 	if (IS_ERR(tlink))
@@ -1132,7 +1154,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 		rc = CIFSPOSIXDelFile(xid, tcon, full_path,
 			SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-		cFYI(1, "posix del rc %d", rc);
+		cifs_dbg(FYI, "posix del rc %d\n", rc);
 		if ((rc == 0) || (rc == -ENOENT))
 			goto psx_del_no_retry;
 	}
@@ -1151,15 +1173,13 @@ psx_del_no_retry:
 			cifs_drop_nlink(inode);
 	} else if (rc == -ENOENT) {
 		d_drop(dentry);
-	} else if (rc == -ETXTBSY) {
+	} else if (rc == -EBUSY) {
 		if (server->ops->rename_pending_delete) {
 			rc = server->ops->rename_pending_delete(full_path,
 								dentry, xid);
 			if (rc == 0)
 				cifs_drop_nlink(inode);
 		}
-		if (rc == -ETXTBSY)
-			rc = -EBUSY;
 	} else if ((rc == -EACCES) && (dosattr == 0) && inode) {
 		attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
 		if (attrs == NULL) {
@@ -1245,14 +1265,14 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
 			.device	= 0,
 		};
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
-			args.uid = (__u64)current_fsuid();
+			args.uid = current_fsuid();
 			if (parent->i_mode & S_ISGID)
-				args.gid = (__u64)parent->i_gid;
+				args.gid = parent->i_gid;
 			else
-				args.gid = (__u64)current_fsgid();
+				args.gid = current_fsgid();
 		} else {
-			args.uid = NO_CHANGE_64;
-			args.gid = NO_CHANGE_64;
+			args.uid = INVALID_UID; /* no change */
+			args.gid = INVALID_GID; /* no change */
 		}
 		CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
 				       cifs_sb->local_nls,
@@ -1304,7 +1324,7 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
 	if (rc == -EOPNOTSUPP)
 		goto posix_mkdir_out;
 	else if (rc) {
-		cFYI(1, "posix mkdir returned 0x%x", rc);
+		cifs_dbg(FYI, "posix mkdir returned 0x%x\n", rc);
 		d_drop(dentry);
 		goto posix_mkdir_out;
 	}
@@ -1326,11 +1346,12 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
 	d_instantiate(dentry, newinode);
 
 #ifdef CONFIG_CIFS_DEBUG2
-	cFYI(1, "instantiated dentry %p %s to inode %p", dentry,
-	     dentry->d_name.name, newinode);
+	cifs_dbg(FYI, "instantiated dentry %p %s to inode %p\n",
+		 dentry, dentry->d_name.name, newinode);
 
 	if (newinode->i_nlink != 2)
-		cFYI(1, "unexpected number of links %d", newinode->i_nlink);
+		cifs_dbg(FYI, "unexpected number of links %d\n",
+			 newinode->i_nlink);
 #endif
 
 posix_mkdir_out:
@@ -1352,7 +1373,8 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
 	struct TCP_Server_Info *server;
 	char *full_path;
 
-	cFYI(1, "In cifs_mkdir, mode = 0x%hx inode = 0x%p", mode, inode);
+	cifs_dbg(FYI, "In cifs_mkdir, mode = 0x%hx inode = 0x%p\n",
+		 mode, inode);
 
 	cifs_sb = CIFS_SB(inode->i_sb);
 	tlink = cifs_sb_tlink(cifs_sb);
@@ -1386,7 +1408,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
 	/* BB add setting the equivalent of mode via CreateX w/ACLs */
 	rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb);
 	if (rc) {
-		cFYI(1, "cifs_mkdir returned 0x%x", rc);
+		cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc);
 		d_drop(direntry);
 		goto mkdir_out;
 	}
@@ -1416,7 +1438,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	char *full_path = NULL;
 	struct cifsInodeInfo *cifsInode;
 
-	cFYI(1, "cifs_rmdir, inode = 0x%p", inode);
+	cifs_dbg(FYI, "cifs_rmdir, inode = 0x%p\n", inode);
 
 	xid = get_xid();
 
@@ -1500,7 +1522,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
 	 * source. Note that cross directory moves do not work with
 	 * rename by filehandle to various Windows servers.
 	 */
-	if (rc == 0 || rc != -ETXTBSY)
+	if (rc == 0 || rc != -EBUSY)
 		goto do_rename_exit;
 
 	/* open-file renames don't work across directories */
@@ -1665,8 +1687,8 @@ cifs_invalidate_mapping(struct inode *inode)
 	if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
 		rc = invalidate_inode_pages2(inode->i_mapping);
 		if (rc) {
-			cERROR(1, "%s: could not invalidate inode %p", __func__,
-			       inode);
+			cifs_dbg(VFS, "%s: could not invalidate inode %p\n",
+				 __func__, inode);
 			cifs_i->invalid_mapping = true;
 		}
 	}
@@ -1678,7 +1700,7 @@ cifs_invalidate_mapping(struct inode *inode)
 int cifs_revalidate_file_attr(struct file *filp)
 {
 	int rc = 0;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
 
 	if (!cifs_inode_needs_reval(inode))
@@ -1716,8 +1738,8 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
 		goto out;
 	}
 
-	cFYI(1, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time "
-		 "%ld jiffies %ld", full_path, inode, inode->i_count.counter,
+	cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n",
+		 full_path, inode, inode->i_count.counter,
 		 dentry, dentry->d_time, jiffies);
 
 	if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
@@ -1735,7 +1757,7 @@ out:
 int cifs_revalidate_file(struct file *filp)
 {
 	int rc;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 
 	rc = cifs_revalidate_file_attr(filp);
 	if (rc)
@@ -1867,7 +1889,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 		else
 			rc = -ENOSYS;
 		cifsFileInfo_put(open_file);
-		cFYI(1, "SetFSize for attrs rc = %d", rc);
+		cifs_dbg(FYI, "SetFSize for attrs rc = %d\n", rc);
 		if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			unsigned int bytes_written;
 
@@ -1878,7 +1900,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 			io_parms.length = attrs->ia_size;
 			rc = CIFSSMBWrite(xid, &io_parms, &bytes_written,
 					  NULL, NULL, 1);
-			cFYI(1, "Wrt seteof rc %d", rc);
+			cifs_dbg(FYI, "Wrt seteof rc %d\n", rc);
 		}
 	} else
 		rc = -EINVAL;
@@ -1904,7 +1926,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 						attrs->ia_size, cifs_sb, false);
 	else
 		rc = -ENOSYS;
-	cFYI(1, "SetEOF by path (setattrs) rc = %d", rc);
+	cifs_dbg(FYI, "SetEOF by path (setattrs) rc = %d\n", rc);
 	if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 		__u16 netfid;
 		int oplock = 0;
@@ -1924,7 +1946,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 			io_parms.length = attrs->ia_size;
 			rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL,
 					  NULL,  1);
-			cFYI(1, "wrt seteof rc %d", rc);
+			cifs_dbg(FYI, "wrt seteof rc %d\n", rc);
 			CIFSSMBClose(xid, tcon, netfid);
 		}
 	}
@@ -1955,7 +1977,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	struct cifs_unix_set_info_args *args = NULL;
 	struct cifsFileInfo *open_file;
 
-	cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x",
+	cifs_dbg(FYI, "setattr_unix on file %s attrs->ia_valid=0x%x\n",
 		 direntry->d_name.name, attrs->ia_valid);
 
 	xid = get_xid();
@@ -2013,12 +2035,12 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	if (attrs->ia_valid & ATTR_UID)
 		args->uid = attrs->ia_uid;
 	else
-		args->uid = NO_CHANGE_64;
+		args->uid = INVALID_UID; /* no change */
 
 	if (attrs->ia_valid & ATTR_GID)
 		args->gid = attrs->ia_gid;
 	else
-		args->gid = NO_CHANGE_64;
+		args->gid = INVALID_GID; /* no change */
 
 	if (attrs->ia_valid & ATTR_ATIME)
 		args->atime = cifs_UnixTimeToNT(attrs->ia_atime);
@@ -2086,8 +2108,8 @@ static int
 cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 {
 	unsigned int xid;
-	uid_t uid = NO_CHANGE_32;
-	gid_t gid = NO_CHANGE_32;
+	kuid_t uid = INVALID_UID;
+	kgid_t gid = INVALID_GID;
 	struct inode *inode = direntry->d_inode;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
@@ -2098,7 +2120,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 
 	xid = get_xid();
 
-	cFYI(1, "setattr on file %s attrs->iavalid 0x%x",
+	cifs_dbg(FYI, "setattr on file %s attrs->iavalid 0x%x\n",
 		 direntry->d_name.name, attrs->ia_valid);
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
@@ -2146,12 +2168,12 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 
 #ifdef CONFIG_CIFS_ACL
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
-		if (uid != NO_CHANGE_32 || gid != NO_CHANGE_32) {
+		if (uid_valid(uid) || gid_valid(gid)) {
 			rc = id_mode_to_cifs_acl(inode, full_path, NO_CHANGE_64,
 							uid, gid);
 			if (rc) {
-				cFYI(1, "%s: Setting id failed with error: %d",
-					__func__, rc);
+				cifs_dbg(FYI, "%s: Setting id failed with error: %d\n",
+					 __func__, rc);
 				goto cifs_setattr_exit;
 			}
 		}
@@ -2170,10 +2192,10 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 #ifdef CONFIG_CIFS_ACL
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
 			rc = id_mode_to_cifs_acl(inode, full_path, mode,
-						NO_CHANGE_32, NO_CHANGE_32);
+						INVALID_UID, INVALID_GID);
 			if (rc) {
-				cFYI(1, "%s: Setting ACL failed with error: %d",
-					__func__, rc);
+				cifs_dbg(FYI, "%s: Setting ACL failed with error: %d\n",
+					 __func__, rc);
 				goto cifs_setattr_exit;
 			}
 		} else
@@ -2261,7 +2283,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 #if 0
 void cifs_delete_inode(struct inode *inode)
 {
-	cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode);
+	cifs_dbg(FYI, "In cifs_delete_inode, inode = 0x%p\n", inode);
 	/* may have to add back in if and when safe distributed caching of
 	   directories added e.g. via FindNotify */
 }
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index fd5009d56f9..3e084558585 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -30,7 +30,7 @@
 
 long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 {
-	struct inode *inode = filep->f_dentry->d_inode;
+	struct inode *inode = file_inode(filep);
 	int rc = -ENOTTY; /* strange error - but the precedent */
 	unsigned int xid;
 	struct cifs_sb_info *cifs_sb;
@@ -44,7 +44,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 
 	xid = get_xid();
 
-	cFYI(1, "ioctl file %p  cmd %u  arg %lu", filep, command, arg);
+	cifs_dbg(FYI, "ioctl file %p  cmd %u  arg %lu\n", filep, command, arg);
 
 	cifs_sb = CIFS_SB(inode->i_sb);
 
@@ -83,11 +83,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 				 *		       &ExtAttrMask);
 				 */
 			}
-			cFYI(1, "set flags not implemented yet");
+			cifs_dbg(FYI, "set flags not implemented yet\n");
 			break;
 #endif /* CONFIG_CIFS_POSIX */
 		default:
-			cFYI(1, "unsupported ioctl");
+			cifs_dbg(FYI, "unsupported ioctl\n");
 			break;
 	}
 
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 51dc2fb6e85..b83c3f5646b 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -56,14 +56,14 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
 	md5 = crypto_alloc_shash("md5", 0, 0);
 	if (IS_ERR(md5)) {
 		rc = PTR_ERR(md5);
-		cERROR(1, "%s: Crypto md5 allocation error %d", __func__, rc);
+		cifs_dbg(VFS, "%s: Crypto md5 allocation error %d\n",
+			 __func__, rc);
 		return rc;
 	}
 	size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
 	sdescmd5 = kmalloc(size, GFP_KERNEL);
 	if (!sdescmd5) {
 		rc = -ENOMEM;
-		cERROR(1, "%s: Memory allocation failure", __func__);
 		goto symlink_hash_err;
 	}
 	sdescmd5->shash.tfm = md5;
@@ -71,17 +71,17 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
 
 	rc = crypto_shash_init(&sdescmd5->shash);
 	if (rc) {
-		cERROR(1, "%s: Could not init md5 shash", __func__);
+		cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__);
 		goto symlink_hash_err;
 	}
 	rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len);
 	if (rc) {
-		cERROR(1, "%s: Could not update iwth link_str", __func__);
+		cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
 		goto symlink_hash_err;
 	}
 	rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
 	if (rc)
-		cERROR(1, "%s: Could not generate md5 hash", __func__);
+		cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
 
 symlink_hash_err:
 	crypto_free_shash(md5);
@@ -115,7 +115,7 @@ CIFSParseMFSymlink(const u8 *buf,
 
 	rc = symlink_hash(link_len, link_str, md5_hash);
 	if (rc) {
-		cFYI(1, "%s: MD5 hash failure: %d", __func__, rc);
+		cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
 		return rc;
 	}
 
@@ -154,7 +154,7 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
 
 	rc = symlink_hash(link_len, link_str, md5_hash);
 	if (rc) {
-		cFYI(1, "%s: MD5 hash failure: %d", __func__, rc);
+		cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
 		return rc;
 	}
 
@@ -521,7 +521,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 	if (!full_path)
 		goto out;
 
-	cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
+	cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode);
 
 	rc = -EACCES;
 	/*
@@ -578,8 +578,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 		goto symlink_exit;
 	}
 
-	cFYI(1, "Full path: %s", full_path);
-	cFYI(1, "symname is %s", symname);
+	cifs_dbg(FYI, "Full path: %s\n", full_path);
+	cifs_dbg(FYI, "symname is %s\n", symname);
 
 	/* BB what if DFS and this volume is on different share? BB */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
@@ -601,8 +601,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 						 inode->i_sb, xid, NULL);
 
 		if (rc != 0) {
-			cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
-			      rc);
+			cifs_dbg(FYI, "Create symlink ok, getinodeinfo fail rc = %d\n",
+				 rc);
 		} else {
 			d_instantiate(direntry, newinode);
 		}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 3a00c0d0cea..f7d4b2285ef 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -54,7 +54,7 @@ _get_xid(void)
 	if (GlobalTotalActiveXid > GlobalMaxActiveXid)
 		GlobalMaxActiveXid = GlobalTotalActiveXid;
 	if (GlobalTotalActiveXid > 65000)
-		cFYI(1, "warning: more than 65000 requests active");
+		cifs_dbg(FYI, "warning: more than 65000 requests active\n");
 	xid = GlobalCurrentXid++;
 	spin_unlock(&GlobalMid_Lock);
 	return xid;
@@ -91,7 +91,7 @@ void
 sesInfoFree(struct cifs_ses *buf_to_free)
 {
 	if (buf_to_free == NULL) {
-		cFYI(1, "Null buffer passed to sesInfoFree");
+		cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n");
 		return;
 	}
 
@@ -130,7 +130,7 @@ void
 tconInfoFree(struct cifs_tcon *buf_to_free)
 {
 	if (buf_to_free == NULL) {
-		cFYI(1, "Null buffer passed to tconInfoFree");
+		cifs_dbg(FYI, "Null buffer passed to tconInfoFree\n");
 		return;
 	}
 	atomic_dec(&tconInfoAllocCount);
@@ -180,7 +180,7 @@ void
 cifs_buf_release(void *buf_to_free)
 {
 	if (buf_to_free == NULL) {
-		/* cFYI(1, "Null buffer passed to cifs_buf_release");*/
+		/* cifs_dbg(FYI, "Null buffer passed to cifs_buf_release\n");*/
 		return;
 	}
 	mempool_free(buf_to_free, cifs_req_poolp);
@@ -216,7 +216,7 @@ cifs_small_buf_release(void *buf_to_free)
 {
 
 	if (buf_to_free == NULL) {
-		cFYI(1, "Null buffer passed to cifs_small_buf_release");
+		cifs_dbg(FYI, "Null buffer passed to cifs_small_buf_release\n");
 		return;
 	}
 	mempool_free(buf_to_free, cifs_sm_req_poolp);
@@ -267,8 +267,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 		if (treeCon->nocase)
 			buffer->Flags  |= SMBFLG_CASELESS;
 		if ((treeCon->ses) && (treeCon->ses->server))
-			if (treeCon->ses->server->sec_mode &
-			  (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+			if (treeCon->ses->server->sign)
 				buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 	}
 
@@ -282,15 +281,15 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
 {
 	/* does it have the right SMB "signature" ? */
 	if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
-		cERROR(1, "Bad protocol string signature header 0x%x",
-			*(unsigned int *)smb->Protocol);
+		cifs_dbg(VFS, "Bad protocol string signature header 0x%x\n",
+			 *(unsigned int *)smb->Protocol);
 		return 1;
 	}
 
 	/* Make sure that message ids match */
 	if (mid != smb->Mid) {
-		cERROR(1, "Mids do not match. received=%u expected=%u",
-			smb->Mid, mid);
+		cifs_dbg(VFS, "Mids do not match. received=%u expected=%u\n",
+			 smb->Mid, mid);
 		return 1;
 	}
 
@@ -302,7 +301,7 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
 	if (smb->Command == SMB_COM_LOCKING_ANDX)
 		return 0;
 
-	cERROR(1, "Server sent request, not response. mid=%u", smb->Mid);
+	cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", smb->Mid);
 	return 1;
 }
 
@@ -313,8 +312,8 @@ checkSMB(char *buf, unsigned int total_read)
 	__u16 mid = smb->Mid;
 	__u32 rfclen = be32_to_cpu(smb->smb_buf_length);
 	__u32 clc_len;  /* calculated length */
-	cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x",
-		total_read, rfclen);
+	cifs_dbg(FYI, "checkSMB Length: 0x%x, smb_buf_length: 0x%x\n",
+		 total_read, rfclen);
 
 	/* is this frame too small to even get to a BCC? */
 	if (total_read < 2 + sizeof(struct smb_hdr)) {
@@ -340,9 +339,9 @@ checkSMB(char *buf, unsigned int total_read)
 				tmp[sizeof(struct smb_hdr)+1] = 0;
 				return 0;
 			}
-			cERROR(1, "rcvd invalid byte count (bcc)");
+			cifs_dbg(VFS, "rcvd invalid byte count (bcc)\n");
 		} else {
-			cERROR(1, "Length less than smb header size");
+			cifs_dbg(VFS, "Length less than smb header size\n");
 		}
 		return -EIO;
 	}
@@ -353,8 +352,8 @@ checkSMB(char *buf, unsigned int total_read)
 	clc_len = smbCalcSize(smb);
 
 	if (4 + rfclen != total_read) {
-		cERROR(1, "Length read does not match RFC1001 length %d",
-				rfclen);
+		cifs_dbg(VFS, "Length read does not match RFC1001 length %d\n",
+			 rfclen);
 		return -EIO;
 	}
 
@@ -365,12 +364,12 @@ checkSMB(char *buf, unsigned int total_read)
 			if (((4 + rfclen) & 0xFFFF) == (clc_len & 0xFFFF))
 				return 0; /* bcc wrapped */
 		}
-		cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
-				clc_len, 4 + rfclen, smb->Mid);
+		cifs_dbg(FYI, "Calculated size %u vs length %u mismatch for mid=%u\n",
+			 clc_len, 4 + rfclen, smb->Mid);
 
 		if (4 + rfclen < clc_len) {
-			cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
-					rfclen, smb->Mid);
+			cifs_dbg(VFS, "RFC1001 size %u smaller than SMB for mid=%u\n",
+				 rfclen, smb->Mid);
 			return -EIO;
 		} else if (rfclen > clc_len + 512) {
 			/*
@@ -382,8 +381,8 @@ checkSMB(char *buf, unsigned int total_read)
 			 * trailing data, we choose limit the amount of extra
 			 * data to 512 bytes.
 			 */
-			cERROR(1, "RFC1001 size %u more than 512 bytes larger "
-				  "than SMB for mid=%u", rfclen, smb->Mid);
+			cifs_dbg(VFS, "RFC1001 size %u more than 512 bytes larger than SMB for mid=%u\n",
+				 rfclen, smb->Mid);
 			return -EIO;
 		}
 	}
@@ -401,7 +400,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
 	struct cifsInodeInfo *pCifsInode;
 	struct cifsFileInfo *netfile;
 
-	cFYI(1, "Checking for oplock break or dnotify response");
+	cifs_dbg(FYI, "Checking for oplock break or dnotify response\n");
 	if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
 	   (pSMB->hdr.Flags & SMBFLG_RESPONSE)) {
 		struct smb_com_transaction_change_notify_rsp *pSMBr =
@@ -413,15 +412,15 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
 
 			pnotify = (struct file_notify_information *)
 				((char *)&pSMBr->hdr.Protocol + data_offset);
-			cFYI(1, "dnotify on %s Action: 0x%x",
+			cifs_dbg(FYI, "dnotify on %s Action: 0x%x\n",
 				 pnotify->FileName, pnotify->Action);
 			/*   cifs_dump_mem("Rcvd notify Data: ",buf,
 				sizeof(struct smb_hdr)+60); */
 			return true;
 		}
 		if (pSMBr->hdr.Status.CifsError) {
-			cFYI(1, "notify err 0x%d",
-				pSMBr->hdr.Status.CifsError);
+			cifs_dbg(FYI, "notify err 0x%d\n",
+				 pSMBr->hdr.Status.CifsError);
 			return true;
 		}
 		return false;
@@ -435,7 +434,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
 		   large dirty files cached on the client */
 		if ((NT_STATUS_INVALID_HANDLE) ==
 		   le32_to_cpu(pSMB->hdr.Status.CifsError)) {
-			cFYI(1, "invalid handle on oplock break");
+			cifs_dbg(FYI, "invalid handle on oplock break\n");
 			return true;
 		} else if (ERRbadfid ==
 		   le16_to_cpu(pSMB->hdr.Status.DosError.Error)) {
@@ -447,7 +446,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
 	if (pSMB->hdr.WordCount != 8)
 		return false;
 
-	cFYI(1, "oplock type 0x%d level 0x%d",
+	cifs_dbg(FYI, "oplock type 0x%d level 0x%d\n",
 		 pSMB->LockType, pSMB->OplockLevel);
 	if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
 		return false;
@@ -469,7 +468,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
 				if (pSMB->Fid != netfile->fid.netfid)
 					continue;
 
-				cFYI(1, "file id match, oplock break");
+				cifs_dbg(FYI, "file id match, oplock break\n");
 				pCifsInode = CIFS_I(netfile->dentry->d_inode);
 
 				cifs_set_oplock_level(pCifsInode,
@@ -484,12 +483,12 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
 			}
 			spin_unlock(&cifs_file_list_lock);
 			spin_unlock(&cifs_tcp_ses_lock);
-			cFYI(1, "No matching file for oplock break");
+			cifs_dbg(FYI, "No matching file for oplock break\n");
 			return true;
 		}
 	}
 	spin_unlock(&cifs_tcp_ses_lock);
-	cFYI(1, "Can not process oplock break for non-existent connection");
+	cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
 	return true;
 }
 
@@ -536,12 +535,8 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
 {
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
 		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
-		cERROR(1, "Autodisabling the use of server inode numbers on "
-			   "%s. This server doesn't seem to support them "
-			   "properly. Hardlinks will not be recognized on this "
-			   "mount. Consider mounting with the \"noserverino\" "
-			   "option to silence this message.",
-			   cifs_sb_master_tcon(cifs_sb)->treeName);
+		cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s. This server doesn't seem to support them properly. Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n",
+			 cifs_sb_master_tcon(cifs_sb)->treeName);
 	}
 }
 
@@ -552,13 +547,13 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
 	if (oplock == OPLOCK_EXCLUSIVE) {
 		cinode->clientCanCacheAll = true;
 		cinode->clientCanCacheRead = true;
-		cFYI(1, "Exclusive Oplock granted on inode %p",
-		     &cinode->vfs_inode);
+		cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
+			 &cinode->vfs_inode);
 	} else if (oplock == OPLOCK_READ) {
 		cinode->clientCanCacheAll = false;
 		cinode->clientCanCacheRead = true;
-		cFYI(1, "Level II Oplock granted on inode %p",
-		    &cinode->vfs_inode);
+		cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
+			 &cinode->vfs_inode);
 	} else {
 		cinode->clientCanCacheAll = false;
 		cinode->clientCanCacheRead = false;
@@ -569,7 +564,7 @@ bool
 backup_cred(struct cifs_sb_info *cifs_sb)
 {
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) {
-		if (cifs_sb->mnt_backupuid == current_fsuid())
+		if (uid_eq(cifs_sb->mnt_backupuid, current_fsuid()))
 			return true;
 	}
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) {
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index a82bc51fdc8..af847e1cf1c 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -62,7 +62,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
 	{ERRdiffdevice, -EXDEV},
 	{ERRnofiles, -ENOENT},
 	{ERRwriteprot, -EROFS},
-	{ERRbadshare, -ETXTBSY},
+	{ERRbadshare, -EBUSY},
 	{ERRlock, -EACCES},
 	{ERRunsup, -EINVAL},
 	{ERRnosuchshare, -ENXIO},
@@ -150,8 +150,8 @@ cifs_inet_pton(const int address_family, const char *cp, int len, void *dst)
 	else if (address_family == AF_INET6)
 		ret = in6_pton(cp, len, dst , '\\', NULL);
 
-	cFYI(DBG2, "address conversion returned %d for %*.*s",
-	     ret, len, len, cp);
+	cifs_dbg(NOISY, "address conversion returned %d for %*.*s\n",
+		 ret, len, len, cp);
 	if (ret > 0)
 		ret = 1;
 	return ret;
@@ -887,7 +887,7 @@ map_smb_to_linux_error(char *buf, bool logErr)
 	}
 	/* else ERRHRD class errors or junk  - return EIO */
 
-	cFYI(1, "Mapping smb error code 0x%x to POSIX err %d",
+	cifs_dbg(FYI, "Mapping smb error code 0x%x to POSIX err %d\n",
 		 le32_to_cpu(smb->Status.CifsError), rc);
 
 	/* generic corrective action e.g. reconnect SMB session on
@@ -951,20 +951,20 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
 	SMB_TIME *st = (SMB_TIME *)&time;
 	SMB_DATE *sd = (SMB_DATE *)&date;
 
-	cFYI(1, "date %d time %d", date, time);
+	cifs_dbg(FYI, "date %d time %d\n", date, time);
 
 	sec = 2 * st->TwoSeconds;
 	min = st->Minutes;
 	if ((sec > 59) || (min > 59))
-		cERROR(1, "illegal time min %d sec %d", min, sec);
+		cifs_dbg(VFS, "illegal time min %d sec %d\n", min, sec);
 	sec += (min * 60);
 	sec += 60 * 60 * st->Hours;
 	if (st->Hours > 24)
-		cERROR(1, "illegal hours %d", st->Hours);
+		cifs_dbg(VFS, "illegal hours %d\n", st->Hours);
 	days = sd->Day;
 	month = sd->Month;
 	if ((days > 31) || (month > 12)) {
-		cERROR(1, "illegal date, month %d day: %d", month, days);
+		cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days);
 		if (month > 12)
 			month = 12;
 	}
@@ -990,7 +990,7 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
 
 	ts.tv_sec = sec + offset;
 
-	/* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */
+	/* cifs_dbg(FYI, "sec after cnvrt dos to unix time %d\n",sec); */
 
 	ts.tv_nsec = 0;
 	return ts;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index cdd6ff48246..ab877846939 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -48,15 +48,15 @@ static void dump_cifs_file_struct(struct file *file, char *label)
 	if (file) {
 		cf = file->private_data;
 		if (cf == NULL) {
-			cFYI(1, "empty cifs private file data");
+			cifs_dbg(FYI, "empty cifs private file data\n");
 			return;
 		}
 		if (cf->invalidHandle)
-			cFYI(1, "invalid handle");
+			cifs_dbg(FYI, "invalid handle\n");
 		if (cf->srch_inf.endOfSearch)
-			cFYI(1, "end of search");
+			cifs_dbg(FYI, "end of search\n");
 		if (cf->srch_inf.emptyDir)
-			cFYI(1, "empty dir");
+			cifs_dbg(FYI, "empty dir\n");
 	}
 }
 #else
@@ -80,14 +80,12 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
 	struct super_block *sb = parent->d_inode->i_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	cFYI(1, "%s: for %s", __func__, name->name);
+	cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
 
-	if (parent->d_op && parent->d_op->d_hash)
-		parent->d_op->d_hash(parent, parent->d_inode, name);
-	else
-		name->hash = full_name_hash(name->name, name->len);
+	dentry = d_hash_and_lookup(parent, name);
+	if (unlikely(IS_ERR(dentry)))
+		return;
 
-	dentry = d_lookup(parent, name);
 	if (dentry) {
 		int err;
 
@@ -128,6 +126,22 @@ out:
 	dput(dentry);
 }
 
+/*
+ * Is it possible that this directory might turn out to be a DFS referral
+ * once we go to try and use it?
+ */
+static bool
+cifs_dfs_is_possible(struct cifs_sb_info *cifs_sb)
+{
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+	if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+		return true;
+#endif
+	return false;
+}
+
 static void
 cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
 {
@@ -137,6 +151,19 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
 	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
 		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
 		fattr->cf_dtype = DT_DIR;
+		/*
+		 * Windows CIFS servers generally make DFS referrals look
+		 * like directories in FIND_* responses with the reparse
+		 * attribute flag also set (since DFS junctions are
+		 * reparse points). We must revalidate at least these
+		 * directory inodes before trying to use them (if
+		 * they are DFS we will get PATH_NOT_COVERED back
+		 * when queried directly and can then try to connect
+		 * to the DFS target)
+		 */
+		if (cifs_dfs_is_possible(cifs_sb) &&
+		    (fattr->cf_cifsattrs & ATTR_REPARSE))
+			fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
 	} else {
 		fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
 		fattr->cf_dtype = DT_REG;
@@ -235,7 +262,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
 				fid,
 				cifs_sb->local_nls);
 		if (CIFSSMBClose(xid, ptcon, fid)) {
-			cFYI(1, "Error closing temporary reparsepoint open");
+			cifs_dbg(FYI, "Error closing temporary reparsepoint open\n");
 		}
 	}
 }
@@ -287,7 +314,7 @@ initiate_cifs_search(const unsigned int xid, struct file *file)
 		goto error_exit;
 	}
 
-	cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
+	cifs_dbg(FYI, "Full path: %s start at: %lld\n", full_path, file->f_pos);
 
 ffirst_retry:
 	/* test for Unix extensions */
@@ -338,7 +365,7 @@ static int cifs_unicode_bytelen(const char *str)
 		if (ustr[len] == 0)
 			return len << 1;
 	}
-	cFYI(1, "Unicode string longer than PATH_MAX found");
+	cifs_dbg(FYI, "Unicode string longer than PATH_MAX found\n");
 	return len << 1;
 }
 
@@ -355,18 +382,18 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
 				pfData->FileNameLength;
 	} else
 		new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
-	cFYI(1, "new entry %p old entry %p", new_entry, old_entry);
+	cifs_dbg(FYI, "new entry %p old entry %p\n", new_entry, old_entry);
 	/* validate that new_entry is not past end of SMB */
 	if (new_entry >= end_of_smb) {
-		cERROR(1, "search entry %p began after end of SMB %p old entry %p",
-			new_entry, end_of_smb, old_entry);
+		cifs_dbg(VFS, "search entry %p began after end of SMB %p old entry %p\n",
+			 new_entry, end_of_smb, old_entry);
 		return NULL;
 	} else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
 		    (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb))
 		  || ((level != SMB_FIND_FILE_INFO_STANDARD) &&
 		   (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb)))  {
-		cERROR(1, "search entry %p extends after end of SMB %p",
-			new_entry, end_of_smb);
+		cifs_dbg(VFS, "search entry %p extends after end of SMB %p\n",
+			 new_entry, end_of_smb);
 		return NULL;
 	} else
 		return new_entry;
@@ -459,7 +486,7 @@ static int cifs_fill_dirent(struct cifs_dirent *de, const void *info,
 		cifs_fill_dirent_std(de, info);
 		break;
 	default:
-		cFYI(1, "Unknown findfirst level %d", level);
+		cifs_dbg(FYI, "Unknown findfirst level %d\n", level);
 		return -EINVAL;
 	}
 
@@ -505,7 +532,7 @@ static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode)
    whether we can use the cached search results from the previous search */
 static int is_dir_changed(struct file *file)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
 
 	if (cifsInfo->time == 0)
@@ -539,14 +566,14 @@ static int cifs_save_resume_key(const char *current_entry,
  * every entry (do not increment for . or .. entry).
  */
 static int
-find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
+find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
 		struct file *file, char **current_entry, int *num_to_ret)
 {
 	__u16 search_flags;
 	int rc = 0;
 	int pos_in_buf = 0;
 	loff_t first_entry_in_buffer;
-	loff_t index_to_find = file->f_pos;
+	loff_t index_to_find = pos;
 	struct cifsFileInfo *cfile = file->private_data;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	struct TCP_Server_Info *server = tcon->ses->server;
@@ -574,7 +601,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
 	if (((index_to_find < cfile->srch_inf.index_of_last_entry) &&
 	     is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) {
 		/* close and restart search */
-		cFYI(1, "search backing up - close and restart search");
+		cifs_dbg(FYI, "search backing up - close and restart search\n");
 		spin_lock(&cifs_file_list_lock);
 		if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) {
 			cfile->invalidHandle = true;
@@ -584,7 +611,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
 		} else
 			spin_unlock(&cifs_file_list_lock);
 		if (cfile->srch_inf.ntwrk_buf_start) {
-			cFYI(1, "freeing SMB ff cache buf on search rewind");
+			cifs_dbg(FYI, "freeing SMB ff cache buf on search rewind\n");
 			if (cfile->srch_inf.smallBuf)
 				cifs_small_buf_release(cfile->srch_inf.
 						ntwrk_buf_start);
@@ -595,7 +622,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
 		}
 		rc = initiate_cifs_search(xid, file);
 		if (rc) {
-			cFYI(1, "error %d reinitiating a search on rewind",
+			cifs_dbg(FYI, "error %d reinitiating a search on rewind\n",
 				 rc);
 			return rc;
 		}
@@ -610,7 +637,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
 
 	while ((index_to_find >= cfile->srch_inf.index_of_last_entry) &&
 	       (rc == 0) && !cfile->srch_inf.endOfSearch) {
-		cFYI(1, "calling findnext2");
+		cifs_dbg(FYI, "calling findnext2\n");
 		rc = server->ops->query_dir_next(xid, tcon, &cfile->fid,
 						 search_flags,
 						 &cfile->srch_inf);
@@ -633,7 +660,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
 		first_entry_in_buffer = cfile->srch_inf.index_of_last_entry
 					- cfile->srch_inf.entries_in_buffer;
 		pos_in_buf = index_to_find - first_entry_in_buffer;
-		cFYI(1, "found entry - pos_in_buf %d", pos_in_buf);
+		cifs_dbg(FYI, "found entry - pos_in_buf %d\n", pos_in_buf);
 
 		for (i = 0; (i < (pos_in_buf)) && (cur_ent != NULL); i++) {
 			/* go entry by entry figuring out which is first */
@@ -642,19 +669,18 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
 		}
 		if ((cur_ent == NULL) && (i < pos_in_buf)) {
 			/* BB fixme - check if we should flag this error */
-			cERROR(1, "reached end of buf searching for pos in buf"
-				  " %d index to find %lld rc %d", pos_in_buf,
-				  index_to_find, rc);
+			cifs_dbg(VFS, "reached end of buf searching for pos in buf %d index to find %lld rc %d\n",
+				 pos_in_buf, index_to_find, rc);
 		}
 		rc = 0;
 		*current_entry = cur_ent;
 	} else {
-		cFYI(1, "index not in buffer - could not findnext into it");
+		cifs_dbg(FYI, "index not in buffer - could not findnext into it\n");
 		return 0;
 	}
 
 	if (pos_in_buf >= cfile->srch_inf.entries_in_buffer) {
-		cFYI(1, "can not return entries pos_in_buf beyond last");
+		cifs_dbg(FYI, "can not return entries pos_in_buf beyond last\n");
 		*num_to_ret = 0;
 	} else
 		*num_to_ret = cfile->srch_inf.entries_in_buffer - pos_in_buf;
@@ -662,8 +688,9 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
 	return rc;
 }
 
-static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
-		void *dirent, char *scratch_buf, unsigned int max_len)
+static int cifs_filldir(char *find_entry, struct file *file,
+		struct dir_context *ctx,
+		char *scratch_buf, unsigned int max_len)
 {
 	struct cifsFileInfo *file_info = file->private_data;
 	struct super_block *sb = file->f_path.dentry->d_sb;
@@ -680,8 +707,8 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
 		return rc;
 
 	if (de.namelen > max_len) {
-		cERROR(1, "bad search response length %zd past smb end",
-			  de.namelen);
+		cifs_dbg(VFS, "bad search response length %zd past smb end\n",
+			 de.namelen);
 		return -EINVAL;
 	}
 
@@ -743,13 +770,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
 	cifs_prime_dcache(file->f_dentry, &name, &fattr);
 
 	ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
-	rc = filldir(dirent, name.name, name.len, file->f_pos, ino,
-		     fattr.cf_dtype);
-	return rc;
+	return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype);
 }
 
 
-int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
+int cifs_readdir(struct file *file, struct dir_context *ctx)
 {
 	int rc = 0;
 	unsigned int xid;
@@ -770,108 +795,91 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	 */
 	if (file->private_data == NULL) {
 		rc = initiate_cifs_search(xid, file);
-		cFYI(1, "initiate cifs search rc %d", rc);
+		cifs_dbg(FYI, "initiate cifs search rc %d\n", rc);
 		if (rc)
 			goto rddir2_exit;
 	}
 
-	switch ((int) file->f_pos) {
-	case 0:
-		if (filldir(direntry, ".", 1, file->f_pos,
-		     file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) {
-			cERROR(1, "Filldir for current dir failed");
-			rc = -ENOMEM;
-			break;
-		}
-		file->f_pos++;
-	case 1:
-		if (filldir(direntry, "..", 2, file->f_pos,
-		     parent_ino(file->f_path.dentry), DT_DIR) < 0) {
-			cERROR(1, "Filldir for parent dir failed");
-			rc = -ENOMEM;
-			break;
-		}
-		file->f_pos++;
-	default:
-		/* 1) If search is active,
-			is in current search buffer?
-			if it before then restart search
-			if after then keep searching till find it */
-
-		if (file->private_data == NULL) {
-			rc = -EINVAL;
-			free_xid(xid);
-			return rc;
-		}
-		cifsFile = file->private_data;
-		if (cifsFile->srch_inf.endOfSearch) {
-			if (cifsFile->srch_inf.emptyDir) {
-				cFYI(1, "End of search, empty dir");
-				rc = 0;
-				break;
-			}
-		} /* else {
-			cifsFile->invalidHandle = true;
-			tcon->ses->server->close(xid, tcon, &cifsFile->fid);
-		} */
+	if (!dir_emit_dots(file, ctx))
+		goto rddir2_exit;
 
-		tcon = tlink_tcon(cifsFile->tlink);
-		rc = find_cifs_entry(xid, tcon, file, &current_entry,
-				     &num_to_fill);
-		if (rc) {
-			cFYI(1, "fce error %d", rc);
-			goto rddir2_exit;
-		} else if (current_entry != NULL) {
-			cFYI(1, "entry %lld found", file->f_pos);
-		} else {
-			cFYI(1, "could not find entry");
+	/* 1) If search is active,
+		is in current search buffer?
+		if it before then restart search
+		if after then keep searching till find it */
+
+	if (file->private_data == NULL) {
+		rc = -EINVAL;
+		goto rddir2_exit;
+	}
+	cifsFile = file->private_data;
+	if (cifsFile->srch_inf.endOfSearch) {
+		if (cifsFile->srch_inf.emptyDir) {
+			cifs_dbg(FYI, "End of search, empty dir\n");
+			rc = 0;
 			goto rddir2_exit;
 		}
-		cFYI(1, "loop through %d times filling dir for net buf %p",
-			num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
-		max_len = tcon->ses->server->ops->calc_smb_size(
-				cifsFile->srch_inf.ntwrk_buf_start);
-		end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
-
-		tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
-		if (tmp_buf == NULL) {
-			rc = -ENOMEM;
+	} /* else {
+		cifsFile->invalidHandle = true;
+		tcon->ses->server->close(xid, tcon, &cifsFile->fid);
+	} */
+
+	tcon = tlink_tcon(cifsFile->tlink);
+	rc = find_cifs_entry(xid, tcon, ctx->pos, file, &current_entry,
+			     &num_to_fill);
+	if (rc) {
+		cifs_dbg(FYI, "fce error %d\n", rc);
+		goto rddir2_exit;
+	} else if (current_entry != NULL) {
+		cifs_dbg(FYI, "entry %lld found\n", ctx->pos);
+	} else {
+		cifs_dbg(FYI, "could not find entry\n");
+		goto rddir2_exit;
+	}
+	cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
+		 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
+	max_len = tcon->ses->server->ops->calc_smb_size(
+			cifsFile->srch_inf.ntwrk_buf_start);
+	end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
+
+	tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
+	if (tmp_buf == NULL) {
+		rc = -ENOMEM;
+		goto rddir2_exit;
+	}
+
+	for (i = 0; i < num_to_fill; i++) {
+		if (current_entry == NULL) {
+			/* evaluate whether this case is an error */
+			cifs_dbg(VFS, "past SMB end,  num to fill %d i %d\n",
+				 num_to_fill, i);
 			break;
 		}
-
-		for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
-			if (current_entry == NULL) {
-				/* evaluate whether this case is an error */
-				cERROR(1, "past SMB end,  num to fill %d i %d",
-					  num_to_fill, i);
-				break;
-			}
-			/*
-			 * if buggy server returns . and .. late do we want to
-			 * check for that here?
-			 */
-			rc = cifs_filldir(current_entry, file, filldir,
-					  direntry, tmp_buf, max_len);
-			if (rc == -EOVERFLOW) {
+		/*
+		 * if buggy server returns . and .. late do we want to
+		 * check for that here?
+		 */
+		rc = cifs_filldir(current_entry, file, ctx,
+				  tmp_buf, max_len);
+		if (rc) {
+			if (rc > 0)
 				rc = 0;
-				break;
-			}
-
-			file->f_pos++;
-			if (file->f_pos ==
-				cifsFile->srch_inf.index_of_last_entry) {
-				cFYI(1, "last entry in buf at pos %lld %s",
-					file->f_pos, tmp_buf);
-				cifs_save_resume_key(current_entry, cifsFile);
-				break;
-			} else
-				current_entry =
-					nxt_dir_entry(current_entry, end_of_smb,
-						cifsFile->srch_inf.info_level);
+			break;
 		}
-		kfree(tmp_buf);
-		break;
-	} /* end switch */
+
+		ctx->pos++;
+		if (ctx->pos ==
+			cifsFile->srch_inf.index_of_last_entry) {
+			cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
+				 ctx->pos, tmp_buf);
+			cifs_save_resume_key(current_entry, cifsFile);
+			break;
+		} else
+			current_entry =
+				nxt_dir_entry(current_entry, end_of_smb,
+					cifsFile->srch_inf.info_level);
+	}
+	kfree(tmp_buf);
 
 rddir2_exit:
 	free_xid(xid);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 76809f4d342..79358e341fd 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -138,8 +138,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
 	capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
 			CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
 
-	if (ses->server->sec_mode &
-	    (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+	if (ses->server->sign)
 		pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 
 	if (ses->capabilities & CAP_UNICODE) {
@@ -283,11 +282,11 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
 	int len;
 	char *data = *pbcc_area;
 
-	cFYI(1, "bleft %d", bleft);
+	cifs_dbg(FYI, "bleft %d\n", bleft);
 
 	kfree(ses->serverOS);
 	ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
-	cFYI(1, "serverOS=%s", ses->serverOS);
+	cifs_dbg(FYI, "serverOS=%s\n", ses->serverOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
 	bleft -= len;
@@ -296,7 +295,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
 
 	kfree(ses->serverNOS);
 	ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
-	cFYI(1, "serverNOS=%s", ses->serverNOS);
+	cifs_dbg(FYI, "serverNOS=%s\n", ses->serverNOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
 	bleft -= len;
@@ -305,41 +304,38 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
 
 	kfree(ses->serverDomain);
 	ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
-	cFYI(1, "serverDomain=%s", ses->serverDomain);
+	cifs_dbg(FYI, "serverDomain=%s\n", ses->serverDomain);
 
 	return;
 }
 
-static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
-			       struct cifs_ses *ses,
-			       const struct nls_table *nls_cp)
+static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
+				struct cifs_ses *ses,
+				const struct nls_table *nls_cp)
 {
-	int rc = 0;
 	int len;
 	char *bcc_ptr = *pbcc_area;
 
-	cFYI(1, "decode sessetup ascii. bleft %d", bleft);
+	cifs_dbg(FYI, "decode sessetup ascii. bleft %d\n", bleft);
 
 	len = strnlen(bcc_ptr, bleft);
 	if (len >= bleft)
-		return rc;
+		return;
 
 	kfree(ses->serverOS);
 
 	ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
 	if (ses->serverOS)
 		strncpy(ses->serverOS, bcc_ptr, len);
-	if (strncmp(ses->serverOS, "OS/2", 4) == 0) {
-			cFYI(1, "OS/2 server");
-			ses->flags |= CIFS_SES_OS2;
-	}
+	if (strncmp(ses->serverOS, "OS/2", 4) == 0)
+		cifs_dbg(FYI, "OS/2 server\n");
 
 	bcc_ptr += len + 1;
 	bleft -= len + 1;
 
 	len = strnlen(bcc_ptr, bleft);
 	if (len >= bleft)
-		return rc;
+		return;
 
 	kfree(ses->serverNOS);
 
@@ -352,16 +348,14 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
 
 	len = strnlen(bcc_ptr, bleft);
 	if (len > bleft)
-		return rc;
+		return;
 
 	/* No domain field in LANMAN case. Domain is
 	   returned by old servers in the SMB negprot response */
 	/* BB For newer servers which do not support Unicode,
 	   but thus do return domain here we could add parsing
 	   for it later, but it is not very important */
-	cFYI(1, "ascii: bytes left %d", bleft);
-
-	return rc;
+	cifs_dbg(FYI, "ascii: bytes left %d\n", bleft);
 }
 
 int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
@@ -373,16 +367,18 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 	CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
 
 	if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
-		cERROR(1, "challenge blob len %d too small", blob_len);
+		cifs_dbg(VFS, "challenge blob len %d too small\n", blob_len);
 		return -EINVAL;
 	}
 
 	if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
-		cERROR(1, "blob signature incorrect %s", pblob->Signature);
+		cifs_dbg(VFS, "blob signature incorrect %s\n",
+			 pblob->Signature);
 		return -EINVAL;
 	}
 	if (pblob->MessageType != NtLmChallenge) {
-		cERROR(1, "Incorrect message type %d", pblob->MessageType);
+		cifs_dbg(VFS, "Incorrect message type %d\n",
+			 pblob->MessageType);
 		return -EINVAL;
 	}
 
@@ -395,16 +391,17 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 	tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
 	tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
 	if (tioffset > blob_len || tioffset + tilen > blob_len) {
-		cERROR(1, "tioffset + tilen too high %u + %u", tioffset, tilen);
+		cifs_dbg(VFS, "tioffset + tilen too high %u + %u",
+			tioffset, tilen);
 		return -EINVAL;
 	}
 	if (tilen) {
-		ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
+		ses->auth_key.response = kmemdup(bcc_ptr + tioffset, tilen,
+						 GFP_KERNEL);
 		if (!ses->auth_key.response) {
-			cERROR(1, "Challenge target info allocation failure");
+			cifs_dbg(VFS, "Challenge target info alloc failure");
 			return -ENOMEM;
 		}
-		memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen);
 		ses->auth_key.len = tilen;
 	}
 
@@ -429,8 +426,7 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
 	flags = NTLMSSP_NEGOTIATE_56 |	NTLMSSP_REQUEST_TARGET |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
 		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
-	if (ses->server->sec_mode &
-			(SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+	if (ses->server->sign) {
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
 		if (!ses->server->session_estab)
 			flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
@@ -468,8 +464,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
 		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
-	if (ses->server->sec_mode &
-	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+	if (ses->server->sign) {
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
 		if (!ses->server->session_estab)
 			flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
@@ -486,7 +481,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
 	rc = setup_ntlmv2_rsp(ses, nls_cp);
 	if (rc) {
-		cERROR(1, "Error %d during NTLMSSP authentication", rc);
+		cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc);
 		goto setup_ntlmv2_ret;
 	}
 	memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
@@ -555,6 +550,56 @@ setup_ntlmv2_ret:
 	return rc;
 }
 
+enum securityEnum
+select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
+{
+	switch (server->negflavor) {
+	case CIFS_NEGFLAVOR_EXTENDED:
+		switch (requested) {
+		case Kerberos:
+		case RawNTLMSSP:
+			return requested;
+		case Unspecified:
+			if (server->sec_ntlmssp &&
+			    (global_secflags & CIFSSEC_MAY_NTLMSSP))
+				return RawNTLMSSP;
+			if ((server->sec_kerberos || server->sec_mskerberos) &&
+			    (global_secflags & CIFSSEC_MAY_KRB5))
+				return Kerberos;
+			/* Fallthrough */
+		default:
+			return Unspecified;
+		}
+	case CIFS_NEGFLAVOR_UNENCAP:
+		switch (requested) {
+		case NTLM:
+		case NTLMv2:
+			return requested;
+		case Unspecified:
+			if (global_secflags & CIFSSEC_MAY_NTLMV2)
+				return NTLMv2;
+			if (global_secflags & CIFSSEC_MAY_NTLM)
+				return NTLM;
+			/* Fallthrough */
+		default:
+			return Unspecified;
+		}
+	case CIFS_NEGFLAVOR_LANMAN:
+		switch (requested) {
+		case LANMAN:
+			return requested;
+		case Unspecified:
+			if (global_secflags & CIFSSEC_MAY_LANMAN)
+				return LANMAN;
+			/* Fallthrough */
+		default:
+			return Unspecified;
+		}
+	default:
+		return Unspecified;
+	}
+}
+
 int
 CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
 	       const struct nls_table *nls_cp)
@@ -576,11 +621,18 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
 	u16 blob_len;
 	char *ntlmsspblob = NULL;
 
-	if (ses == NULL)
+	if (ses == NULL) {
+		WARN(1, "%s: ses == NULL!", __func__);
+		return -EINVAL;
+	}
+
+	type = select_sectype(ses->server, ses->sectype);
+	cifs_dbg(FYI, "sess setup type %d\n", type);
+	if (type == Unspecified) {
+		cifs_dbg(VFS, "Unable to select appropriate authentication method!");
 		return -EINVAL;
+	}
 
-	type = ses->server->secType;
-	cFYI(1, "sess setup type %d", type);
 	if (type == RawNTLMSSP) {
 		/* if memory allocation is successful, caller of this function
 		 * frees it.
@@ -640,8 +692,6 @@ ssetup_ntlmssp_authenticate:
 	}
 	bcc_ptr = str_area;
 
-	ses->flags &= ~CIFS_SES_LANMAN;
-
 	iov[1].iov_base = NULL;
 	iov[1].iov_len = 0;
 
@@ -665,7 +715,6 @@ ssetup_ntlmssp_authenticate:
 				 ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
 					true : false, lnm_session_key);
 
-		ses->flags |= CIFS_SES_LANMAN;
 		memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
 		bcc_ptr += CIFS_AUTH_RESP_SIZE;
 
@@ -674,7 +723,7 @@ ssetup_ntlmssp_authenticate:
 		changed to do higher than lanman dialect and
 		we reconnected would we ever calc signing_key? */
 
-		cFYI(1, "Negotiating LANMAN setting up strings");
+		cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
 		/* Unicode not allowed for LANMAN dialects */
 		ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #endif
@@ -688,7 +737,8 @@ ssetup_ntlmssp_authenticate:
 		/* calculate ntlm response and session key */
 		rc = setup_ntlm_response(ses, nls_cp);
 		if (rc) {
-			cERROR(1, "Error %d during NTLM authentication", rc);
+			cifs_dbg(VFS, "Error %d during NTLM authentication\n",
+				 rc);
 			goto ssetup_exit;
 		}
 
@@ -718,7 +768,8 @@ ssetup_ntlmssp_authenticate:
 		/* calculate nlmv2 response and session key */
 		rc = setup_ntlmv2_rsp(ses, nls_cp);
 		if (rc) {
-			cERROR(1, "Error %d during NTLMv2 authentication", rc);
+			cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n",
+				 rc);
 			goto ssetup_exit;
 		}
 		memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
@@ -754,21 +805,21 @@ ssetup_ntlmssp_authenticate:
 		/* check version field to make sure that cifs.upcall is
 		   sending us a response in an expected form */
 		if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
-			cERROR(1, "incorrect version of cifs.upcall (expected"
-				   " %d but got %d)",
+			cifs_dbg(VFS, "incorrect version of cifs.upcall "
+				   "expected %d but got %d)",
 				   CIFS_SPNEGO_UPCALL_VERSION, msg->version);
 			rc = -EKEYREJECTED;
 			goto ssetup_exit;
 		}
 
-		ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL);
+		ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
+						 GFP_KERNEL);
 		if (!ses->auth_key.response) {
-			cERROR(1, "Kerberos can't allocate (%u bytes) memory",
+			cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory",
 					msg->sesskey_len);
 			rc = -ENOMEM;
 			goto ssetup_exit;
 		}
-		memcpy(ses->auth_key.response, msg->data, msg->sesskey_len);
 		ses->auth_key.len = msg->sesskey_len;
 
 		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
@@ -790,18 +841,18 @@ ssetup_ntlmssp_authenticate:
 		/* BB: is this right? */
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #else /* ! CONFIG_CIFS_UPCALL */
-		cERROR(1, "Kerberos negotiated but upcall support disabled!");
+		cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
 		rc = -ENOSYS;
 		goto ssetup_exit;
 #endif /* CONFIG_CIFS_UPCALL */
 	} else if (type == RawNTLMSSP) {
 		if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
-			cERROR(1, "NTLMSSP requires Unicode support");
+			cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
 			rc = -ENOSYS;
 			goto ssetup_exit;
 		}
 
-		cFYI(1, "ntlmssp session setup phase %d", phase);
+		cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase);
 		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 		capabilities |= CAP_EXTENDED_SECURITY;
 		pSMB->req.Capabilities |= cpu_to_le32(capabilities);
@@ -824,7 +875,6 @@ ssetup_ntlmssp_authenticate:
 				5*sizeof(struct _AUTHENTICATE_MESSAGE),
 				GFP_KERNEL);
 			if (!ntlmsspblob) {
-				cERROR(1, "Can't allocate NTLMSSP blob");
 				rc = -ENOMEM;
 				goto ssetup_exit;
 			}
@@ -844,7 +894,7 @@ ssetup_ntlmssp_authenticate:
 			smb_buf->Uid = ses->Suid;
 			break;
 		default:
-			cERROR(1, "invalid phase %d", phase);
+			cifs_dbg(VFS, "invalid phase %d\n", phase);
 			rc = -ENOSYS;
 			goto ssetup_exit;
 		}
@@ -855,7 +905,7 @@ ssetup_ntlmssp_authenticate:
 		}
 		unicode_oslm_strings(&bcc_ptr, nls_cp);
 	} else {
-		cERROR(1, "secType %d not supported!", type);
+		cifs_dbg(VFS, "secType %d not supported!\n", type);
 		rc = -ENOSYS;
 		goto ssetup_exit;
 	}
@@ -880,7 +930,7 @@ ssetup_ntlmssp_authenticate:
 	    (smb_buf->Status.CifsError ==
 			cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
 		if (phase != NtLmNegotiate) {
-			cERROR(1, "Unexpected more processing error");
+			cifs_dbg(VFS, "Unexpected more processing error\n");
 			goto ssetup_exit;
 		}
 		/* NTLMSSP Negotiate sent now processing challenge (response) */
@@ -892,14 +942,14 @@ ssetup_ntlmssp_authenticate:
 
 	if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
 		rc = -EIO;
-		cERROR(1, "bad word count %d", smb_buf->WordCount);
+		cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
 		goto ssetup_exit;
 	}
 	action = le16_to_cpu(pSMB->resp.Action);
 	if (action & GUEST_LOGIN)
-		cFYI(1, "Guest login"); /* BB mark SesInfo struct? */
+		cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
 	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
-	cFYI(1, "UID = %llu ", ses->Suid);
+	cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
 	/* response can have either 3 or 4 word count - Samba sends 3 */
 	/* and lanman response is 3 */
 	bytes_remaining = get_bcc(smb_buf);
@@ -908,7 +958,8 @@ ssetup_ntlmssp_authenticate:
 	if (smb_buf->WordCount == 4) {
 		blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
 		if (blob_len > bytes_remaining) {
-			cERROR(1, "bad security blob length %d", blob_len);
+			cifs_dbg(VFS, "bad security blob length %d\n",
+				 blob_len);
 			rc = -EINVAL;
 			goto ssetup_exit;
 		}
@@ -933,8 +984,7 @@ ssetup_ntlmssp_authenticate:
 		}
 		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
 	} else {
-		rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining,
-					 ses, nls_cp);
+		decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
 	}
 
 ssetup_exit:
@@ -946,7 +996,7 @@ ssetup_exit:
 	kfree(ntlmsspblob);
 	ntlmsspblob = NULL;
 	if (resp_buf_type == CIFS_SMALL_BUFFER) {
-		cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
+		cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base);
 		cifs_small_buf_release(iov[0].iov_base);
 	} else if (resp_buf_type == CIFS_LARGE_BUFFER)
 		cifs_buf_release(iov[0].iov_base);
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 47bc5a87f94..e813f04511d 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -61,10 +61,13 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf,
 	 */
 	--server->sequence_number;
 	rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
+	if (rc < 0)
+		server->sequence_number--;
+
 	mutex_unlock(&server->srv_mutex);
 
-	cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
-		in_buf->Mid, rc);
+	cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n",
+		 in_buf->Mid, rc);
 
 	return rc;
 }
@@ -249,7 +252,7 @@ check2ndT2(char *buf)
 	/* check for plausible wct, bcc and t2 data and parm sizes */
 	/* check for parm and data offset going beyond end of smb */
 	if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */
-		cFYI(1, "invalid transact2 word count");
+		cifs_dbg(FYI, "invalid transact2 word count\n");
 		return -EINVAL;
 	}
 
@@ -261,18 +264,18 @@ check2ndT2(char *buf)
 	if (total_data_size == data_in_this_rsp)
 		return 0;
 	else if (total_data_size < data_in_this_rsp) {
-		cFYI(1, "total data %d smaller than data in frame %d",
-			total_data_size, data_in_this_rsp);
+		cifs_dbg(FYI, "total data %d smaller than data in frame %d\n",
+			 total_data_size, data_in_this_rsp);
 		return -EINVAL;
 	}
 
 	remaining = total_data_size - data_in_this_rsp;
 
-	cFYI(1, "missing %d bytes from transact2, check next response",
-		remaining);
+	cifs_dbg(FYI, "missing %d bytes from transact2, check next response\n",
+		 remaining);
 	if (total_data_size > CIFSMaxBufSize) {
-		cERROR(1, "TotalDataSize %d is over maximum buffer %d",
-			total_data_size, CIFSMaxBufSize);
+		cifs_dbg(VFS, "TotalDataSize %d is over maximum buffer %d\n",
+			 total_data_size, CIFSMaxBufSize);
 		return -EINVAL;
 	}
 	return remaining;
@@ -293,28 +296,28 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
 	tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
 
 	if (tgt_total_cnt != src_total_cnt)
-		cFYI(1, "total data count of primary and secondary t2 differ "
-			"source=%hu target=%hu", src_total_cnt, tgt_total_cnt);
+		cifs_dbg(FYI, "total data count of primary and secondary t2 differ source=%hu target=%hu\n",
+			 src_total_cnt, tgt_total_cnt);
 
 	total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
 
 	remaining = tgt_total_cnt - total_in_tgt;
 
 	if (remaining < 0) {
-		cFYI(1, "Server sent too much data. tgt_total_cnt=%hu "
-			"total_in_tgt=%hu", tgt_total_cnt, total_in_tgt);
+		cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n",
+			 tgt_total_cnt, total_in_tgt);
 		return -EPROTO;
 	}
 
 	if (remaining == 0) {
 		/* nothing to do, ignore */
-		cFYI(1, "no more data remains");
+		cifs_dbg(FYI, "no more data remains\n");
 		return 0;
 	}
 
 	total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount);
 	if (remaining < total_in_src)
-		cFYI(1, "transact2 2nd response contains too much data");
+		cifs_dbg(FYI, "transact2 2nd response contains too much data\n");
 
 	/* find end of first SMB data area */
 	data_area_of_tgt = (char *)&pSMBt->hdr.Protocol +
@@ -329,7 +332,8 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
 	total_in_tgt += total_in_src;
 	/* is the result too big for the field? */
 	if (total_in_tgt > USHRT_MAX) {
-		cFYI(1, "coalesced DataCount too large (%u)", total_in_tgt);
+		cifs_dbg(FYI, "coalesced DataCount too large (%u)\n",
+			 total_in_tgt);
 		return -EPROTO;
 	}
 	put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount);
@@ -339,7 +343,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
 	byte_count += total_in_src;
 	/* is the result too big for the field? */
 	if (byte_count > USHRT_MAX) {
-		cFYI(1, "coalesced BCC too large (%u)", byte_count);
+		cifs_dbg(FYI, "coalesced BCC too large (%u)\n", byte_count);
 		return -EPROTO;
 	}
 	put_bcc(byte_count, target_hdr);
@@ -348,7 +352,8 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
 	byte_count += total_in_src;
 	/* don't allow buffer to overflow */
 	if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
-		cFYI(1, "coalesced BCC exceeds buffer size (%u)", byte_count);
+		cifs_dbg(FYI, "coalesced BCC exceeds buffer size (%u)\n",
+			 byte_count);
 		return -ENOBUFS;
 	}
 	target_hdr->smb_buf_length = cpu_to_be32(byte_count);
@@ -358,12 +363,12 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
 
 	if (remaining != total_in_src) {
 		/* more responses to go */
-		cFYI(1, "waiting for more secondary responses");
+		cifs_dbg(FYI, "waiting for more secondary responses\n");
 		return 1;
 	}
 
 	/* we are done */
-	cFYI(1, "found the last secondary response");
+	cifs_dbg(FYI, "found the last secondary response\n");
 	return 0;
 }
 
@@ -388,7 +393,7 @@ cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server,
 	}
 	if (!server->large_buf) {
 		/*FIXME: switch to already allocated largebuf?*/
-		cERROR(1, "1st trans2 resp needs bigbuf");
+		cifs_dbg(VFS, "1st trans2 resp needs bigbuf\n");
 	} else {
 		/* Have first buffer */
 		mid->resp_buf = buf;
@@ -444,8 +449,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
 	 * WRITEX header, not including the 4 byte RFC1001 length.
 	 */
 	if (!(server->capabilities & CAP_LARGE_WRITE_X) ||
-	    (!(server->capabilities & CAP_UNIX) &&
-	     (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED))))
+	    (!(server->capabilities & CAP_UNIX) && server->sign))
 		wsize = min_t(unsigned int, wsize,
 				server->maxBuf - sizeof(WRITE_REQ) + 4);
 
@@ -760,24 +764,17 @@ smb_set_file_info(struct inode *inode, const char *full_path,
 	}
 	tcon = tlink_tcon(tlink);
 
-	/*
-	 * NT4 apparently returns success on this call, but it doesn't really
-	 * work.
-	 */
-	if (!(tcon->ses->flags & CIFS_SES_NT4)) {
-		rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf,
-					cifs_sb->local_nls,
+	rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc == 0) {
-			cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
-			goto out;
-		} else if (rc != -EOPNOTSUPP && rc != -EINVAL)
-			goto out;
+	if (rc == 0) {
+		cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
+		goto out;
+	} else if (rc != -EOPNOTSUPP && rc != -EINVAL) {
+		goto out;
 	}
 
-	cFYI(1, "calling SetFileInfo since SetPathInfo for times not supported "
-		"by this server");
+	cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n");
 	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
 			 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
 			 &netfid, &oplock, NULL, cifs_sb->local_nls,
@@ -960,4 +957,6 @@ struct smb_version_values smb1_values = {
 	.cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,
 	.cap_large_files = CAP_LARGE_FILES,
 	.oplock_read = OPLOCK_READ,
+	.signing_enabled = SECMODE_SIGN_ENABLED,
+	.signing_required = SECMODE_SIGN_REQUIRED,
 };
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 71e6aed4b38..5da1b55a225 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -43,13 +43,13 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
 	if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
 		cinode->clientCanCacheAll = true;
 		cinode->clientCanCacheRead = true;
-		cFYI(1, "Exclusive Oplock granted on inode %p",
-		     &cinode->vfs_inode);
+		cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
+			 &cinode->vfs_inode);
 	} else if (oplock == SMB2_OPLOCK_LEVEL_II) {
 		cinode->clientCanCacheAll = false;
 		cinode->clientCanCacheRead = true;
-		cFYI(1, "Level II Oplock granted on inode %p",
-		    &cinode->vfs_inode);
+		cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
+			 &cinode->vfs_inode);
 	} else {
 		cinode->clientCanCacheAll = false;
 		cinode->clientCanCacheRead = false;
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 7c0e2143e77..c38350851b0 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -54,5 +54,7 @@
 #define SMB2_SIGNATURE_SIZE (16)
 #define SMB2_NTLMV2_SESSKEY_SIZE (16)
 #define SMB2_HMACSHA256_SIZE (32)
+#define SMB2_CMACAES_SIZE (16)
+#define SMB3_SIGNKEY_SIZE (16)
 
 #endif	/* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 706482452df..fff6dfba620 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -92,7 +92,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
 				      (FILE_BASIC_INFO *)data);
 		break;
 	default:
-		cERROR(1, "Invalid command");
+		cifs_dbg(VFS, "Invalid command\n");
 		break;
 	}
 
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 494c912c76f..7c2f45c06fc 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -2472,7 +2472,7 @@ map_smb2_to_linux_error(char *buf, bool log_err)
 
 	/* on error mapping not found  - return EIO */
 
-	cFYI(1, "Mapping SMB2 status code %d to POSIX err %d",
+	cifs_dbg(FYI, "Mapping SMB2 status code %d to POSIX err %d\n",
 		 smb2err, rc);
 
 	return rc;
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 7b1c5e3287f..b0c43345cd9 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -45,17 +45,17 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
 			if (hdr->Command == SMB2_OPLOCK_BREAK)
 				return 0;
 			else
-				cERROR(1, "Received Request not response");
+				cifs_dbg(VFS, "Received Request not response\n");
 		}
 	} else { /* bad signature or mid */
 		if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER)
-			cERROR(1, "Bad protocol string signature header %x",
-				  *(unsigned int *) hdr->ProtocolId);
+			cifs_dbg(VFS, "Bad protocol string signature header %x\n",
+				 *(unsigned int *) hdr->ProtocolId);
 		if (mid != hdr->MessageId)
-			cERROR(1, "Mids do not match: %llu and %llu", mid,
-				  hdr->MessageId);
+			cifs_dbg(VFS, "Mids do not match: %llu and %llu\n",
+				 mid, hdr->MessageId);
 	}
-	cERROR(1, "Bad SMB detected. The Mid=%llu", hdr->MessageId);
+	cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", hdr->MessageId);
 	return 1;
 }
 
@@ -101,7 +101,8 @@ smb2_check_message(char *buf, unsigned int length)
 	int command;
 
 	/* BB disable following printk later */
-	cFYI(1, "%s length: 0x%x, smb_buf_length: 0x%x", __func__, length, len);
+	cifs_dbg(FYI, "%s length: 0x%x, smb_buf_length: 0x%x\n",
+		 __func__, length, len);
 
 	/*
 	 * Add function to do table lookup of StructureSize by command
@@ -117,12 +118,13 @@ smb2_check_message(char *buf, unsigned int length)
 			 */
 			return 0;
 		} else {
-			cERROR(1, "Length less than SMB header size");
+			cifs_dbg(VFS, "Length less than SMB header size\n");
 		}
 		return 1;
 	}
 	if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - 4) {
-		cERROR(1, "SMB length greater than maximum, mid=%llu", mid);
+		cifs_dbg(VFS, "SMB length greater than maximum, mid=%llu\n",
+			 mid);
 		return 1;
 	}
 
@@ -130,14 +132,14 @@ smb2_check_message(char *buf, unsigned int length)
 		return 1;
 
 	if (hdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
-		cERROR(1, "Illegal structure size %u",
-			  le16_to_cpu(hdr->StructureSize));
+		cifs_dbg(VFS, "Illegal structure size %u\n",
+			 le16_to_cpu(hdr->StructureSize));
 		return 1;
 	}
 
 	command = le16_to_cpu(hdr->Command);
 	if (command >= NUMBER_OF_SMB2_COMMANDS) {
-		cERROR(1, "Illegal SMB2 command %d", command);
+		cifs_dbg(VFS, "Illegal SMB2 command %d\n", command);
 		return 1;
 	}
 
@@ -145,30 +147,30 @@ smb2_check_message(char *buf, unsigned int length)
 		if (command != SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0 ||
 		    pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) {
 			/* error packets have 9 byte structure size */
-			cERROR(1, "Illegal response size %u for command %d",
-				   le16_to_cpu(pdu->StructureSize2), command);
+			cifs_dbg(VFS, "Illegal response size %u for command %d\n",
+				 le16_to_cpu(pdu->StructureSize2), command);
 			return 1;
 		} else if (command == SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0)
 			   && (le16_to_cpu(pdu->StructureSize2) != 44)
 			   && (le16_to_cpu(pdu->StructureSize2) != 36)) {
 			/* special case for SMB2.1 lease break message */
-			cERROR(1, "Illegal response size %d for oplock break",
-				   le16_to_cpu(pdu->StructureSize2));
+			cifs_dbg(VFS, "Illegal response size %d for oplock break\n",
+				 le16_to_cpu(pdu->StructureSize2));
 			return 1;
 		}
 	}
 
 	if (4 + len != length) {
-		cERROR(1, "Total length %u RFC1002 length %u mismatch mid %llu",
-			  length, 4 + len, mid);
+		cifs_dbg(VFS, "Total length %u RFC1002 length %u mismatch mid %llu\n",
+			 length, 4 + len, mid);
 		return 1;
 	}
 
 	clc_len = smb2_calc_size(hdr);
 
 	if (4 + len != clc_len) {
-		cFYI(1, "Calculated size %u length %u mismatch mid %llu",
-			clc_len, 4 + len, mid);
+		cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n",
+			 clc_len, 4 + len, mid);
 		/* Windows 7 server returns 24 bytes more */
 		if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE)
 			return 0;
@@ -264,10 +266,14 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
 		  ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength);
 		break;
 	case SMB2_IOCTL:
+		*off = le32_to_cpu(
+		  ((struct smb2_ioctl_rsp *)hdr)->OutputOffset);
+		*len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount);
+		break;
 	case SMB2_CHANGE_NOTIFY:
 	default:
 		/* BB FIXME for unimplemented cases above */
-		cERROR(1, "no length check for command");
+		cifs_dbg(VFS, "no length check for command\n");
 		break;
 	}
 
@@ -276,20 +282,20 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
 	 * we have little choice but to ignore the data area in this case.
 	 */
 	if (*off > 4096) {
-		cERROR(1, "offset %d too large, data area ignored", *off);
+		cifs_dbg(VFS, "offset %d too large, data area ignored\n", *off);
 		*len = 0;
 		*off = 0;
 	} else if (*off < 0) {
-		cERROR(1, "negative offset %d to data invalid ignore data area",
-			  *off);
+		cifs_dbg(VFS, "negative offset %d to data invalid ignore data area\n",
+			 *off);
 		*off = 0;
 		*len = 0;
 	} else if (*len < 0) {
-		cERROR(1, "negative data length %d invalid, data area ignored",
-			  *len);
+		cifs_dbg(VFS, "negative data length %d invalid, data area ignored\n",
+			 *len);
 		*len = 0;
 	} else if (*len > 128 * 1024) {
-		cERROR(1, "data area larger than 128K: %d", *len);
+		cifs_dbg(VFS, "data area larger than 128K: %d\n", *len);
 		*len = 0;
 	}
 
@@ -324,7 +330,7 @@ smb2_calc_size(void *buf)
 		goto calc_size_exit;
 
 	smb2_get_data_area_len(&offset, &data_length, hdr);
-	cFYI(1, "SMB2 data length %d offset %d", data_length, offset);
+	cifs_dbg(FYI, "SMB2 data length %d offset %d\n", data_length, offset);
 
 	if (data_length > 0) {
 		/*
@@ -335,15 +341,15 @@ smb2_calc_size(void *buf)
 		 * the size of the RFC1001 hdr.
 		 */
 		if (offset + 4 + 1 < len) {
-			cERROR(1, "data area offset %d overlaps SMB2 header %d",
-				  offset + 4 + 1, len);
+			cifs_dbg(VFS, "data area offset %d overlaps SMB2 header %d\n",
+				 offset + 4 + 1, len);
 			data_length = 0;
 		} else {
 			len = 4 + offset + data_length;
 		}
 	}
 calc_size_exit:
-	cFYI(1, "SMB2 len %d", len);
+	cifs_dbg(FYI, "SMB2 len %d\n", len);
 	return len;
 }
 
@@ -405,7 +411,7 @@ cifs_ses_oplock_break(struct work_struct *work)
 
 	rc = SMB2_lease_break(0, tlink_tcon(lw->tlink), lw->lease_key,
 			      lw->lease_state);
-	cFYI(1, "Lease release rc %d", rc);
+	cifs_dbg(FYI, "Lease release rc %d\n", rc);
 	cifs_put_tlink(lw->tlink);
 	kfree(lw);
 }
@@ -426,15 +432,13 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
 				  SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED);
 
 	lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL);
-	if (!lw) {
-		cERROR(1, "Memory allocation failed during lease break check");
+	if (!lw)
 		return false;
-	}
 
 	INIT_WORK(&lw->lease_break, cifs_ses_oplock_break);
 	lw->lease_state = rsp->NewLeaseState;
 
-	cFYI(1, "Checking for lease break");
+	cifs_dbg(FYI, "Checking for lease break\n");
 
 	/* look up tcon based on tid & uid */
 	spin_lock(&cifs_tcp_ses_lock);
@@ -455,9 +459,9 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
 					   SMB2_LEASE_KEY_SIZE))
 					continue;
 
-				cFYI(1, "found in the open list");
-				cFYI(1, "lease key match, lease break 0x%d",
-				     le32_to_cpu(rsp->NewLeaseState));
+				cifs_dbg(FYI, "found in the open list\n");
+				cifs_dbg(FYI, "lease key match, lease break 0x%d\n",
+					 le32_to_cpu(rsp->NewLeaseState));
 
 				smb2_set_oplock_level(cinode,
 				  smb2_map_lease_to_oplock(rsp->NewLeaseState));
@@ -489,9 +493,9 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
 						   &lw->lease_break);
 				}
 
-				cFYI(1, "found in the pending open list");
-				cFYI(1, "lease key match, lease break 0x%d",
-				     le32_to_cpu(rsp->NewLeaseState));
+				cifs_dbg(FYI, "found in the pending open list\n");
+				cifs_dbg(FYI, "lease key match, lease break 0x%d\n",
+					 le32_to_cpu(rsp->NewLeaseState));
 
 				open->oplock =
 				  smb2_map_lease_to_oplock(rsp->NewLeaseState);
@@ -506,7 +510,7 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
 	}
 	spin_unlock(&cifs_tcp_ses_lock);
 	kfree(lw);
-	cFYI(1, "Can not process lease break - no lease matched");
+	cifs_dbg(FYI, "Can not process lease break - no lease matched\n");
 	return false;
 }
 
@@ -520,7 +524,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 	struct cifsInodeInfo *cinode;
 	struct cifsFileInfo *cfile;
 
-	cFYI(1, "Checking for oplock break");
+	cifs_dbg(FYI, "Checking for oplock break\n");
 
 	if (rsp->hdr.Command != SMB2_OPLOCK_BREAK)
 		return false;
@@ -533,7 +537,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 			return false;
 	}
 
-	cFYI(1, "oplock level 0x%d", rsp->OplockLevel);
+	cifs_dbg(FYI, "oplock level 0x%d\n", rsp->OplockLevel);
 
 	/* look up tcon based on tid & uid */
 	spin_lock(&cifs_tcp_ses_lock);
@@ -553,7 +557,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 				    cfile->fid.volatile_fid)
 					continue;
 
-				cFYI(1, "file id match, oplock break");
+				cifs_dbg(FYI, "file id match, oplock break\n");
 				cinode = CIFS_I(cfile->dentry->d_inode);
 
 				if (!cinode->clientCanCacheAll &&
@@ -573,11 +577,11 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 			}
 			spin_unlock(&cifs_file_list_lock);
 			spin_unlock(&cifs_tcp_ses_lock);
-			cFYI(1, "No matching file for oplock break");
+			cifs_dbg(FYI, "No matching file for oplock break\n");
 			return true;
 		}
 	}
 	spin_unlock(&cifs_tcp_ses_lock);
-	cFYI(1, "Can not process oplock break for non-existent connection");
+	cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n");
 	return false;
 }
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index c9c7aa7ed96..6d15cab95b9 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -38,13 +38,13 @@ change_conf(struct TCP_Server_Info *server)
 	case 1:
 		server->echoes = false;
 		server->oplocks = false;
-		cERROR(1, "disabling echoes and oplocks");
+		cifs_dbg(VFS, "disabling echoes and oplocks\n");
 		break;
 	case 2:
 		server->echoes = true;
 		server->oplocks = false;
 		server->echo_credits = 1;
-		cFYI(1, "disabling oplocks");
+		cifs_dbg(FYI, "disabling oplocks\n");
 		break;
 	default:
 		server->echoes = true;
@@ -147,10 +147,10 @@ smb2_dump_detail(void *buf)
 #ifdef CONFIG_CIFS_DEBUG2
 	struct smb2_hdr *smb = (struct smb2_hdr *)buf;
 
-	cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d",
-		  smb->Command, smb->Status, smb->Flags, smb->MessageId,
-		  smb->ProcessId);
-	cERROR(1, "smb buf %p len %u", smb, smb2_calc_size(smb));
+	cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
+		 smb->Command, smb->Status, smb->Flags, smb->MessageId,
+		 smb->ProcessId);
+	cifs_dbg(VFS, "smb buf %p len %u\n", smb, smb2_calc_size(smb));
 #endif
 }
 
@@ -281,6 +281,25 @@ smb2_clear_stats(struct cifs_tcon *tcon)
 }
 
 static void
+smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon)
+{
+	seq_puts(m, "\n\tShare Capabilities:");
+	if (tcon->capabilities & SMB2_SHARE_CAP_DFS)
+		seq_puts(m, " DFS,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
+		seq_puts(m, " CONTINUOUS AVAILABILITY,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_SCALEOUT)
+		seq_puts(m, " SCALEOUT,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER)
+		seq_puts(m, " CLUSTER,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_ASYMMETRIC)
+		seq_puts(m, " ASYMMETRIC,");
+	if (tcon->capabilities == 0)
+		seq_puts(m, " None");
+	seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags);
+}
+
+static void
 smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
 {
 #ifdef CONFIG_CIFS_STATS
@@ -292,7 +311,6 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
 	seq_printf(m, "\nSessionSetups: %d sent %d failed",
 		   atomic_read(&sent[SMB2_SESSION_SETUP_HE]),
 		   atomic_read(&failed[SMB2_SESSION_SETUP_HE]));
-#define SMB2LOGOFF		0x0002 /* trivial request/resp */
 	seq_printf(m, "\nLogoffs: %d sent %d failed",
 		   atomic_read(&sent[SMB2_LOGOFF_HE]),
 		   atomic_read(&failed[SMB2_LOGOFF_HE]));
@@ -436,7 +454,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 		       &oplock, NULL);
 	kfree(utf16_path);
 	if (rc) {
-		cERROR(1, "open dir failed");
+		cifs_dbg(VFS, "open dir failed\n");
 		return rc;
 	}
 
@@ -448,7 +466,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 	rc = SMB2_query_directory(xid, tcon, persistent_fid, volatile_fid, 0,
 				  srch_inf);
 	if (rc) {
-		cERROR(1, "query directory failed");
+		cifs_dbg(VFS, "query directory failed\n");
 		SMB2_close(xid, tcon, persistent_fid, volatile_fid);
 	}
 	return rc;
@@ -645,6 +663,7 @@ struct smb_version_operations smb30_operations = {
 	.dump_detail = smb2_dump_detail,
 	.clear_stats = smb2_clear_stats,
 	.print_stats = smb2_print_stats,
+	.dump_share_caps = smb2_dump_share_caps,
 	.is_oplock_break = smb2_is_valid_oplock_break,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
@@ -690,6 +709,7 @@ struct smb_version_operations smb30_operations = {
 	.get_lease_key = smb2_get_lease_key,
 	.set_lease_key = smb2_set_lease_key,
 	.new_lease_key = smb2_new_lease_key,
+	.generate_signingkey = generate_smb3signingkey,
 	.calc_signature = smb3_calc_signature,
 };
 
@@ -709,6 +729,8 @@ struct smb_version_values smb20_values = {
 	.cap_nt_find = SMB2_NT_FIND,
 	.cap_large_files = SMB2_LARGE_FILES,
 	.oplock_read = SMB2_OPLOCK_LEVEL_II,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
 };
 
 struct smb_version_values smb21_values = {
@@ -727,6 +749,8 @@ struct smb_version_values smb21_values = {
 	.cap_nt_find = SMB2_NT_FIND,
 	.cap_large_files = SMB2_LARGE_FILES,
 	.oplock_read = SMB2_OPLOCK_LEVEL_II,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
 };
 
 struct smb_version_values smb30_values = {
@@ -744,4 +768,27 @@ struct smb_version_values smb30_values = {
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
 	.cap_large_files = SMB2_LARGE_FILES,
+	.oplock_read = SMB2_OPLOCK_LEVEL_II,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+};
+
+struct smb_version_values smb302_values = {
+	.version_string = SMB302_VERSION_STRING,
+	.protocol_id = SMB302_PROT_ID,
+	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+	.large_lock_type = 0,
+	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+	.header_size = sizeof(struct smb2_hdr),
+	.max_header_size = MAX_SMB2_HDR_SIZE,
+	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.lock_cmd = SMB2_LOCK,
+	.cap_unix = 0,
+	.cap_nt_find = SMB2_NT_FIND,
+	.cap_large_files = SMB2_LARGE_FILES,
+	.oplock_read = SMB2_OPLOCK_LEVEL_II,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
 };
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 41d9d0725f0..2b312e4eeaa 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/smb2pdu.c
  *
- *   Copyright (C) International Business Machines  Corp., 2009, 2012
+ *   Copyright (C) International Business Machines  Corp., 2009, 2013
  *                 Etersoft, 2012
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *              Pavel Shilovsky (pshilovsky@samba.org) 2012
@@ -108,19 +108,33 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
 	if (!tcon)
 		goto out;
 
+	/* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */
+	/* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */
+	/* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */
+	if ((tcon->ses) &&
+	    (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+		hdr->CreditCharge = cpu_to_le16(1);
+	/* else CreditCharge MBZ */
+
 	hdr->TreeId = tcon->tid;
 	/* Uid is not converted */
 	if (tcon->ses)
 		hdr->SessionId = tcon->ses->Suid;
-	/* BB check following DFS flags BB */
-	/* BB do we have to add check for SHI1005_FLAGS_DFS_ROOT too? */
-	if (tcon->share_flags & SHI1005_FLAGS_DFS)
-		hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS;
-	/* BB how does SMB2 do case sensitive? */
-	/* if (tcon->nocase)
-		hdr->Flags |= SMBFLG_CASELESS; */
-	if (tcon->ses && tcon->ses->server &&
-	    (tcon->ses->server->sec_mode & SECMODE_SIGN_REQUIRED))
+
+	/*
+	 * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have
+	 * to pass the path on the Open SMB prefixed by \\server\share.
+	 * Not sure when we would need to do the augmented path (if ever) and
+	 * setting this flag breaks the SMB2 open operation since it is
+	 * illegal to send an empty path name (without \\server\share prefix)
+	 * when the DFS flag is set in the SMB open header. We could
+	 * consider setting the flag on all operations other than open
+	 * but it is safer to net set it for now.
+	 */
+/*	if (tcon->share_flags & SHI1005_FLAGS_DFS)
+		hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */
+
+	if (tcon->ses && tcon->ses->server && tcon->ses->server->sign)
 		hdr->Flags |= SMB2_FLAGS_SIGNED;
 out:
 	pdu->StructureSize2 = cpu_to_le16(parmsize);
@@ -155,8 +169,8 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
 		if ((smb2_command != SMB2_WRITE) &&
 		   (smb2_command != SMB2_CREATE) &&
 		   (smb2_command != SMB2_TREE_DISCONNECT)) {
-			cFYI(1, "can not send cmd %d while umounting",
-				smb2_command);
+			cifs_dbg(FYI, "can not send cmd %d while umounting\n",
+				 smb2_command);
 			return -ENODEV;
 		}
 	}
@@ -200,7 +214,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
 		 * back on-line
 		 */
 		if (!tcon->retry) {
-			cFYI(1, "gave up waiting on reconnect in smb_init");
+			cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
 			return -EHOSTDOWN;
 		}
 	}
@@ -227,7 +241,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
 	cifs_mark_open_files_invalid(tcon);
 	rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage);
 	mutex_unlock(&tcon->ses->session_mutex);
-	cFYI(1, "reconnect tcon rc = %d", rc);
+	cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
 	if (rc)
 		goto out;
 	atomic_inc(&tconInfoReconnectCount);
@@ -328,34 +342,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	struct kvec iov[1];
 	int rc = 0;
 	int resp_buftype;
-	struct TCP_Server_Info *server;
-	unsigned int sec_flags;
-	u16 temp = 0;
+	struct TCP_Server_Info *server = ses->server;
 	int blob_offset, blob_length;
 	char *security_blob;
 	int flags = CIFS_NEG_OP;
 
-	cFYI(1, "Negotiate protocol");
+	cifs_dbg(FYI, "Negotiate protocol\n");
 
-	if (ses->server)
-		server = ses->server;
-	else {
-		rc = -EIO;
-		return rc;
+	if (!server) {
+		WARN(1, "%s: server is NULL!\n", __func__);
+		return -EIO;
 	}
 
 	rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req);
 	if (rc)
 		return rc;
 
-	/* if any of auth flags (ie not sign or seal) are overriden use them */
-	if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
-		sec_flags = ses->overrideSecFlg;  /* BB FIXME fix sign flags?*/
-	else /* if override flags set only sign/seal OR them with global auth */
-		sec_flags = global_secflags | ses->overrideSecFlg;
-
-	cFYI(1, "sec_flags 0x%x", sec_flags);
-
 	req->hdr.SessionId = 0;
 
 	req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
@@ -364,12 +366,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	inc_rfc1001_len(req, 2);
 
 	/* only one of SMB2 signing flags may be set in SMB2 request */
-	if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN)
-		temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
-	else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */
-		temp = SMB2_NEGOTIATE_SIGNING_ENABLED;
-
-	req->SecurityMode = cpu_to_le16(temp);
+	if (ses->sign)
+		req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
+	else if (global_secflags & CIFSSEC_MAY_SIGN)
+		req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
+	else
+		req->SecurityMode = 0;
 
 	req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities);
 
@@ -389,24 +391,28 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	if (rc != 0)
 		goto neg_exit;
 
-	cFYI(1, "mode 0x%x", rsp->SecurityMode);
+	cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode);
 
 	/* BB we may eventually want to match the negotiated vs. requested
 	   dialect, even though we are only requesting one at a time */
 	if (rsp->DialectRevision == cpu_to_le16(SMB20_PROT_ID))
-		cFYI(1, "negotiated smb2.0 dialect");
+		cifs_dbg(FYI, "negotiated smb2.0 dialect\n");
 	else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID))
-		cFYI(1, "negotiated smb2.1 dialect");
+		cifs_dbg(FYI, "negotiated smb2.1 dialect\n");
 	else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID))
-		cFYI(1, "negotiated smb3.0 dialect");
+		cifs_dbg(FYI, "negotiated smb3.0 dialect\n");
+	else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID))
+		cifs_dbg(FYI, "negotiated smb3.02 dialect\n");
 	else {
-		cERROR(1, "Illegal dialect returned by server %d",
-			   le16_to_cpu(rsp->DialectRevision));
+		cifs_dbg(VFS, "Illegal dialect returned by server %d\n",
+			 le16_to_cpu(rsp->DialectRevision));
 		rc = -EIO;
 		goto neg_exit;
 	}
 	server->dialect = le16_to_cpu(rsp->DialectRevision);
 
+	/* SMB2 only has an extended negflavor */
+	server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
 	server->maxBuf = le32_to_cpu(rsp->MaxTransactSize);
 	server->max_read = le32_to_cpu(rsp->MaxReadSize);
 	server->max_write = le32_to_cpu(rsp->MaxWriteSize);
@@ -418,45 +424,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 
 	security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
 					       &rsp->hdr);
-	if (blob_length == 0) {
-		cERROR(1, "missing security blob on negprot");
-		rc = -EIO;
-		goto neg_exit;
-	}
-
-	cFYI(1, "sec_flags 0x%x", sec_flags);
-	if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
-		cFYI(1, "Signing required");
-		if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED |
-		      SMB2_NEGOTIATE_SIGNING_ENABLED))) {
-			cERROR(1, "signing required but server lacks support");
-			rc = -EOPNOTSUPP;
-			goto neg_exit;
-		}
-		server->sec_mode |= SECMODE_SIGN_REQUIRED;
-	} else if (sec_flags & CIFSSEC_MAY_SIGN) {
-		cFYI(1, "Signing optional");
-		if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) {
-			cFYI(1, "Server requires signing");
-			server->sec_mode |= SECMODE_SIGN_REQUIRED;
-		} else {
-			server->sec_mode &=
-				~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
-		}
-	} else {
-		cFYI(1, "Signing disabled");
-		if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) {
-			cERROR(1, "Server requires packet signing to be enabled"
-				  " in /proc/fs/cifs/SecurityFlags.");
-			rc = -EOPNOTSUPP;
-			goto neg_exit;
-		}
-		server->sec_mode &=
-			~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
-	}
+	/*
+	 * See MS-SMB2 section 2.2.4: if no blob, client picks default which
+	 * for us will be
+	 *	ses->sectype = RawNTLMSSP;
+	 * but for time being this is our only auth choice so doesn't matter.
+	 * We just found a server which sets blob length to zero expecting raw.
+	 */
+	if (blob_length == 0)
+		cifs_dbg(FYI, "missing security blob on negprot\n");
 
+	rc = cifs_enable_signing(server, ses->sign);
 #ifdef CONFIG_SMB2_ASN1  /* BB REMOVEME when updated asn1.c ready */
-	rc = decode_neg_token_init(security_blob, blob_length,
+	if (rc)
+		goto neg_exit;
+	if (blob_length)
+		rc = decode_neg_token_init(security_blob, blob_length,
 				   &server->sec_type);
 	if (rc == 1)
 		rc = 0;
@@ -481,21 +464,17 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
 	int rc = 0;
 	int resp_buftype;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
-	struct TCP_Server_Info *server;
-	unsigned int sec_flags;
-	u8 temp = 0;
+	struct TCP_Server_Info *server = ses->server;
 	u16 blob_length = 0;
 	char *security_blob;
 	char *ntlmssp_blob = NULL;
 	bool use_spnego = false; /* else use raw ntlmssp */
 
-	cFYI(1, "Session Setup");
+	cifs_dbg(FYI, "Session Setup\n");
 
-	if (ses->server)
-		server = ses->server;
-	else {
-		rc = -EIO;
-		return rc;
+	if (!server) {
+		WARN(1, "%s: server is NULL!\n", __func__);
+		return -EIO;
 	}
 
 	/*
@@ -506,7 +485,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
 	if (!ses->ntlmssp)
 		return -ENOMEM;
 
-	ses->server->secType = RawNTLMSSP;
+	/* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */
+	ses->sectype = RawNTLMSSP;
 
 ssetup_ntlmssp_authenticate:
 	if (phase == NtLmChallenge)
@@ -516,28 +496,19 @@ ssetup_ntlmssp_authenticate:
 	if (rc)
 		return rc;
 
-	/* if any of auth flags (ie not sign or seal) are overriden use them */
-	if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
-		sec_flags = ses->overrideSecFlg;  /* BB FIXME fix sign flags?*/
-	else /* if override flags set only sign/seal OR them with global auth */
-		sec_flags = global_secflags | ses->overrideSecFlg;
-
-	cFYI(1, "sec_flags 0x%x", sec_flags);
-
 	req->hdr.SessionId = 0; /* First session, not a reauthenticate */
 	req->VcNumber = 0; /* MBZ */
 	/* to enable echos and oplocks */
 	req->hdr.CreditRequest = cpu_to_le16(3);
 
 	/* only one of SMB2 signing flags may be set in SMB2 request */
-	if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN)
-		temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
-	else if (ses->server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED)
-		temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
-	else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */
-		temp = SMB2_NEGOTIATE_SIGNING_ENABLED;
-
-	req->SecurityMode = temp;
+	if (server->sign)
+		req->SecurityMode = SMB2_NEGOTIATE_SIGNING_REQUIRED;
+	else if (global_secflags & CIFSSEC_MAY_SIGN) /* one flag unlike MUST_ */
+		req->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED;
+	else
+		req->SecurityMode = 0;
+
 	req->Capabilities = 0;
 	req->Channel = 0; /* MBZ */
 
@@ -558,7 +529,7 @@ ssetup_ntlmssp_authenticate:
 					sizeof(struct _NEGOTIATE_MESSAGE),
 					ntlmssp_blob); */
 			/* BB eventually need to add this */
-			cERROR(1, "spnego not supported for SMB2 yet");
+			cifs_dbg(VFS, "spnego not supported for SMB2 yet\n");
 			rc = -EOPNOTSUPP;
 			kfree(ntlmssp_blob);
 			goto ssetup_exit;
@@ -572,14 +543,14 @@ ssetup_ntlmssp_authenticate:
 		ntlmssp_blob = kzalloc(sizeof(struct _NEGOTIATE_MESSAGE) + 500,
 				       GFP_KERNEL);
 		if (ntlmssp_blob == NULL) {
-			cERROR(1, "failed to malloc ntlmssp blob");
 			rc = -ENOMEM;
 			goto ssetup_exit;
 		}
 		rc = build_ntlmssp_auth_blob(ntlmssp_blob, &blob_length, ses,
 					     nls_cp);
 		if (rc) {
-			cFYI(1, "build_ntlmssp_auth_blob failed %d", rc);
+			cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n",
+				 rc);
 			goto ssetup_exit; /* BB double check error handling */
 		}
 		if (use_spnego) {
@@ -587,7 +558,7 @@ ssetup_ntlmssp_authenticate:
 							&security_blob,
 							blob_length,
 							ntlmssp_blob); */
-			cERROR(1, "spnego not supported for SMB2 yet");
+			cifs_dbg(VFS, "spnego not supported for SMB2 yet\n");
 			rc = -EOPNOTSUPP;
 			kfree(ntlmssp_blob);
 			goto ssetup_exit;
@@ -595,7 +566,7 @@ ssetup_ntlmssp_authenticate:
 			security_blob = ntlmssp_blob;
 		}
 	} else {
-		cERROR(1, "illegal ntlmssp phase");
+		cifs_dbg(VFS, "illegal ntlmssp phase\n");
 		rc = -EIO;
 		goto ssetup_exit;
 	}
@@ -620,13 +591,13 @@ ssetup_ntlmssp_authenticate:
 	if (resp_buftype != CIFS_NO_BUFFER &&
 	    rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) {
 		if (phase != NtLmNegotiate) {
-			cERROR(1, "Unexpected more processing error");
+			cifs_dbg(VFS, "Unexpected more processing error\n");
 			goto ssetup_exit;
 		}
 		if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 !=
 				le16_to_cpu(rsp->SecurityBufferOffset)) {
-			cERROR(1, "Invalid security buffer offset %d",
-				  le16_to_cpu(rsp->SecurityBufferOffset));
+			cifs_dbg(VFS, "Invalid security buffer offset %d\n",
+				 le16_to_cpu(rsp->SecurityBufferOffset));
 			rc = -EIO;
 			goto ssetup_exit;
 		}
@@ -667,7 +638,7 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
 	int rc = 0;
 	struct TCP_Server_Info *server;
 
-	cFYI(1, "disconnect session %p", ses);
+	cifs_dbg(FYI, "disconnect session %p\n", ses);
 
 	if (ses && (ses->server))
 		server = ses->server;
@@ -680,7 +651,7 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
 
 	 /* since no tcon, smb2_init can not do this, so do here */
 	req->hdr.SessionId = ses->Suid;
-	if (server->sec_mode & SECMODE_SIGN_REQUIRED)
+	if (server->sign)
 		req->hdr.Flags |= SMB2_FLAGS_SIGNED;
 
 	rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0);
@@ -711,7 +682,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	struct TCP_Server_Info *server;
 	__le16 *unc_path = NULL;
 
-	cFYI(1, "TCON");
+	cifs_dbg(FYI, "TCON\n");
 
 	if ((ses->server) && tree)
 		server = ses->server;
@@ -775,29 +746,30 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	}
 
 	if (rsp->ShareType & SMB2_SHARE_TYPE_DISK)
-		cFYI(1, "connection to disk share");
+		cifs_dbg(FYI, "connection to disk share\n");
 	else if (rsp->ShareType & SMB2_SHARE_TYPE_PIPE) {
 		tcon->ipc = true;
-		cFYI(1, "connection to pipe share");
+		cifs_dbg(FYI, "connection to pipe share\n");
 	} else if (rsp->ShareType & SMB2_SHARE_TYPE_PRINT) {
 		tcon->print = true;
-		cFYI(1, "connection to printer");
+		cifs_dbg(FYI, "connection to printer\n");
 	} else {
-		cERROR(1, "unknown share type %d", rsp->ShareType);
+		cifs_dbg(VFS, "unknown share type %d\n", rsp->ShareType);
 		rc = -EOPNOTSUPP;
 		goto tcon_error_exit;
 	}
 
 	tcon->share_flags = le32_to_cpu(rsp->ShareFlags);
+	tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */
 	tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
 	tcon->tidStatus = CifsGood;
 	tcon->need_reconnect = false;
 	tcon->tid = rsp->hdr.TreeId;
-	strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
+	strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
 
 	if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
 	    ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
-		cERROR(1, "DFS capability contradicts DFS flag");
+		cifs_dbg(VFS, "DFS capability contradicts DFS flag\n");
 
 tcon_exit:
 	free_rsp_buf(resp_buftype, rsp);
@@ -806,7 +778,7 @@ tcon_exit:
 
 tcon_error_exit:
 	if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) {
-		cERROR(1, "BAD_NETWORK_NAME: %s", tree);
+		cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
 		tcon->bad_network_name = true;
 	}
 	goto tcon_exit;
@@ -820,7 +792,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
 	struct TCP_Server_Info *server;
 	struct cifs_ses *ses = tcon->ses;
 
-	cFYI(1, "Tree Disconnect");
+	cifs_dbg(FYI, "Tree Disconnect\n");
 
 	if (ses && (ses->server))
 		server = ses->server;
@@ -846,12 +818,10 @@ create_lease_buf(u8 *lease_key, u8 oplock)
 {
 	struct create_lease *buf;
 
-	buf = kmalloc(sizeof(struct create_lease), GFP_KERNEL);
+	buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL);
 	if (!buf)
 		return NULL;
 
-	memset(buf, 0, sizeof(struct create_lease));
-
 	buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key));
 	buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8)));
 	if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE)
@@ -925,7 +895,7 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
 	int rc = 0;
 	int num_iovecs = 2;
 
-	cFYI(1, "create/open");
+	cifs_dbg(FYI, "create/open\n");
 
 	if (ses && (ses->server))
 		server = ses->server;
@@ -1039,6 +1009,122 @@ creat_exit:
 	return rc;
 }
 
+/*
+ *	SMB2 IOCTL is used for both IOCTLs and FSCTLs
+ */
+int
+SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+	   u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data,
+	   u32 indatalen, char **out_data, u32 *plen /* returned data len */)
+{
+	struct smb2_ioctl_req *req;
+	struct smb2_ioctl_rsp *rsp;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses = tcon->ses;
+	struct kvec iov[2];
+	int resp_buftype;
+	int num_iovecs;
+	int rc = 0;
+
+	cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+	/* zero out returned data len, in case of error */
+	if (plen)
+		*plen = 0;
+
+	if (ses && (ses->server))
+		server = ses->server;
+	else
+		return -EIO;
+
+	rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req);
+	if (rc)
+		return rc;
+
+	req->CtlCode = cpu_to_le32(opcode);
+	req->PersistentFileId = persistent_fid;
+	req->VolatileFileId = volatile_fid;
+
+	if (indatalen) {
+		req->InputCount = cpu_to_le32(indatalen);
+		/* do not set InputOffset if no input data */
+		req->InputOffset =
+		       cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4);
+		iov[1].iov_base = in_data;
+		iov[1].iov_len = indatalen;
+		num_iovecs = 2;
+	} else
+		num_iovecs = 1;
+
+	req->OutputOffset = 0;
+	req->OutputCount = 0; /* MBZ */
+
+	/*
+	 * Could increase MaxOutputResponse, but that would require more
+	 * than one credit. Windows typically sets this smaller, but for some
+	 * ioctls it may be useful to allow server to send more. No point
+	 * limiting what the server can send as long as fits in one credit
+	 */
+	req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */
+
+	if (is_fsctl)
+		req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
+	else
+		req->Flags = 0;
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for rfc1002 length field */
+	iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+	if (indatalen)
+		inc_rfc1001_len(req, indatalen);
+
+	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
+	rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
+
+	if (rc != 0) {
+		if (tcon)
+			cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+		goto ioctl_exit;
+	}
+
+	/* check if caller wants to look at return data or just return rc */
+	if ((plen == NULL) || (out_data == NULL))
+		goto ioctl_exit;
+
+	*plen = le32_to_cpu(rsp->OutputCount);
+
+	/* We check for obvious errors in the output buffer length and offset */
+	if (*plen == 0)
+		goto ioctl_exit; /* server returned no data */
+	else if (*plen > 0xFF00) {
+		cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen);
+		*plen = 0;
+		rc = -EIO;
+		goto ioctl_exit;
+	}
+
+	if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) {
+		cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen,
+			le32_to_cpu(rsp->OutputOffset));
+		*plen = 0;
+		rc = -EIO;
+		goto ioctl_exit;
+	}
+
+	*out_data = kmalloc(*plen, GFP_KERNEL);
+	if (*out_data == NULL) {
+		rc = -ENOMEM;
+		goto ioctl_exit;
+	}
+
+	memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
+	       *plen);
+ioctl_exit:
+	free_rsp_buf(resp_buftype, rsp);
+	return rc;
+}
+
 int
 SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 	   u64 persistent_fid, u64 volatile_fid)
@@ -1051,7 +1137,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 	int resp_buftype;
 	int rc = 0;
 
-	cFYI(1, "Close");
+	cifs_dbg(FYI, "Close\n");
 
 	if (ses && (ses->server))
 		server = ses->server;
@@ -1097,20 +1183,20 @@ validate_buf(unsigned int offset, unsigned int buffer_length,
 
 
 	if (buffer_length < min_buf_size) {
-		cERROR(1, "buffer length %d smaller than minimum size %d",
-			   buffer_length, min_buf_size);
+		cifs_dbg(VFS, "buffer length %d smaller than minimum size %d\n",
+			 buffer_length, min_buf_size);
 		return -EINVAL;
 	}
 
 	/* check if beyond RFC1001 maximum length */
 	if ((smb_len > 0x7FFFFF) || (buffer_length > 0x7FFFFF)) {
-		cERROR(1, "buffer length %d or smb length %d too large",
-			   buffer_length, smb_len);
+		cifs_dbg(VFS, "buffer length %d or smb length %d too large\n",
+			 buffer_length, smb_len);
 		return -EINVAL;
 	}
 
 	if ((begin_of_buf > end_of_smb) || (end_of_buf > end_of_smb)) {
-		cERROR(1, "illegal server response, bad offset to data");
+		cifs_dbg(VFS, "illegal server response, bad offset to data\n");
 		return -EINVAL;
 	}
 
@@ -1155,7 +1241,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
 	struct TCP_Server_Info *server;
 	struct cifs_ses *ses = tcon->ses;
 
-	cFYI(1, "Query Info");
+	cifs_dbg(FYI, "Query Info\n");
 
 	if (ses && (ses->server))
 		server = ses->server;
@@ -1247,7 +1333,7 @@ SMB2_echo(struct TCP_Server_Info *server)
 	struct smb_rqst rqst = { .rq_iov = &iov,
 				 .rq_nvec = 1 };
 
-	cFYI(1, "In echo request");
+	cifs_dbg(FYI, "In echo request\n");
 
 	rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req);
 	if (rc)
@@ -1262,7 +1348,7 @@ SMB2_echo(struct TCP_Server_Info *server)
 	rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, server,
 			     CIFS_ECHO_OP);
 	if (rc)
-		cFYI(1, "Echo request failed: %d", rc);
+		cifs_dbg(FYI, "Echo request failed: %d\n", rc);
 
 	cifs_small_buf_release(req);
 	return rc;
@@ -1279,7 +1365,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	int resp_buftype;
 	int rc = 0;
 
-	cFYI(1, "Flush");
+	cifs_dbg(FYI, "Flush\n");
 
 	if (ses && (ses->server))
 		server = ses->server;
@@ -1379,21 +1465,21 @@ smb2_readv_callback(struct mid_q_entry *mid)
 				 .rq_pagesz = rdata->pagesz,
 				 .rq_tailsz = rdata->tailsz };
 
-	cFYI(1, "%s: mid=%llu state=%d result=%d bytes=%u", __func__,
-		mid->mid, mid->mid_state, rdata->result, rdata->bytes);
+	cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
+		 __func__, mid->mid, mid->mid_state, rdata->result,
+		 rdata->bytes);
 
 	switch (mid->mid_state) {
 	case MID_RESPONSE_RECEIVED:
 		credits_received = le16_to_cpu(buf->CreditRequest);
 		/* result already set, check signature */
-		if (server->sec_mode &
-		    (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+		if (server->sign) {
 			int rc;
 
 			rc = smb2_verify_signature(&rqst, server);
 			if (rc)
-				cERROR(1, "SMB signature verification returned "
-				       "error = %d", rc);
+				cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+					 rc);
 		}
 		/* FIXME: should this be counted toward the initiating task? */
 		task_io_account_read(rdata->bytes);
@@ -1426,8 +1512,8 @@ smb2_async_readv(struct cifs_readdata *rdata)
 	struct smb_rqst rqst = { .rq_iov = &rdata->iov,
 				 .rq_nvec = 1 };
 
-	cFYI(1, "%s: offset=%llu bytes=%u", __func__,
-		rdata->offset, rdata->bytes);
+	cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
+		 __func__, rdata->offset, rdata->bytes);
 
 	io_parms.tcon = tlink_tcon(rdata->cfile->tlink);
 	io_parms.offset = rdata->offset;
@@ -1481,13 +1567,13 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
 
 	if (rc) {
 		cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
-		cERROR(1, "Send error in read = %d", rc);
+		cifs_dbg(VFS, "Send error in read = %d\n", rc);
 	} else {
 		*nbytes = le32_to_cpu(rsp->DataLength);
 		if ((*nbytes > CIFS_MAX_MSGSIZE) ||
 		    (*nbytes > io_parms->length)) {
-			cFYI(1, "bad length %d for count %d", *nbytes,
-				io_parms->length);
+			cifs_dbg(FYI, "bad length %d for count %d\n",
+				 *nbytes, io_parms->length);
 			rc = -EIO;
 			*nbytes = 0;
 		}
@@ -1597,7 +1683,8 @@ smb2_async_writev(struct cifs_writedata *wdata)
 	rqst.rq_pagesz = wdata->pagesz;
 	rqst.rq_tailsz = wdata->tailsz;
 
-	cFYI(1, "async write at %llu %u bytes", wdata->offset, wdata->bytes);
+	cifs_dbg(FYI, "async write at %llu %u bytes\n",
+		 wdata->offset, wdata->bytes);
 
 	req->Length = cpu_to_le32(wdata->bytes);
 
@@ -1670,7 +1757,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
 
 	if (rc) {
 		cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE);
-		cERROR(1, "Send error in write = %d", rc);
+		cifs_dbg(VFS, "Send error in write = %d\n", rc);
 	} else
 		*nbytes = le32_to_cpu(rsp->DataLength);
 
@@ -1696,14 +1783,14 @@ num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size)
 					((char *)entryptr + next_offset);
 
 		if ((char *)entryptr + size > end_of_buf) {
-			cERROR(1, "malformed search entry would overflow");
+			cifs_dbg(VFS, "malformed search entry would overflow\n");
 			break;
 		}
 
 		len = le32_to_cpu(entryptr->FileNameLength);
 		if ((char *)entryptr + len + size > end_of_buf) {
-			cERROR(1, "directory entry name would overflow frame "
-				  "end of buf %p", end_of_buf);
+			cifs_dbg(VFS, "directory entry name would overflow frame end of buf %p\n",
+				 end_of_buf);
 			break;
 		}
 
@@ -1759,8 +1846,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 		info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1;
 		break;
 	default:
-		cERROR(1, "info level %u isn't supported",
-		       srch_inf->info_level);
+		cifs_dbg(VFS, "info level %u isn't supported\n",
+			 srch_inf->info_level);
 		rc = -EINVAL;
 		goto qdir_exit;
 	}
@@ -1824,15 +1911,15 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 			num_entries(srch_inf->srch_entries_start, end_of_smb,
 				    &srch_inf->last_entry, info_buf_size);
 	srch_inf->index_of_last_entry += srch_inf->entries_in_buffer;
-	cFYI(1, "num entries %d last_index %lld srch start %p srch end %p",
-		srch_inf->entries_in_buffer, srch_inf->index_of_last_entry,
-		srch_inf->srch_entries_start, srch_inf->last_entry);
+	cifs_dbg(FYI, "num entries %d last_index %lld srch start %p srch end %p\n",
+		 srch_inf->entries_in_buffer, srch_inf->index_of_last_entry,
+		 srch_inf->srch_entries_start, srch_inf->last_entry);
 	if (resp_buftype == CIFS_LARGE_BUFFER)
 		srch_inf->smallBuf = false;
 	else if (resp_buftype == CIFS_SMALL_BUFFER)
 		srch_inf->smallBuf = true;
 	else
-		cERROR(1, "illegal search buffer type");
+		cifs_dbg(VFS, "illegal search buffer type\n");
 
 	if (rsp->hdr.Status == STATUS_NO_MORE_FILES)
 		srch_inf->endOfSearch = 1;
@@ -2017,7 +2104,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc;
 	struct smb2_oplock_break *req = NULL;
 
-	cFYI(1, "SMB2_oplock_break");
+	cifs_dbg(FYI, "SMB2_oplock_break\n");
 	rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req);
 
 	if (rc)
@@ -2033,7 +2120,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
 
 	if (rc) {
 		cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
-		cFYI(1, "Send error in Oplock Break = %d", rc);
+		cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc);
 	}
 
 	return rc;
@@ -2058,7 +2145,7 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level,
 	int rc;
 	struct smb2_query_info_req *req;
 
-	cFYI(1, "Query FSInfo level %d", level);
+	cifs_dbg(FYI, "Query FSInfo level %d\n", level);
 
 	if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
 		return -EIO;
@@ -2131,7 +2218,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 	int resp_buf_type;
 	unsigned int count;
 
-	cFYI(1, "smb2_lockv num lock %d", num_lock);
+	cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock);
 
 	rc = small_smb2_init(SMB2_LOCK, tcon, (void **) &req);
 	if (rc)
@@ -2155,7 +2242,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
 	rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
 	if (rc) {
-		cFYI(1, "Send error in smb2_lockv = %d", rc);
+		cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc);
 		cifs_stats_fail_inc(tcon, SMB2_LOCK_HE);
 	}
 
@@ -2186,7 +2273,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc;
 	struct smb2_lease_ack *req = NULL;
 
-	cFYI(1, "SMB2_lease_break");
+	cifs_dbg(FYI, "SMB2_lease_break\n");
 	rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req);
 
 	if (rc)
@@ -2204,7 +2291,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
 
 	if (rc) {
 		cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
-		cFYI(1, "Send error in Lease Break = %d", rc);
+		cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc);
 	}
 
 	return rc;
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 4cb4ced258c..f31043b26bd 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/smb2pdu.h
  *
- *   Copyright (c) International Business Machines  Corp., 2009, 2010
+ *   Copyright (c) International Business Machines  Corp., 2009, 2013
  *                 Etersoft, 2012
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *              Pavel Shilovsky (pshilovsky@samba.org) 2012
@@ -170,6 +170,7 @@ struct smb2_negotiate_req {
 #define SMB20_PROT_ID 0x0202
 #define SMB21_PROT_ID 0x0210
 #define SMB30_PROT_ID 0x0300
+#define SMB302_PROT_ID 0x0302
 #define BAD_PROT_ID   0xFFFF
 
 /* SecurityMode flags */
@@ -283,10 +284,17 @@ struct smb2_tree_connect_rsp {
 #define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING		0x00000400
 #define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM	0x00000800
 #define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK		0x00001000
-#define SHI1005_FLAGS_ENABLE_HASH			0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V1			0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V2			0x00004000
+#define SHI1005_FLAGS_ENCRYPT_DATA			0x00008000
+#define SHI1005_FLAGS_ALL				0x0000FF33
 
 /* Possible share capabilities */
-#define SMB2_SHARE_CAP_DFS	cpu_to_le32(0x00000008)
+#define SMB2_SHARE_CAP_DFS	cpu_to_le32(0x00000008) /* all dialects */
+#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
+#define SMB2_SHARE_CAP_SCALEOUT	cpu_to_le32(0x00000020) /* 3.0 */
+#define SMB2_SHARE_CAP_CLUSTER	cpu_to_le32(0x00000040) /* 3.0 */
+#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
 
 struct smb2_tree_disconnect_req {
 	struct smb2_hdr hdr;
@@ -477,6 +485,75 @@ struct create_lease {
 	struct lease_context lcontext;
 } __packed;
 
+/* this goes in the ioctl buffer when doing a copychunk request */
+struct copychunk_ioctl {
+	char SourceKey[24];
+	__le32 ChunkCount; /* we are only sending 1 */
+	__le32 Reserved;
+	/* array will only be one chunk long for us */
+	__le64 SourceOffset;
+	__le64 TargetOffset;
+	__le32 Length; /* how many bytes to copy */
+	__u32 Reserved2;
+} __packed;
+
+/* Response and Request are the same format */
+struct validate_negotiate_info {
+	__le32 Capabilities;
+	__u8   Guid[SMB2_CLIENT_GUID_SIZE];
+	__le16 SecurityMode;
+	__le16 DialectCount;
+	__le16 Dialect[1];
+} __packed;
+
+#define RSS_CAPABLE	0x00000001
+#define RDMA_CAPABLE	0x00000002
+
+struct network_interface_info_ioctl_rsp {
+	__le32 Next; /* next interface. zero if this is last one */
+	__le32 IfIndex;
+	__le32 Capability; /* RSS or RDMA Capable */
+	__le32 Reserved;
+	__le64 LinkSpeed;
+	char	SockAddr_Storage[128];
+} __packed;
+
+#define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */
+
+struct smb2_ioctl_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 57 */
+	__u16 Reserved;
+	__le32 CtlCode;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	__le32 InputOffset;
+	__le32 InputCount;
+	__le32 MaxInputResponse;
+	__le32 OutputOffset;
+	__le32 OutputCount;
+	__le32 MaxOutputResponse;
+	__le32 Flags;
+	__u32  Reserved2;
+	char   Buffer[0];
+} __packed;
+
+struct smb2_ioctl_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 57 */
+	__u16 Reserved;
+	__le32 CtlCode;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	__le32 InputOffset;
+	__le32 InputCount;
+	__le32 OutputOffset;
+	__le32 OutputCount;
+	__le32 Flags;
+	__u32  Reserved2;
+	/* char * buffer[] */
+} __packed;
+
 /* Currently defined values for close flags */
 #define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB	cpu_to_le16(0x0001)
 struct smb2_close_req {
@@ -517,17 +594,25 @@ struct smb2_flush_rsp {
 	__le16 Reserved;
 } __packed;
 
+/* For read request Flags field below, following flag is defined for SMB3.02 */
+#define SMB2_READFLAG_READ_UNBUFFERED	0x01
+
+/* Channel field for read and write: exactly one of following flags can be set*/
+#define SMB2_CHANNEL_NONE		0x00000000
+#define SMB2_CHANNEL_RDMA_V1		0x00000001 /* SMB3 or later */
+#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000001 /* SMB3.02 or later */
+
 struct smb2_read_req {
 	struct smb2_hdr hdr;
 	__le16 StructureSize; /* Must be 49 */
 	__u8   Padding; /* offset from start of SMB2 header to place read */
-	__u8   Reserved;
+	__u8   Flags; /* MBZ unless SMB3.02 or later */
 	__le32 Length;
 	__le64 Offset;
 	__u64  PersistentFileId; /* opaque endianness */
 	__u64  VolatileFileId; /* opaque endianness */
 	__le32 MinimumCount;
-	__le32 Channel; /* Reserved MBZ */
+	__le32 Channel; /* MBZ except for SMB3 or later */
 	__le32 RemainingBytes;
 	__le16 ReadChannelInfoOffset; /* Reserved MBZ */
 	__le16 ReadChannelInfoLength; /* Reserved MBZ */
@@ -545,8 +630,9 @@ struct smb2_read_rsp {
 	__u8   Buffer[1];
 } __packed;
 
-/* For write request Flags field below the following flag is defined: */
-#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001
+/* For write request Flags field below the following flags are defined: */
+#define SMB2_WRITEFLAG_WRITE_THROUGH	0x00000001	/* SMB2.1 or later */
+#define SMB2_WRITEFLAG_WRITE_UNBUFFERED	0x00000002	/* SMB3.02 or later */
 
 struct smb2_write_req {
 	struct smb2_hdr hdr;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 2aa3535e38c..d4e1eb80745 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -111,6 +111,10 @@ extern int SMB2_open(const unsigned int xid, struct cifs_tcon *tcon,
 		     __u32 desired_access, __u32 create_disposition,
 		     __u32 file_attributes, __u32 create_options,
 		     __u8 *oplock, struct smb2_file_all_info *buf);
+extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
+		     u64 persistent_fid, u64 volatile_fid, u32 opcode,
+		     bool is_fsctl, char *in_data, u32 indatalen,
+		     char **out_data, u32 *plen /* returned data len */);
 extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 		      u64 persistent_file_id, u64 volatile_file_id);
 extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 8dd73e61d76..09b4fbaadeb 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -55,13 +55,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	rc = crypto_shash_setkey(server->secmech.hmacsha256,
 		server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
 	if (rc) {
-		cERROR(1, "%s: Could not update with response\n", __func__);
+		cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
 		return rc;
 	}
 
 	rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
 	if (rc) {
-		cERROR(1, "%s: Could not init md5\n", __func__);
+		cifs_dbg(VFS, "%s: Could not init md5\n", __func__);
 		return rc;
 	}
 
@@ -69,7 +69,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 		if (iov[i].iov_len == 0)
 			continue;
 		if (iov[i].iov_base == NULL) {
-			cERROR(1, "null iovec entry");
+			cifs_dbg(VFS, "null iovec entry\n");
 			return -EIO;
 		}
 		/*
@@ -90,8 +90,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 				iov[i].iov_base, iov[i].iov_len);
 		}
 		if (rc) {
-			cERROR(1, "%s: Could not update with payload\n",
-							__func__);
+			cifs_dbg(VFS, "%s: Could not update with payload\n",
+				 __func__);
 			return rc;
 		}
 	}
@@ -109,18 +109,162 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
 				sigptr);
 	if (rc)
-		cERROR(1, "%s: Could not generate sha256 hash\n", __func__);
+		cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
 
 	memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
 
 	return rc;
 }
 
+void
+generate_smb3signingkey(struct TCP_Server_Info *server)
+{
+	unsigned char zero = 0x0;
+	__u8 i[4] = {0, 0, 0, 1};
+	__u8 L[4] = {0, 0, 0, 128};
+	int rc = 0;
+	unsigned char prfhash[SMB2_HMACSHA256_SIZE];
+	unsigned char *hashptr = prfhash;
+
+	memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
+	memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
+
+	rc = crypto_shash_setkey(server->secmech.hmacsha256,
+		server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not set with session key\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+				i, 4);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with n\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+				"SMB2AESCMAC", 12);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with label\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+				&zero, 1);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with zero\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+				"SmbSign", 8);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with context\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+				L, 4);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not update with L\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
+				hashptr);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
+		goto smb3signkey_ret;
+	}
+
+	memcpy(server->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE);
+
+smb3signkey_ret:
+	return;
+}
+
 int
 smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 {
-	cFYI(1, "smb3 signatures not supported yet");
-	return -EOPNOTSUPP;
+	int i, rc;
+	unsigned char smb3_signature[SMB2_CMACAES_SIZE];
+	unsigned char *sigptr = smb3_signature;
+	struct kvec *iov = rqst->rq_iov;
+	int n_vec = rqst->rq_nvec;
+	struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
+
+	memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
+	memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
+
+	rc = crypto_shash_setkey(server->secmech.cmacaes,
+		server->smb3signingkey, SMB2_CMACAES_SIZE);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
+		return rc;
+	}
+
+	rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
+		return rc;
+	}
+
+	for (i = 0; i < n_vec; i++) {
+		if (iov[i].iov_len == 0)
+			continue;
+		if (iov[i].iov_base == NULL) {
+			cifs_dbg(VFS, "null iovec entry");
+			return -EIO;
+		}
+		/*
+		 * The first entry includes a length field (which does not get
+		 * signed that occupies the first 4 bytes before the header).
+		 */
+		if (i == 0) {
+			if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
+				break; /* nothing to sign or corrupt header */
+			rc =
+			crypto_shash_update(
+				&server->secmech.sdesccmacaes->shash,
+				iov[i].iov_base + 4, iov[i].iov_len - 4);
+		} else {
+			rc =
+			crypto_shash_update(
+				&server->secmech.sdesccmacaes->shash,
+				iov[i].iov_base, iov[i].iov_len);
+		}
+		if (rc) {
+			cifs_dbg(VFS, "%s: Couldn't update cmac aes with payload\n",
+							__func__);
+			return rc;
+		}
+	}
+
+	/* now hash over the rq_pages array */
+	for (i = 0; i < rqst->rq_npages; i++) {
+		struct kvec p_iov;
+
+		cifs_rqst_page_to_kvec(rqst, i, &p_iov);
+		crypto_shash_update(&server->secmech.sdesccmacaes->shash,
+					p_iov.iov_base, p_iov.iov_len);
+		kunmap(rqst->rq_pages[i]);
+	}
+
+	rc = crypto_shash_final(&server->secmech.sdesccmacaes->shash,
+						sigptr);
+	if (rc)
+		cifs_dbg(VFS, "%s: Could not generate cmac aes\n", __func__);
+
+	memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+
+	return rc;
 }
 
 /* must be called with server->srv_mutex held */
@@ -163,8 +307,8 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 
 	/* Do not need to verify session setups with signature "BSRSPYL " */
 	if (memcmp(smb2_pdu->Signature, "BSRSPYL ", 8) == 0)
-		cFYI(1, "dummy signature received for smb command 0x%x",
-			smb2_pdu->Command);
+		cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n",
+			 smb2_pdu->Command);
 
 	/*
 	 * Save off the origiginal signature so we can modify the smb and check
@@ -205,7 +349,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer,
 	struct mid_q_entry *temp;
 
 	if (server == NULL) {
-		cERROR(1, "Null TCP session in smb2_mid_entry_alloc");
+		cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n");
 		return NULL;
 	}
 
@@ -241,7 +385,7 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf,
 		return -ENOENT;
 
 	if (ses->server->tcpStatus == CifsNeedReconnect) {
-		cFYI(1, "tcp session dead - return to caller to retry");
+		cifs_dbg(FYI, "tcp session dead - return to caller to retry\n");
 		return -EAGAIN;
 	}
 
@@ -275,14 +419,13 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
 
 	dump_smb(mid->resp_buf, min_t(u32, 80, len));
 	/* convert the length into a more usable form */
-	if ((len > 24) &&
-	    (server->sec_mode & (SECMODE_SIGN_REQUIRED|SECMODE_SIGN_ENABLED))) {
+	if (len > 24 && server->sign) {
 		int rc;
 
 		rc = smb2_verify_signature(&rqst, server);
 		if (rc)
-			cERROR(1, "SMB signature verification returned error = "
-			       "%d", rc);
+			cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+				 rc);
 	}
 
 	return map_smb2_to_linux_error(mid->resp_buf, log_error);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index a0a58fbe2c1..43eb1367b10 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -78,7 +78,7 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
 	tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(tfm_des)) {
 		rc = PTR_ERR(tfm_des);
-		cERROR(1, "could not allocate des crypto API");
+		cifs_dbg(VFS, "could not allocate des crypto API\n");
 		goto smbhash_err;
 	}
 
@@ -91,7 +91,7 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
 
 	rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8);
 	if (rc)
-		cERROR(1, "could not encrypt crypt key rc: %d", rc);
+		cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc);
 
 	crypto_free_blkcipher(tfm_des);
 smbhash_err:
@@ -139,14 +139,14 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
 	md4 = crypto_alloc_shash("md4", 0, 0);
 	if (IS_ERR(md4)) {
 		rc = PTR_ERR(md4);
-		cERROR(1, "%s: Crypto md4 allocation error %d", __func__, rc);
+		cifs_dbg(VFS, "%s: Crypto md4 allocation error %d\n",
+			 __func__, rc);
 		return rc;
 	}
 	size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
 	sdescmd4 = kmalloc(size, GFP_KERNEL);
 	if (!sdescmd4) {
 		rc = -ENOMEM;
-		cERROR(1, "%s: Memory allocation failure", __func__);
 		goto mdfour_err;
 	}
 	sdescmd4->shash.tfm = md4;
@@ -154,17 +154,17 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
 
 	rc = crypto_shash_init(&sdescmd4->shash);
 	if (rc) {
-		cERROR(1, "%s: Could not init md4 shash", __func__);
+		cifs_dbg(VFS, "%s: Could not init md4 shash\n", __func__);
 		goto mdfour_err;
 	}
 	rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len);
 	if (rc) {
-		cERROR(1, "%s: Could not update with link_str", __func__);
+		cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__);
 		goto mdfour_err;
 	}
 	rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
 	if (rc)
-		cERROR(1, "%s: Could not genereate md4 hash", __func__);
+		cifs_dbg(VFS, "%s: Could not generate md4 hash\n", __func__);
 
 mdfour_err:
 	crypto_free_shash(md4);
@@ -238,7 +238,8 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24,
 
 	rc = E_md4hash(passwd, p16, codepage);
 	if (rc) {
-		cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
+		cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n",
+			 __func__, rc);
 		return rc;
 	}
 	memcpy(p21, p16, 16);
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 7056b891e08..d952ee48f4d 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
  *
- *   Copyright (c) International Business Machines  Corp., 2002,2009
+ *   Copyright (c) International Business Machines  Corp., 2002,2013
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -22,7 +22,7 @@
 /* IOCTL information */
 /*
  * List of ioctl/fsctl function codes that are or could be useful in the
- * future to remote clients like cifs or SMB2 client.  There is probably
+ * future to remote clients like cifs or SMB2/SMB3 client.  This is probably
  * a slightly larger set of fsctls that NTFS local filesystem could handle,
  * including the seven below that we do not have struct definitions for.
  * Even with protocol definitions for most of these now available, we still
@@ -30,7 +30,13 @@
  * remotely.  Some of the following, such as the encryption/compression ones
  * could be invoked from tools via a specialized hook into the VFS rather
  * than via the standard vfs entry points
+ *
+ * See MS-SMB2 Section 2.2.31 (last checked June 2013, all of that list are
+ * below). Additional detail on less common ones can be found in MS-FSCC
+ * section 2.3.
  */
+#define FSCTL_DFS_GET_REFERRALS      0x00060194
+#define FSCTL_DFS_GET_REFERRALS_EX   0x000601B0
 #define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
 #define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
 #define FSCTL_REQUEST_BATCH_OPLOCK   0x00090008
@@ -71,14 +77,31 @@
 #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
 #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
 #define FSCTL_SET_DEFECT_MANAGEMENT  0x00098134 /* BB add struct */
+#define FSCTL_FILE_LEVEL_TRIM        0x00098208 /* BB add struct */
 #define FSCTL_SIS_LINK_FILES         0x0009C104
 #define FSCTL_PIPE_PEEK              0x0011400C /* BB add struct */
 #define FSCTL_PIPE_TRANSCEIVE        0x0011C017 /* BB add struct */
 /* strange that the number for this op is not sequential with previous op */
 #define FSCTL_PIPE_WAIT              0x00110018 /* BB add struct */
+/* Enumerate previous versions of a file */
+#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
+/* Retrieve an opaque file reference for server-side data movement ie copy */
+#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
+#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
 #define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
 #define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
+#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* BB add struct */
+/* Perform server-side data movement */
+#define FSCTL_SRV_COPYCHUNK 0x001440F2
+#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2
+#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */
+#define FSCTL_SRV_READ_HASH          0x001441BB /* BB add struct */
 
 #define IO_REPARSE_TAG_MOUNT_POINT   0xA0000003
 #define IO_REPARSE_TAG_HSM           0xC0000004
 #define IO_REPARSE_TAG_SIS           0x80000007
+
+/* fsctl flags */
+/* If Flags is set to this value, the request is an FSCTL not ioctl request */
+#define SMB2_0_IOCTL_IS_FSCTL		0x00000001
+
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 1a528680ec5..6fdcb1b4a10 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -49,7 +49,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 	struct mid_q_entry *temp;
 
 	if (server == NULL) {
-		cERROR(1, "Null TCP session in AllocMidQEntry");
+		cifs_dbg(VFS, "Null TCP session in AllocMidQEntry\n");
 		return NULL;
 	}
 
@@ -61,7 +61,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 		temp->mid = smb_buffer->Mid;	/* always LE */
 		temp->pid = current->pid;
 		temp->command = cpu_to_le16(smb_buffer->Command);
-		cFYI(1, "For smb_command %d", smb_buffer->Command);
+		cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
 	/*	do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
 		/* when mid allocated can be before when sent */
 		temp->when_alloc = jiffies;
@@ -179,17 +179,11 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,
 		 */
 		rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec],
 				    n_vec - first_vec, remaining);
-		if (rc == -ENOSPC || rc == -EAGAIN) {
-			/*
-			 * Catch if a low level driver returns -ENOSPC. This
-			 * WARN_ON will be removed by 3.10 if no one reports
-			 * seeing this.
-			 */
-			WARN_ON_ONCE(rc == -ENOSPC);
+		if (rc == -EAGAIN) {
 			i++;
 			if (i >= 14 || (!server->noblocksnd && (i > 2))) {
-				cERROR(1, "sends on sock %p stuck for 15 "
-					  "seconds", ssocket);
+				cifs_dbg(VFS, "sends on sock %p stuck for 15 seconds\n",
+					 ssocket);
 				rc = -EAGAIN;
 				break;
 			}
@@ -209,14 +203,14 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,
 		}
 
 		if (rc > remaining) {
-			cERROR(1, "sent %d requested %d", rc, remaining);
+			cifs_dbg(VFS, "sent %d requested %d\n", rc, remaining);
 			break;
 		}
 
 		if (rc == 0) {
 			/* should never happen, letting socket clear before
 			   retrying is our only obvious option here */
-			cERROR(1, "tcp sent no data");
+			cifs_dbg(VFS, "tcp sent no data\n");
 			msleep(500);
 			continue;
 		}
@@ -291,7 +285,7 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
 	if (ssocket == NULL)
 		return -ENOTSOCK;
 
-	cFYI(1, "Sending smb: smb_len=%u", smb_buf_length);
+	cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length);
 	dump_smb(iov[0].iov_base, iov[0].iov_len);
 
 	/* cork the socket */
@@ -324,8 +318,8 @@ uncork:
 				(char *)&val, sizeof(val));
 
 	if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
-		cFYI(1, "partial send (wanted=%u sent=%zu): terminating "
-			"session", smb_buf_length + 4, total_len);
+		cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n",
+			 smb_buf_length + 4, total_len);
 		/*
 		 * If we have only sent part of an SMB then the next SMB could
 		 * be taken as the remainder of this one. We need to kill the
@@ -335,7 +329,8 @@ uncork:
 	}
 
 	if (rc < 0 && rc != -EINTR)
-		cERROR(1, "Error %d sending data on socket to server", rc);
+		cifs_dbg(VFS, "Error %d sending data on socket to server\n",
+			 rc);
 	else
 		rc = 0;
 
@@ -427,7 +422,7 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
 	}
 
 	if (ses->server->tcpStatus == CifsNeedReconnect) {
-		cFYI(1, "tcp session dead - return to caller to retry");
+		cifs_dbg(FYI, "tcp session dead - return to caller to retry\n");
 		return -EAGAIN;
 	}
 
@@ -452,7 +447,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
 {
 	int error;
 
-	error = wait_event_freezekillable(server->response_q,
+	error = wait_event_freezekillable_unsafe(server->response_q,
 				    midQ->mid_state != MID_REQUEST_SUBMITTED);
 	if (error < 0)
 		return -ERESTARTSYS;
@@ -468,7 +463,7 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
 	struct mid_q_entry *mid;
 
 	/* enable signing if server requires it */
-	if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+	if (server->sign)
 		hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 
 	mid = AllocMidQEntry(hdr, server);
@@ -527,6 +522,9 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
 	rc = smb_send_rqst(server, rqst);
 	cifs_in_send_dec(server);
 	cifs_save_when_sent(mid);
+
+	if (rc < 0)
+		server->sequence_number -= 2;
 	mutex_unlock(&server->srv_mutex);
 
 	if (rc == 0)
@@ -559,7 +557,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
 	iov[0].iov_len = get_rfc1002_length(in_buf) + 4;
 	flags |= CIFS_NO_RESP;
 	rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
-	cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);
+	cifs_dbg(NOISY, "SendRcvNoRsp flags %d rc %d\n", flags, rc);
 
 	return rc;
 }
@@ -569,8 +567,8 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
 {
 	int rc = 0;
 
-	cFYI(1, "%s: cmd=%d mid=%llu state=%d", __func__,
-	     le16_to_cpu(mid->command), mid->mid, mid->mid_state);
+	cifs_dbg(FYI, "%s: cmd=%d mid=%llu state=%d\n",
+		 __func__, le16_to_cpu(mid->command), mid->mid, mid->mid_state);
 
 	spin_lock(&GlobalMid_Lock);
 	switch (mid->mid_state) {
@@ -588,8 +586,8 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
 		break;
 	default:
 		list_del_init(&mid->qhead);
-		cERROR(1, "%s: invalid mid state mid=%llu state=%d", __func__,
-		       mid->mid, mid->mid_state);
+		cifs_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n",
+			 __func__, mid->mid, mid->mid_state);
 		rc = -EIO;
 	}
 	spin_unlock(&GlobalMid_Lock);
@@ -614,7 +612,7 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
 	dump_smb(mid->resp_buf, min_t(u32, 92, len));
 
 	/* convert the length into a more usable form */
-	if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+	if (server->sign) {
 		struct kvec iov;
 		int rc = 0;
 		struct smb_rqst rqst = { .rq_iov = &iov,
@@ -624,10 +622,10 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
 		iov.iov_len = len;
 		/* FIXME: add code to kill session */
 		rc = cifs_verify_signature(&rqst, server,
-					   mid->sequence_number + 1);
+					   mid->sequence_number);
 		if (rc)
-			cERROR(1, "SMB signature verification returned error = "
-			       "%d", rc);
+			cifs_dbg(VFS, "SMB signature verification returned error = %d\n",
+				 rc);
 	}
 
 	/* BB special case reconnect tid and uid here? */
@@ -672,7 +670,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 
 	if ((ses == NULL) || (ses->server == NULL)) {
 		cifs_small_buf_release(buf);
-		cERROR(1, "Null session");
+		cifs_dbg(VFS, "Null session\n");
 		return -EIO;
 	}
 
@@ -716,6 +714,8 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 	cifs_in_send_dec(ses->server);
 	cifs_save_when_sent(midQ);
 
+	if (rc < 0)
+		ses->server->sequence_number -= 2;
 	mutex_unlock(&ses->server->srv_mutex);
 
 	if (rc < 0) {
@@ -752,7 +752,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 
 	if (!midQ->resp_buf || midQ->mid_state != MID_RESPONSE_RECEIVED) {
 		rc = -EIO;
-		cFYI(1, "Bad MID state?");
+		cifs_dbg(FYI, "Bad MID state?\n");
 		goto out;
 	}
 
@@ -788,11 +788,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 	struct mid_q_entry *midQ;
 
 	if (ses == NULL) {
-		cERROR(1, "Null smb session");
+		cifs_dbg(VFS, "Null smb session\n");
 		return -EIO;
 	}
 	if (ses->server == NULL) {
-		cERROR(1, "Null tcp session");
+		cifs_dbg(VFS, "Null tcp session\n");
 		return -EIO;
 	}
 
@@ -805,8 +805,8 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 
 	if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
 			MAX_CIFS_HDR_SIZE - 4) {
-		cERROR(1, "Illegal length, greater than maximum frame, %d",
-			   be32_to_cpu(in_buf->smb_buf_length));
+		cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n",
+			 be32_to_cpu(in_buf->smb_buf_length));
 		return -EIO;
 	}
 
@@ -840,6 +840,10 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 	rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
 	cifs_in_send_dec(ses->server);
 	cifs_save_when_sent(midQ);
+
+	if (rc < 0)
+		ses->server->sequence_number -= 2;
+
 	mutex_unlock(&ses->server->srv_mutex);
 
 	if (rc < 0)
@@ -871,7 +875,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 	if (!midQ->resp_buf || !out_buf ||
 	    midQ->mid_state != MID_RESPONSE_RECEIVED) {
 		rc = -EIO;
-		cERROR(1, "Bad MID state?");
+		cifs_dbg(VFS, "Bad MID state?\n");
 		goto out;
 	}
 
@@ -921,13 +925,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
 	struct cifs_ses *ses;
 
 	if (tcon == NULL || tcon->ses == NULL) {
-		cERROR(1, "Null smb session");
+		cifs_dbg(VFS, "Null smb session\n");
 		return -EIO;
 	}
 	ses = tcon->ses;
 
 	if (ses->server == NULL) {
-		cERROR(1, "Null tcp session");
+		cifs_dbg(VFS, "Null tcp session\n");
 		return -EIO;
 	}
 
@@ -940,8 +944,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
 
 	if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
 			MAX_CIFS_HDR_SIZE - 4) {
-		cERROR(1, "Illegal length, greater than maximum frame, %d",
-			   be32_to_cpu(in_buf->smb_buf_length));
+		cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n",
+			 be32_to_cpu(in_buf->smb_buf_length));
 		return -EIO;
 	}
 
@@ -973,6 +977,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
 	rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
 	cifs_in_send_dec(ses->server);
 	cifs_save_when_sent(midQ);
+
+	if (rc < 0)
+		ses->server->sequence_number -= 2;
+
 	mutex_unlock(&ses->server->srv_mutex);
 
 	if (rc < 0) {
@@ -1038,7 +1046,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
 	/* rcvd frame is ok */
 	if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) {
 		rc = -EIO;
-		cERROR(1, "Bad MID state?");
+		cifs_dbg(VFS, "Bad MID state?\n");
 		goto out;
 	}
 
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 5142f2c6027..09afda4cc58 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -68,12 +68,12 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 		goto remove_ea_exit;
 	}
 	if (ea_name == NULL) {
-		cFYI(1, "Null xattr names not supported");
+		cifs_dbg(FYI, "Null xattr names not supported\n");
 	} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
 		&& (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))) {
-		cFYI(1,
-		     "illegal xattr request %s (only user namespace supported)",
-		     ea_name);
+		cifs_dbg(FYI,
+			 "illegal xattr request %s (only user namespace supported)\n",
+			 ea_name);
 		/* BB what if no namespace prefix? */
 		/* Should we just pass them to server, except for
 		system and perhaps security prefixes? */
@@ -134,19 +134,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 		search server for EAs or streams to
 		returns as xattrs */
 	if (value_size > MAX_EA_VALUE_SIZE) {
-		cFYI(1, "size of EA value too large");
+		cifs_dbg(FYI, "size of EA value too large\n");
 		rc = -EOPNOTSUPP;
 		goto set_ea_exit;
 	}
 
 	if (ea_name == NULL) {
-		cFYI(1, "Null xattr names not supported");
+		cifs_dbg(FYI, "Null xattr names not supported\n");
 	} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
 		   == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto set_ea_exit;
 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
-			cFYI(1, "attempt to set cifs inode metadata");
+			cifs_dbg(FYI, "attempt to set cifs inode metadata\n");
 
 		ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
 		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
@@ -167,8 +167,6 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 		struct cifs_ntsd *pacl;
 		pacl = kmalloc(value_size, GFP_KERNEL);
 		if (!pacl) {
-			cFYI(1, "%s: Can't allocate memory for ACL",
-					__func__);
 			rc = -ENOMEM;
 		} else {
 			memcpy(pacl, ea_value, value_size);
@@ -179,7 +177,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 			kfree(pacl);
 		}
 #else
-			cFYI(1, "Set CIFS ACL not supported yet");
+		cifs_dbg(FYI, "Set CIFS ACL not supported yet\n");
 #endif /* CONFIG_CIFS_ACL */
 	} else {
 		int temp;
@@ -193,9 +191,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 					ACL_TYPE_ACCESS, cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			cFYI(1, "set POSIX ACL rc %d", rc);
+			cifs_dbg(FYI, "set POSIX ACL rc %d\n", rc);
 #else
-			cFYI(1, "set POSIX ACL not supported");
+			cifs_dbg(FYI, "set POSIX ACL not supported\n");
 #endif
 		} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
 				   strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -206,13 +204,13 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 					ACL_TYPE_DEFAULT, cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			cFYI(1, "set POSIX default ACL rc %d", rc);
+			cifs_dbg(FYI, "set POSIX default ACL rc %d\n", rc);
 #else
-			cFYI(1, "set default POSIX ACL not supported");
+			cifs_dbg(FYI, "set default POSIX ACL not supported\n");
 #endif
 		} else {
-			cFYI(1, "illegal xattr request %s (only user namespace"
-				" supported)", ea_name);
+			cifs_dbg(FYI, "illegal xattr request %s (only user namespace supported)\n",
+				 ea_name);
 		  /* BB what if no namespace prefix? */
 		  /* Should we just pass them to server, except for
 		  system and perhaps security prefixes? */
@@ -263,14 +261,14 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 	/* return dos attributes as pseudo xattr */
 	/* return alt name if available as pseudo attr */
 	if (ea_name == NULL) {
-		cFYI(1, "Null xattr names not supported");
+		cifs_dbg(FYI, "Null xattr names not supported\n");
 	} else if (strncmp(ea_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)
 		   == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto get_ea_exit;
 
 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) {
-			cFYI(1, "attempt to query cifs inode metadata");
+			cifs_dbg(FYI, "attempt to query cifs inode metadata\n");
 			/* revalidate/getattr then populate from inode */
 		} /* BB add else when above is implemented */
 		ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
@@ -295,7 +293,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 #else
-		cFYI(1, "Query POSIX ACL not supported yet");
+		cifs_dbg(FYI, "Query POSIX ACL not supported yet\n");
 #endif /* CONFIG_CIFS_POSIX */
 	} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
 			  strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -307,7 +305,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 #else
-		cFYI(1, "Query POSIX default ACL not supported yet");
+		cifs_dbg(FYI, "Query POSIX default ACL not supported yet\n");
 #endif /* CONFIG_CIFS_POSIX */
 	} else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
 				strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
@@ -319,8 +317,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 						full_path, &acllen);
 			if (IS_ERR(pacl)) {
 				rc = PTR_ERR(pacl);
-				cERROR(1, "%s: error %zd getting sec desc",
-						__func__, rc);
+				cifs_dbg(VFS, "%s: error %zd getting sec desc\n",
+					 __func__, rc);
 			} else {
 				if (ea_value) {
 					if (acllen > buf_size)
@@ -332,18 +330,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 				kfree(pacl);
 			}
 #else
-		cFYI(1, "Query CIFS ACL not supported yet");
+			cifs_dbg(FYI, "Query CIFS ACL not supported yet\n");
 #endif /* CONFIG_CIFS_ACL */
 	} else if (strncmp(ea_name,
 		  XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
-		cFYI(1, "Trusted xattr namespace not supported yet");
+		cifs_dbg(FYI, "Trusted xattr namespace not supported yet\n");
 	} else if (strncmp(ea_name,
 		  XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
-		cFYI(1, "Security xattr namespace not supported yet");
+		cifs_dbg(FYI, "Security xattr namespace not supported yet\n");
 	} else
-		cFYI(1,
-		    "illegal xattr request %s (only user namespace supported)",
-		     ea_name);
+		cifs_dbg(FYI,
+			 "illegal xattr request %s (only user namespace supported)\n",
+			 ea_name);
 
 	/* We could add an additional check for streams ie
 	    if proc/fs/cifs/streamstoxattr is set then
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 958ae0e0ff8..1da168c61d3 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -33,7 +33,7 @@ void coda_cache_enter(struct inode *inode, int mask)
 
 	spin_lock(&cii->c_lock);
 	cii->c_cached_epoch = atomic_read(&permission_epoch);
-	if (cii->c_uid != current_fsuid()) {
+	if (!uid_eq(cii->c_uid, current_fsuid())) {
 		cii->c_uid = current_fsuid();
                 cii->c_cached_perm = mask;
         } else
@@ -65,7 +65,7 @@ int coda_cache_check(struct inode *inode, int mask)
 	
 	spin_lock(&cii->c_lock);
 	hit = (mask & cii->c_cached_perm) == mask &&
-	    cii->c_uid == current_fsuid() &&
+	    uid_eq(cii->c_uid, current_fsuid()) &&
 	    cii->c_cached_epoch == atomic_read(&permission_epoch);
 	spin_unlock(&cii->c_lock);
 
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
index b24fdfd8a3f..c6407521321 100644
--- a/fs/coda/coda_fs_i.h
+++ b/fs/coda/coda_fs_i.h
@@ -25,7 +25,7 @@ struct coda_inode_info {
 	u_short	           c_flags;     /* flags (see below) */
 	unsigned int	   c_mapcount;  /* nr of times this inode is mapped */
 	unsigned int	   c_cached_epoch; /* epoch for cached permissions */
-	vuid_t		   c_uid;	/* fsuid for cached permissions */
+	kuid_t		   c_uid;	/* fsuid for cached permissions */
 	unsigned int       c_cached_perm; /* cached access permissions */
 	spinlock_t	   c_lock;
 	struct inode	   vfs_inode;
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 854ace71268..2849f41e72a 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -100,9 +100,9 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
 	if (attr->va_mode != (u_short) -1)
 	        inode->i_mode = attr->va_mode | inode_type;
         if (attr->va_uid != -1) 
-	        inode->i_uid = (uid_t) attr->va_uid;
+	        inode->i_uid = make_kuid(&init_user_ns, (uid_t) attr->va_uid);
         if (attr->va_gid != -1)
-	        inode->i_gid = (gid_t) attr->va_gid;
+	        inode->i_gid = make_kgid(&init_user_ns, (gid_t) attr->va_gid);
 	if (attr->va_nlink != -1)
 		set_nlink(inode, attr->va_nlink);
 	if (attr->va_size != -1)
@@ -171,10 +171,10 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr)
                 vattr->va_mode = iattr->ia_mode;
 	}
         if ( valid & ATTR_UID ) {
-                vattr->va_uid = (vuid_t) iattr->ia_uid;
+                vattr->va_uid = (vuid_t) from_kuid(&init_user_ns, iattr->ia_uid);
 	}
         if ( valid & ATTR_GID ) {
-                vattr->va_gid = (vgid_t) iattr->ia_gid;
+                vattr->va_gid = (vgid_t) from_kgid(&init_user_ns, iattr->ia_gid);
 	}
         if ( valid & ATTR_SIZE ) {
                 vattr->va_size = iattr->ia_size;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 49fe52d2560..190effc6a6f 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -43,15 +43,14 @@ static int coda_rename(struct inode *old_inode, struct dentry *old_dentry,
                        struct inode *new_inode, struct dentry *new_dentry);
 
 /* dir file-ops */
-static int coda_readdir(struct file *file, void *buf, filldir_t filldir);
+static int coda_readdir(struct file *file, struct dir_context *ctx);
 
 /* dentry ops */
 static int coda_dentry_revalidate(struct dentry *de, unsigned int flags);
 static int coda_dentry_delete(const struct dentry *);
 
 /* support routines */
-static int coda_venus_readdir(struct file *coda_file, void *buf,
-			      filldir_t filldir);
+static int coda_venus_readdir(struct file *, struct dir_context *);
 
 /* same as fs/bad_inode.c */
 static int coda_return_EIO(void)
@@ -85,7 +84,7 @@ const struct inode_operations coda_dir_inode_operations =
 const struct file_operations coda_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= coda_readdir,
+	.iterate	= coda_readdir,
 	.open		= coda_open,
 	.release	= coda_release,
 	.fsync		= coda_fsync,
@@ -378,7 +377,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 
 /* file operations for directories */
-static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
+static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
 {
 	struct coda_file_info *cfi;
 	struct file *host_file;
@@ -391,30 +390,19 @@ static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
 	if (!host_file->f_op)
 		return -ENOTDIR;
 
-	if (host_file->f_op->readdir)
-	{
-		/* potemkin case: we were handed a directory inode.
-		 * We can't use vfs_readdir because we have to keep the file
-		 * position in sync between the coda_file and the host_file.
-		 * and as such we need grab the inode mutex. */
-		struct inode *host_inode = host_file->f_path.dentry->d_inode;
-
+	if (host_file->f_op->iterate) {
+		struct inode *host_inode = file_inode(host_file);
 		mutex_lock(&host_inode->i_mutex);
-		host_file->f_pos = coda_file->f_pos;
-
 		ret = -ENOENT;
 		if (!IS_DEADDIR(host_inode)) {
-			ret = host_file->f_op->readdir(host_file, buf, filldir);
+			ret = host_file->f_op->iterate(host_file, ctx);
 			file_accessed(host_file);
 		}
-
-		coda_file->f_pos = host_file->f_pos;
 		mutex_unlock(&host_inode->i_mutex);
+		return ret;
 	}
-	else /* Venus: we must read Venus dirents from a file */
-		ret = coda_venus_readdir(coda_file, buf, filldir);
-
-	return ret;
+	/* Venus: we must read Venus dirents from a file */
+	return coda_venus_readdir(coda_file, ctx);
 }
 
 static inline unsigned int CDT2DT(unsigned char cdt)
@@ -437,10 +425,8 @@ static inline unsigned int CDT2DT(unsigned char cdt)
 }
 
 /* support routines */
-static int coda_venus_readdir(struct file *coda_file, void *buf,
-			      filldir_t filldir)
+static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx)
 {
-	int result = 0; /* # of entries returned */
 	struct coda_file_info *cfi;
 	struct coda_inode_info *cii;
 	struct file *host_file;
@@ -462,23 +448,12 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
 	vdir = kmalloc(sizeof(*vdir), GFP_KERNEL);
 	if (!vdir) return -ENOMEM;
 
-	if (coda_file->f_pos == 0) {
-		ret = filldir(buf, ".", 1, 0, de->d_inode->i_ino, DT_DIR);
-		if (ret < 0)
-			goto out;
-		result++;
-		coda_file->f_pos++;
-	}
-	if (coda_file->f_pos == 1) {
-		ret = filldir(buf, "..", 2, 1, parent_ino(de), DT_DIR);
-		if (ret < 0)
-			goto out;
-		result++;
-		coda_file->f_pos++;
-	}
+	if (!dir_emit_dots(coda_file, ctx))
+		goto out;
+
 	while (1) {
 		/* read entries from the directory file */
-		ret = kernel_read(host_file, coda_file->f_pos - 2, (char *)vdir,
+		ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir,
 				  sizeof(*vdir));
 		if (ret < 0) {
 			printk(KERN_ERR "coda readdir: read dir %s failed %d\n",
@@ -507,32 +482,23 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
 
 		/* Make sure we skip '.' and '..', we already got those */
 		if (name.name[0] == '.' && (name.len == 1 ||
-		    (vdir->d_name[1] == '.' && name.len == 2)))
+		    (name.name[1] == '.' && name.len == 2)))
 			vdir->d_fileno = name.len = 0;
 
 		/* skip null entries */
 		if (vdir->d_fileno && name.len) {
-			/* try to look up this entry in the dcache, that way
-			 * userspace doesn't have to worry about breaking
-			 * getcwd by having mismatched inode numbers for
-			 * internal volume mountpoints. */
-			ino = find_inode_number(de, &name);
-			if (!ino) ino = vdir->d_fileno;
-
+			ino = vdir->d_fileno;
 			type = CDT2DT(vdir->d_type);
-			ret = filldir(buf, name.name, name.len,
-				      coda_file->f_pos, ino, type);
-			/* failure means no space for filling in this round */
-			if (ret < 0) break;
-			result++;
+			if (!dir_emit(ctx, name.name, name.len, ino, type))
+				break;
 		}
 		/* we'll always have progress because d_reclen is unsigned and
 		 * we've already established it is non-zero. */
-		coda_file->f_pos += vdir->d_reclen;
+		ctx->pos += vdir->d_reclen;
 	}
 out:
 	kfree(vdir);
-	return result ? result : ret;
+	return 0;
 }
 
 /* called when a cache lookup succeeds */
@@ -560,7 +526,7 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
 	if (cii->c_flags & C_FLUSH) 
 		coda_flag_inode_children(inode, C_FLUSH);
 
-	if (de->d_count > 1)
+	if (d_count(de) > 1)
 		/* pretend it's valid, but don't change the flags */
 		goto out;
 
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 8edd404e641..380b798f844 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -66,7 +66,7 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
 static ssize_t
 coda_file_write(struct file *coda_file, const char __user *buf, size_t count, loff_t *ppos)
 {
-	struct inode *host_inode, *coda_inode = coda_file->f_path.dentry->d_inode;
+	struct inode *host_inode, *coda_inode = file_inode(coda_file);
 	struct coda_file_info *cfi;
 	struct file *host_file;
 	ssize_t ret;
@@ -78,7 +78,8 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo
 	if (!host_file->f_op || !host_file->f_op->write)
 		return -EINVAL;
 
-	host_inode = host_file->f_path.dentry->d_inode;
+	host_inode = file_inode(host_file);
+	file_start_write(host_file);
 	mutex_lock(&coda_inode->i_mutex);
 
 	ret = host_file->f_op->write(host_file, buf, count, ppos);
@@ -87,6 +88,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo
 	coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
 	coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC;
 	mutex_unlock(&coda_inode->i_mutex);
+	file_end_write(host_file);
 
 	return ret;
 }
@@ -106,8 +108,8 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
 	if (!host_file->f_op || !host_file->f_op->mmap)
 		return -ENODEV;
 
-	coda_inode = coda_file->f_path.dentry->d_inode;
-	host_inode = host_file->f_path.dentry->d_inode;
+	coda_inode = file_inode(coda_file);
+	host_inode = file_inode(host_file);
 
 	cii = ITOC(coda_inode);
 	spin_lock(&cii->c_lock);
@@ -178,7 +180,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 	err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
 			  coda_flags, coda_file->f_cred->fsuid);
 
-	host_inode = cfi->cfi_container->f_path.dentry->d_inode;
+	host_inode = file_inode(cfi->cfi_container);
 	cii = ITOC(coda_inode);
 
 	/* did we mmap this file? */
@@ -202,7 +204,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)
 {
 	struct file *host_file;
-	struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
+	struct inode *coda_inode = file_inode(coda_file);
 	struct coda_file_info *cfi;
 	int err;
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index be2aa490948..4dcc0d81a7a 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -20,6 +20,7 @@
 #include <linux/file.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
+#include <linux/pid_namespace.h>
 
 #include <asm/uaccess.h>
 
@@ -48,7 +49,7 @@ static struct inode *coda_alloc_inode(struct super_block *sb)
 		return NULL;
 	memset(&ei->c_fid, 0, sizeof(struct CodaFid));
 	ei->c_flags = 0;
-	ei->c_uid = 0;
+	ei->c_uid = GLOBAL_ROOT_UID;
 	ei->c_cached_perm = 0;
 	spin_lock_init(&ei->c_lock);
 	return &ei->vfs_inode;
@@ -129,7 +130,7 @@ static int get_device_index(struct coda_mount_data *data)
 	f = fdget(data->fd);
 	if (!f.file)
 		goto Ebadf;
-	inode = f.file->f_path.dentry->d_inode;
+	inode = file_inode(f.file);
 	if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {
 		fdput(f);
 		goto Ebadf;
@@ -157,6 +158,9 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	int error;
 	int idx;
 
+	if (task_active_pid_ns(current) != &init_pid_ns)
+		return -EINVAL;
+
 	idx = get_device_index((struct coda_mount_data *) data);
 
 	/* Ignore errors in data, for backward compatibility */
@@ -325,4 +329,5 @@ struct file_system_type coda_fs_type = {
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_BINARY_MOUNTDATA,
 };
+MODULE_ALIAS_FS("coda");
 
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index ee0981f1375..3f5de96bbb5 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -52,7 +52,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
 	struct path path;
 	int error;
 	struct PioctlData data;
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct inode *target_inode = NULL;
 	struct coda_inode_info *cnp;
 
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 761d5b31b18..ebc2bae6c28 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -37,6 +37,7 @@
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/device.h>
+#include <linux/pid_namespace.h>
 #include <asm/io.h>
 #include <asm/poll.h>
 #include <asm/uaccess.h>
@@ -266,6 +267,12 @@ static int coda_psdev_open(struct inode * inode, struct file * file)
 	struct venus_comm *vcp;
 	int idx, err;
 
+	if (task_active_pid_ns(current) != &init_pid_ns)
+		return -EINVAL;
+
+	if (current_user_ns() != &init_user_ns)
+		return -EINVAL;
+
 	idx = iminor(inode);
 	if (idx < 0 || idx >= MAX_CODADEVS)
 		return -ENODEV;
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 0c68fd31fbf..3a731976dc5 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -50,9 +50,9 @@ static void *alloc_upcall(int opcode, int size)
 		return ERR_PTR(-ENOMEM);
 
         inp->ih.opcode = opcode;
-	inp->ih.pid = current->pid;
-	inp->ih.pgid = task_pgrp_nr(current);
-	inp->ih.uid = current_fsuid();
+	inp->ih.pid = task_pid_nr_ns(current, &init_pid_ns);
+	inp->ih.pgid = task_pgrp_nr_ns(current, &init_pid_ns);
+	inp->ih.uid = from_kuid(&init_user_ns, current_fsuid());
 
 	return (void*)inp;
 }
@@ -157,7 +157,7 @@ int venus_lookup(struct super_block *sb, struct CodaFid *fid,
 }
 
 int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
-		vuid_t uid)
+		kuid_t uid)
 {
 	union inputArgs *inp;
 	union outputArgs *outp;
@@ -166,7 +166,7 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
 	insize = SIZE(release);
 	UPARG(CODA_CLOSE);
 	
-	inp->ih.uid = uid;
+	inp->ih.uid = from_kuid(&init_user_ns, uid);
         inp->coda_close.VFid = *fid;
         inp->coda_close.flags = flags;
 
diff --git a/fs/compat.c b/fs/compat.c
index 015e1e1f87c..6af20de2c1a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -44,10 +44,10 @@
 #include <linux/signal.h>
 #include <linux/poll.h>
 #include <linux/mm.h>
-#include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/aio.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -68,8 +68,6 @@ int compat_printk(const char *fmt, ...)
 	return ret;
 }
 
-#include "read_write.h"
-
 /*
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
@@ -558,6 +556,10 @@ ssize_t compat_rw_copy_check_uvector(int type,
 	}
 	*ret_pointer = iov;
 
+	ret = -EFAULT;
+	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
+		goto out;
+
 	/*
 	 * Single unix specification:
 	 * We should -EINVAL if an element length is not >= 0 and fitting an
@@ -830,6 +832,7 @@ struct compat_old_linux_dirent {
 };
 
 struct compat_readdir_callback {
+	struct dir_context ctx;
 	struct compat_old_linux_dirent __user *dirent;
 	int result;
 };
@@ -871,15 +874,15 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
 {
 	int error;
 	struct fd f = fdget(fd);
-	struct compat_readdir_callback buf;
+	struct compat_readdir_callback buf = {
+		.ctx.actor = compat_fillonedir,
+		.dirent = dirent
+	};
 
 	if (!f.file)
 		return -EBADF;
 
-	buf.result = 0;
-	buf.dirent = dirent;
-
-	error = vfs_readdir(f.file, compat_fillonedir, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (buf.result)
 		error = buf.result;
 
@@ -895,6 +898,7 @@ struct compat_linux_dirent {
 };
 
 struct compat_getdents_callback {
+	struct dir_context ctx;
 	struct compat_linux_dirent __user *current_dir;
 	struct compat_linux_dirent __user *previous;
 	int count;
@@ -949,7 +953,11 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
 {
 	struct fd f;
 	struct compat_linux_dirent __user * lastdirent;
-	struct compat_getdents_callback buf;
+	struct compat_getdents_callback buf = {
+		.ctx.actor = compat_filldir,
+		.current_dir = dirent,
+		.count = count
+	};
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -959,17 +967,12 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
 	if (!f.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(f.file, compat_filldir, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		if (put_user(f.file->f_pos, &lastdirent->d_off))
+		if (put_user(buf.ctx.pos, &lastdirent->d_off))
 			error = -EFAULT;
 		else
 			error = count - buf.count;
@@ -981,6 +984,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
 #ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64
 
 struct compat_getdents_callback64 {
+	struct dir_context ctx;
 	struct linux_dirent64 __user *current_dir;
 	struct linux_dirent64 __user *previous;
 	int count;
@@ -1034,7 +1038,11 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
 {
 	struct fd f;
 	struct linux_dirent64 __user * lastdirent;
-	struct compat_getdents_callback64 buf;
+	struct compat_getdents_callback64 buf = {
+		.ctx.actor = compat_filldir64,
+		.current_dir = dirent,
+		.count = count
+	};
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -1044,17 +1052,12 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
 	if (!f.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(f.file, compat_filldir64, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		typeof(lastdirent->d_off) d_off = f.file->f_pos;
+		typeof(lastdirent->d_off) d_off = buf.ctx.pos;
 		if (__put_user_unaligned(d_off, &lastdirent->d_off))
 			error = -EFAULT;
 		else
@@ -1065,221 +1068,11 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
 }
 #endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */
 
-static ssize_t compat_do_readv_writev(int type, struct file *file,
-			       const struct compat_iovec __user *uvector,
-			       unsigned long nr_segs, loff_t *pos)
-{
-	compat_ssize_t tot_len;
-	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov = iovstack;
-	ssize_t ret;
-	io_fn_t fn;
-	iov_fn_t fnv;
-
-	ret = -EINVAL;
-	if (!file->f_op)
-		goto out;
-
-	ret = -EFAULT;
-	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
-		goto out;
-
-	tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
-					       UIO_FASTIOV, iovstack, &iov);
-	if (tot_len == 0) {
-		ret = 0;
-		goto out;
-	}
-
-	ret = rw_verify_area(type, file, pos, tot_len);
-	if (ret < 0)
-		goto out;
-
-	fnv = NULL;
-	if (type == READ) {
-		fn = file->f_op->read;
-		fnv = file->f_op->aio_read;
-	} else {
-		fn = (io_fn_t)file->f_op->write;
-		fnv = file->f_op->aio_write;
-	}
-
-	if (fnv)
-		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
-						pos, fnv);
-	else
-		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
-
-out:
-	if (iov != iovstack)
-		kfree(iov);
-	if ((ret + (type == READ)) > 0) {
-		if (type == READ)
-			fsnotify_access(file);
-		else
-			fsnotify_modify(file);
-	}
-	return ret;
-}
-
-static size_t compat_readv(struct file *file,
-			   const struct compat_iovec __user *vec,
-			   unsigned long vlen, loff_t *pos)
-{
-	ssize_t ret = -EBADF;
-
-	if (!(file->f_mode & FMODE_READ))
-		goto out;
-
-	ret = -EINVAL;
-	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
-		goto out;
-
-	ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
-
-out:
-	if (ret > 0)
-		add_rchar(current, ret);
-	inc_syscr(current);
-	return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
-		 unsigned long vlen)
-{
-	struct fd f = fdget(fd);
-	ssize_t ret;
-	loff_t pos;
-
-	if (!f.file)
-		return -EBADF;
-	pos = f.file->f_pos;
-	ret = compat_readv(f.file, vec, vlen, &pos);
-	f.file->f_pos = pos;
-	fdput(f);
-	return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_preadv64(unsigned long fd, const struct compat_iovec __user *vec,
-		    unsigned long vlen, loff_t pos)
-{
-	struct fd f;
-	ssize_t ret;
-
-	if (pos < 0)
-		return -EINVAL;
-	f = fdget(fd);
-	if (!f.file)
-		return -EBADF;
-	ret = -ESPIPE;
-	if (f.file->f_mode & FMODE_PREAD)
-		ret = compat_readv(f.file, vec, vlen, &pos);
-	fdput(f);
-	return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
-		  unsigned long vlen, u32 pos_low, u32 pos_high)
-{
-	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
-	return compat_sys_preadv64(fd, vec, vlen, pos);
-}
-
-static size_t compat_writev(struct file *file,
-			    const struct compat_iovec __user *vec,
-			    unsigned long vlen, loff_t *pos)
-{
-	ssize_t ret = -EBADF;
-
-	if (!(file->f_mode & FMODE_WRITE))
-		goto out;
-
-	ret = -EINVAL;
-	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
-		goto out;
-
-	ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
-
-out:
-	if (ret > 0)
-		add_wchar(current, ret);
-	inc_syscw(current);
-	return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
-		  unsigned long vlen)
-{
-	struct fd f = fdget(fd);
-	ssize_t ret;
-	loff_t pos;
-
-	if (!f.file)
-		return -EBADF;
-	pos = f.file->f_pos;
-	ret = compat_writev(f.file, vec, vlen, &pos);
-	f.file->f_pos = pos;
-	fdput(f);
-	return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_pwritev64(unsigned long fd, const struct compat_iovec __user *vec,
-		     unsigned long vlen, loff_t pos)
-{
-	struct fd f;
-	ssize_t ret;
-
-	if (pos < 0)
-		return -EINVAL;
-	f = fdget(fd);
-	if (!f.file)
-		return -EBADF;
-	ret = -ESPIPE;
-	if (f.file->f_mode & FMODE_PWRITE)
-		ret = compat_writev(f.file, vec, vlen, &pos);
-	fdput(f);
-	return ret;
-}
-
-asmlinkage ssize_t
-compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
-		   unsigned long vlen, u32 pos_low, u32 pos_high)
-{
-	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
-	return compat_sys_pwritev64(fd, vec, vlen, pos);
-}
-
-asmlinkage long
-compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
-		    unsigned int nr_segs, unsigned int flags)
-{
-	unsigned i;
-	struct iovec __user *iov;
-	if (nr_segs > UIO_MAXIOV)
-		return -EINVAL;
-	iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
-	for (i = 0; i < nr_segs; i++) {
-		struct compat_iovec v;
-		if (get_user(v.iov_base, &iov32[i].iov_base) ||
-		    get_user(v.iov_len, &iov32[i].iov_len) ||
-		    put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
-		    put_user(v.iov_len, &iov[i].iov_len))
-			return -EFAULT;
-	}
-	return sys_vmsplice(fd, iov, nr_segs, flags);
-}
-
 /*
  * Exactly like fs/open.c:sys_open(), except that it doesn't set the
  * O_LARGEFILE flag.
  */
-asmlinkage long
-compat_sys_open(const char __user *filename, int flags, umode_t mode)
+COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
 {
 	return do_sys_open(AT_FDCWD, filename, flags, mode);
 }
@@ -1288,8 +1081,7 @@ compat_sys_open(const char __user *filename, int flags, umode_t mode)
  * Exactly like fs/open.c:sys_openat(), except that it doesn't set the
  * O_LARGEFILE flag.
  */
-asmlinkage long
-compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, umode_t mode)
+COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
 {
 	return do_sys_open(dfd, filename, flags, mode);
 }
@@ -1661,156 +1453,14 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 	return ret;
 }
 
-#ifdef CONFIG_EPOLL
-
-asmlinkage long compat_sys_epoll_pwait(int epfd,
-			struct compat_epoll_event __user *events,
-			int maxevents, int timeout,
-			const compat_sigset_t __user *sigmask,
-			compat_size_t sigsetsize)
-{
-	long err;
-	compat_sigset_t csigmask;
-	sigset_t ksigmask, sigsaved;
-
-	/*
-	 * If the caller wants a certain signal mask to be set during the wait,
-	 * we apply it here.
-	 */
-	if (sigmask) {
-		if (sigsetsize != sizeof(compat_sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
-			return -EFAULT;
-		sigset_from_compat(&ksigmask, &csigmask);
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
-
-	err = sys_epoll_wait(epfd, events, maxevents, timeout);
-
-	/*
-	 * If we changed the signal mask, we need to restore the original one.
-	 * In case we've got a signal while waiting, we do not restore the
-	 * signal mask yet, and we allow do_signal() to deliver the signal on
-	 * the way back to userspace, before the signal mask is restored.
-	 */
-	if (sigmask) {
-		if (err == -EINTR) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-			       sizeof(sigsaved));
-			set_restore_sigmask();
-		} else
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-	}
-
-	return err;
-}
-
-#endif /* CONFIG_EPOLL */
-
-#ifdef CONFIG_SIGNALFD
-
-asmlinkage long compat_sys_signalfd4(int ufd,
-				     const compat_sigset_t __user *sigmask,
-				     compat_size_t sigsetsize, int flags)
-{
-	compat_sigset_t ss32;
-	sigset_t tmp;
-	sigset_t __user *ksigmask;
-
-	if (sigsetsize != sizeof(compat_sigset_t))
-		return -EINVAL;
-	if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
-		return -EFAULT;
-	sigset_from_compat(&tmp, &ss32);
-	ksigmask = compat_alloc_user_space(sizeof(sigset_t));
-	if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
-		return -EFAULT;
-
-	return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags);
-}
-
-asmlinkage long compat_sys_signalfd(int ufd,
-				    const compat_sigset_t __user *sigmask,
-				    compat_size_t sigsetsize)
-{
-	return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0);
-}
-#endif /* CONFIG_SIGNALFD */
-
-#ifdef CONFIG_TIMERFD
-
-asmlinkage long compat_sys_timerfd_settime(int ufd, int flags,
-				   const struct compat_itimerspec __user *utmr,
-				   struct compat_itimerspec __user *otmr)
-{
-	int error;
-	struct itimerspec t;
-	struct itimerspec __user *ut;
-
-	if (get_compat_itimerspec(&t, utmr))
-		return -EFAULT;
-	ut = compat_alloc_user_space(2 * sizeof(struct itimerspec));
-	if (copy_to_user(&ut[0], &t, sizeof(t)))
-		return -EFAULT;
-	error = sys_timerfd_settime(ufd, flags, &ut[0], &ut[1]);
-	if (!error && otmr)
-		error = (copy_from_user(&t, &ut[1], sizeof(struct itimerspec)) ||
-			 put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0;
-
-	return error;
-}
-
-asmlinkage long compat_sys_timerfd_gettime(int ufd,
-				   struct compat_itimerspec __user *otmr)
-{
-	int error;
-	struct itimerspec t;
-	struct itimerspec __user *ut;
-
-	ut = compat_alloc_user_space(sizeof(struct itimerspec));
-	error = sys_timerfd_gettime(ufd, ut);
-	if (!error)
-		error = (copy_from_user(&t, ut, sizeof(struct itimerspec)) ||
-			 put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0;
-
-	return error;
-}
-
-#endif /* CONFIG_TIMERFD */
-
 #ifdef CONFIG_FHANDLE
 /*
  * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
  * doesn't set the O_LARGEFILE flag.
  */
-asmlinkage long
-compat_sys_open_by_handle_at(int mountdirfd,
-			     struct file_handle __user *handle, int flags)
+COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
+			     struct file_handle __user *, handle, int, flags)
 {
 	return do_handle_open(mountdirfd, handle, flags);
 }
 #endif
-
-#ifdef __ARCH_WANT_COMPAT_SYS_SENDFILE
-asmlinkage long compat_sys_sendfile(int out_fd, int in_fd,
-				    compat_off_t __user *offset, compat_size_t count)
-{
-	loff_t pos;
-	off_t off;
-	ssize_t ret;
-
-	if (offset) {
-		if (unlikely(get_user(off, offset)))
-			return -EFAULT;
-		pos = off;
-		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
-		if (unlikely(put_user(pos, offset)))
-			return -EFAULT;
-		return ret;
-	}
-
-	return do_sendfile(out_fd, in_fd, NULL, count, 0);
-}
-#endif /* __ARCH_WANT_COMPAT_SYS_SENDFILE */
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index e2f57a00702..5d19acfa7c6 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -66,7 +66,6 @@
 #include <linux/gigaset_dev.h>
 
 #ifdef CONFIG_BLOCK
-#include <linux/loop.h>
 #include <linux/cdrom.h>
 #include <linux/fd.h>
 #include <scsi/scsi.h>
@@ -608,7 +607,6 @@ struct serial_struct32 {
 static int serial_struct_ioctl(unsigned fd, unsigned cmd,
 			struct serial_struct32 __user *ss32)
 {
-        typedef struct serial_struct SS;
         typedef struct serial_struct32 SS32;
         int err;
         struct serial_struct ss;
@@ -955,8 +953,6 @@ COMPATIBLE_IOCTL(MTIOCTOP)
 /* Socket level stuff */
 COMPATIBLE_IOCTL(FIOQSIZE)
 #ifdef CONFIG_BLOCK
-/* loop */
-IGNORE_IOCTL(LOOP_CLR_FD)
 /* md calls this on random blockdevs */
 IGNORE_IOCTL(RAID_VERSION)
 /* qemu/qemu-img might call these two on plain files for probing */
@@ -1582,7 +1578,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 	case FIBMAP:
 	case FIGETBSZ:
 	case FIONREAD:
-		if (S_ISREG(f.file->f_path.dentry->d_inode->i_mode))
+		if (S_ISREG(file_inode(f.file)->i_mode))
 			break;
 		/*FALL THROUGH*/
 
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 712b10f64c7..5e7c60c1cb6 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -387,7 +387,7 @@ static void remove_dir(struct dentry * d)
 	if (d->d_inode)
 		simple_rmdir(parent->d_inode,d);
 
-	pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count);
+	pr_debug(" o %s removing done (%d)\n",d->d_name.name, d_count(d));
 
 	dput(parent);
 }
@@ -1037,10 +1037,11 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
 static int configfs_depend_prep(struct dentry *origin,
 				struct config_item *target)
 {
-	struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
+	struct configfs_dirent *child_sd, *sd;
 	int ret = 0;
 
-	BUG_ON(!origin || !sd);
+	BUG_ON(!origin || !origin->d_fsdata);
+	sd = origin->d_fsdata;
 
 	if (sd->s_element == target)  /* Boo-yah */
 		goto out;
@@ -1531,84 +1532,66 @@ static inline unsigned char dt_type(struct configfs_dirent *sd)
 	return (sd->s_mode >> 12) & 15;
 }
 
-static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int configfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
+	struct dentry *dentry = file->f_path.dentry;
 	struct super_block *sb = dentry->d_sb;
 	struct configfs_dirent * parent_sd = dentry->d_fsdata;
-	struct configfs_dirent *cursor = filp->private_data;
+	struct configfs_dirent *cursor = file->private_data;
 	struct list_head *p, *q = &cursor->s_sibling;
 	ino_t ino = 0;
-	int i = filp->f_pos;
 
-	switch (i) {
-		case 0:
-			ino = dentry->d_inode->i_ino;
-			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		case 1:
-			ino = parent_ino(dentry);
-			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		default:
-			if (filp->f_pos == 2) {
-				spin_lock(&configfs_dirent_lock);
-				list_move(q, &parent_sd->s_children);
-				spin_unlock(&configfs_dirent_lock);
-			}
-			for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
-				struct configfs_dirent *next;
-				const char * name;
-				int len;
-				struct inode *inode = NULL;
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+	if (ctx->pos == 2) {
+		spin_lock(&configfs_dirent_lock);
+		list_move(q, &parent_sd->s_children);
+		spin_unlock(&configfs_dirent_lock);
+	}
+	for (p = q->next; p != &parent_sd->s_children; p = p->next) {
+		struct configfs_dirent *next;
+		const char *name;
+		int len;
+		struct inode *inode = NULL;
+
+		next = list_entry(p, struct configfs_dirent, s_sibling);
+		if (!next->s_element)
+			continue;
 
-				next = list_entry(p, struct configfs_dirent,
-						   s_sibling);
-				if (!next->s_element)
-					continue;
-
-				name = configfs_get_name(next);
-				len = strlen(name);
-
-				/*
-				 * We'll have a dentry and an inode for
-				 * PINNED items and for open attribute
-				 * files.  We lock here to prevent a race
-				 * with configfs_d_iput() clearing
-				 * s_dentry before calling iput().
-				 *
-				 * Why do we go to the trouble?  If
-				 * someone has an attribute file open,
-				 * the inode number should match until
-				 * they close it.  Beyond that, we don't
-				 * care.
-				 */
-				spin_lock(&configfs_dirent_lock);
-				dentry = next->s_dentry;
-				if (dentry)
-					inode = dentry->d_inode;
-				if (inode)
-					ino = inode->i_ino;
-				spin_unlock(&configfs_dirent_lock);
-				if (!inode)
-					ino = iunique(sb, 2);
+		name = configfs_get_name(next);
+		len = strlen(name);
+
+		/*
+		 * We'll have a dentry and an inode for
+		 * PINNED items and for open attribute
+		 * files.  We lock here to prevent a race
+		 * with configfs_d_iput() clearing
+		 * s_dentry before calling iput().
+		 *
+		 * Why do we go to the trouble?  If
+		 * someone has an attribute file open,
+		 * the inode number should match until
+		 * they close it.  Beyond that, we don't
+		 * care.
+		 */
+		spin_lock(&configfs_dirent_lock);
+		dentry = next->s_dentry;
+		if (dentry)
+			inode = dentry->d_inode;
+		if (inode)
+			ino = inode->i_ino;
+		spin_unlock(&configfs_dirent_lock);
+		if (!inode)
+			ino = iunique(sb, 2);
 
-				if (filldir(dirent, name, len, filp->f_pos, ino,
-						 dt_type(next)) < 0)
-					return 0;
+		if (!dir_emit(ctx, name, len, ino, dt_type(next)))
+			return 0;
 
-				spin_lock(&configfs_dirent_lock);
-				list_move(q, p);
-				spin_unlock(&configfs_dirent_lock);
-				p = q;
-				filp->f_pos++;
-			}
+		spin_lock(&configfs_dirent_lock);
+		list_move(q, p);
+		spin_unlock(&configfs_dirent_lock);
+		p = q;
+		ctx->pos++;
 	}
 	return 0;
 }
@@ -1625,7 +1608,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
 			if (offset >= 0)
 				break;
 		default:
-			mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
+			mutex_unlock(&file_inode(file)->i_mutex);
 			return -EINVAL;
 	}
 	if (offset != file->f_pos) {
@@ -1660,7 +1643,7 @@ const struct file_operations configfs_dir_operations = {
 	.release	= configfs_dir_close,
 	.llseek		= configfs_dir_lseek,
 	.read		= generic_read_dir,
-	.readdir	= configfs_readdir,
+	.iterate	= configfs_readdir,
 };
 
 int configfs_register_subsystem(struct configfs_subsystem *subsys)
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 2b6cb23dd14..1d1c41f1014 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
 	mutex_lock(&buffer->mutex);
 	len = fill_write_buffer(buffer, buf, count);
 	if (len > 0)
-		len = flush_write_buffer(file->f_path.dentry, buffer, count);
+		len = flush_write_buffer(file->f_path.dentry, buffer, len);
 	if (len > 0)
 		*ppos += len;
 	mutex_unlock(&buffer->mutex);
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index aee0a7ebbd8..7f26c3cf75a 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -114,6 +114,7 @@ static struct file_system_type configfs_fs_type = {
 	.mount		= configfs_do_mount,
 	.kill_sb	= kill_litter_super,
 };
+MODULE_ALIAS_FS("configfs");
 
 struct dentry *configfs_pin_fs(void)
 {
diff --git a/fs/coredump.c b/fs/coredump.c
index 177493272a6..72f816d6cad 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -45,69 +45,79 @@
 #include <trace/events/sched.h>
 
 int core_uses_pid;
-char core_pattern[CORENAME_MAX_SIZE] = "core";
 unsigned int core_pipe_limit;
+char core_pattern[CORENAME_MAX_SIZE] = "core";
+static int core_name_size = CORENAME_MAX_SIZE;
 
 struct core_name {
 	char *corename;
 	int used, size;
 };
-static atomic_t call_count = ATOMIC_INIT(1);
 
 /* The maximal length of core_pattern is also specified in sysctl.c */
 
-static int expand_corename(struct core_name *cn)
+static int expand_corename(struct core_name *cn, int size)
 {
-	char *old_corename = cn->corename;
+	char *corename = krealloc(cn->corename, size, GFP_KERNEL);
 
-	cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
-	cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
-
-	if (!cn->corename) {
-		kfree(old_corename);
+	if (!corename)
 		return -ENOMEM;
-	}
 
+	if (size > core_name_size) /* racy but harmless */
+		core_name_size = size;
+
+	cn->size = ksize(corename);
+	cn->corename = corename;
 	return 0;
 }
 
+static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
+{
+	int free, need;
+
+again:
+	free = cn->size - cn->used;
+	need = vsnprintf(cn->corename + cn->used, free, fmt, arg);
+	if (need < free) {
+		cn->used += need;
+		return 0;
+	}
+
+	if (!expand_corename(cn, cn->size + need - free + 1))
+		goto again;
+
+	return -ENOMEM;
+}
+
 static int cn_printf(struct core_name *cn, const char *fmt, ...)
 {
-	char *cur;
-	int need;
-	int ret;
 	va_list arg;
+	int ret;
 
 	va_start(arg, fmt);
-	need = vsnprintf(NULL, 0, fmt, arg);
+	ret = cn_vprintf(cn, fmt, arg);
 	va_end(arg);
 
-	if (likely(need < cn->size - cn->used - 1))
-		goto out_printf;
+	return ret;
+}
 
-	ret = expand_corename(cn);
-	if (ret)
-		goto expand_fail;
+static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
+{
+	int cur = cn->used;
+	va_list arg;
+	int ret;
 
-out_printf:
-	cur = cn->corename + cn->used;
 	va_start(arg, fmt);
-	vsnprintf(cur, need + 1, fmt, arg);
+	ret = cn_vprintf(cn, fmt, arg);
 	va_end(arg);
-	cn->used += need;
-	return 0;
 
-expand_fail:
+	for (; cur < cn->used; ++cur) {
+		if (cn->corename[cur] == '/')
+			cn->corename[cur] = '!';
+	}
 	return ret;
 }
 
-static void cn_escape(char *str)
-{
-	for (; *str; str++)
-		if (*str == '/')
-			*str = '!';
-}
-
 static int cn_print_exe_file(struct core_name *cn)
 {
 	struct file *exe_file;
@@ -115,12 +125,8 @@ static int cn_print_exe_file(struct core_name *cn)
 	int ret;
 
 	exe_file = get_mm_exe_file(current->mm);
-	if (!exe_file) {
-		char *commstart = cn->corename + cn->used;
-		ret = cn_printf(cn, "%s (path unknown)", current->comm);
-		cn_escape(commstart);
-		return ret;
-	}
+	if (!exe_file)
+		return cn_esc_printf(cn, "%s (path unknown)", current->comm);
 
 	pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
 	if (!pathbuf) {
@@ -134,9 +140,7 @@ static int cn_print_exe_file(struct core_name *cn)
 		goto free_buf;
 	}
 
-	cn_escape(path);
-
-	ret = cn_printf(cn, "%s", path);
+	ret = cn_esc_printf(cn, "%s", path);
 
 free_buf:
 	kfree(pathbuf);
@@ -157,19 +161,19 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 	int pid_in_pattern = 0;
 	int err = 0;
 
-	cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
-	cn->corename = kmalloc(cn->size, GFP_KERNEL);
 	cn->used = 0;
-
-	if (!cn->corename)
+	cn->corename = NULL;
+	if (expand_corename(cn, core_name_size))
 		return -ENOMEM;
+	cn->corename[0] = '\0';
+
+	if (ispipe)
+		++pat_ptr;
 
 	/* Repeat as long as we have more pattern to process and more output
 	   space */
 	while (*pat_ptr) {
 		if (*pat_ptr != '%') {
-			if (*pat_ptr == 0)
-				goto out;
 			err = cn_printf(cn, "%c", *pat_ptr++);
 		} else {
 			switch (*++pat_ptr) {
@@ -210,22 +214,16 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 				break;
 			}
 			/* hostname */
-			case 'h': {
-				char *namestart = cn->corename + cn->used;
+			case 'h':
 				down_read(&uts_sem);
-				err = cn_printf(cn, "%s",
+				err = cn_esc_printf(cn, "%s",
 					      utsname()->nodename);
 				up_read(&uts_sem);
-				cn_escape(namestart);
 				break;
-			}
 			/* executable */
-			case 'e': {
-				char *commstart = cn->corename + cn->used;
-				err = cn_printf(cn, "%s", current->comm);
-				cn_escape(commstart);
+			case 'e':
+				err = cn_esc_printf(cn, "%s", current->comm);
 				break;
-			}
 			case 'E':
 				err = cn_print_exe_file(cn);
 				break;
@@ -244,6 +242,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 			return err;
 	}
 
+out:
 	/* Backward compatibility with core_uses_pid:
 	 *
 	 * If core_pattern does not include a %p (as is the default)
@@ -254,7 +253,6 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 		if (err)
 			return err;
 	}
-out:
 	return ispipe;
 }
 
@@ -263,7 +261,6 @@ static int zap_process(struct task_struct *start, int exit_code)
 	struct task_struct *t;
 	int nr = 0;
 
-	start->signal->flags = SIGNAL_GROUP_EXIT;
 	start->signal->group_exit_code = exit_code;
 	start->signal->group_stop_count = 0;
 
@@ -280,8 +277,8 @@ static int zap_process(struct task_struct *start, int exit_code)
 	return nr;
 }
 
-static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
-				struct core_state *core_state, int exit_code)
+static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+			struct core_state *core_state, int exit_code)
 {
 	struct task_struct *g, *p;
 	unsigned long flags;
@@ -291,11 +288,16 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
 	if (!signal_group_exit(tsk->signal)) {
 		mm->core_state = core_state;
 		nr = zap_process(tsk, exit_code);
+		tsk->signal->group_exit_task = tsk;
+		/* ignore all signals except SIGKILL, see prepare_signal() */
+		tsk->signal->flags = SIGNAL_GROUP_COREDUMP;
+		clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
 	}
 	spin_unlock_irq(&tsk->sighand->siglock);
 	if (unlikely(nr < 0))
 		return nr;
 
+	tsk->flags = PF_DUMPCORE;
 	if (atomic_read(&mm->mm_users) == nr + 1)
 		goto done;
 	/*
@@ -340,6 +342,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
 				if (unlikely(p->mm == mm)) {
 					lock_task_sighand(p, &flags);
 					nr += zap_process(p, exit_code);
+					p->signal->flags = SIGNAL_GROUP_EXIT;
 					unlock_task_sighand(p, &flags);
 				}
 				break;
@@ -386,11 +389,18 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
 	return core_waiters;
 }
 
-static void coredump_finish(struct mm_struct *mm)
+static void coredump_finish(struct mm_struct *mm, bool core_dumped)
 {
 	struct core_thread *curr, *next;
 	struct task_struct *task;
 
+	spin_lock_irq(&current->sighand->siglock);
+	if (core_dumped && !__fatal_signal_pending(current))
+		current->signal->group_exit_code |= 0x80;
+	current->signal->group_exit_task = NULL;
+	current->signal->flags = SIGNAL_GROUP_EXIT;
+	spin_unlock_irq(&current->sighand->siglock);
+
 	next = mm->core_state->dumper.next;
 	while ((curr = next) != NULL) {
 		next = curr->next;
@@ -407,26 +417,38 @@ static void coredump_finish(struct mm_struct *mm)
 	mm->core_state = NULL;
 }
 
-static void wait_for_dump_helpers(struct file *file)
+static bool dump_interrupted(void)
 {
-	struct pipe_inode_info *pipe;
+	/*
+	 * SIGKILL or freezing() interrupt the coredumping. Perhaps we
+	 * can do try_to_freeze() and check __fatal_signal_pending(),
+	 * but then we need to teach dump_write() to restart and clear
+	 * TIF_SIGPENDING.
+	 */
+	return signal_pending(current);
+}
 
-	pipe = file->f_path.dentry->d_inode->i_pipe;
+static void wait_for_dump_helpers(struct file *file)
+{
+	struct pipe_inode_info *pipe = file->private_data;
 
 	pipe_lock(pipe);
 	pipe->readers++;
 	pipe->writers--;
+	wake_up_interruptible_sync(&pipe->wait);
+	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+	pipe_unlock(pipe);
 
-	while ((pipe->readers > 1) && (!signal_pending(current))) {
-		wake_up_interruptible_sync(&pipe->wait);
-		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
-		pipe_wait(pipe);
-	}
+	/*
+	 * We actually want wait_event_freezable() but then we need
+	 * to clear TIF_SIGPENDING and improve dump_interrupted().
+	 */
+	wait_event_interruptible(pipe->wait, pipe->readers == 1);
 
+	pipe_lock(pipe);
 	pipe->readers--;
 	pipe->writers++;
 	pipe_unlock(pipe);
-
 }
 
 /*
@@ -471,6 +493,7 @@ void do_coredump(siginfo_t *siginfo)
 	int ispipe;
 	struct files_struct *displaced;
 	bool need_nonrelative = false;
+	bool core_dumped = false;
 	static atomic_t core_dump_count = ATOMIC_INIT(0);
 	struct coredump_params cprm = {
 		.siginfo = siginfo,
@@ -501,7 +524,7 @@ void do_coredump(siginfo_t *siginfo)
 	 * so we dump it as root in mode 2, and only into a controlled
 	 * environment (pipe handler or fully qualified path).
 	 */
-	if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) {
+	if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
 		/* Setuid core dump mode */
 		flag = O_EXCL;		/* Stop rewrite attacks */
 		cred->fsuid = GLOBAL_ROOT_UID;	/* Dump root private */
@@ -514,22 +537,17 @@ void do_coredump(siginfo_t *siginfo)
 
 	old_cred = override_creds(cred);
 
-	/*
-	 * Clear any false indication of pending signals that might
-	 * be seen by the filesystem code called to write the core file.
-	 */
-	clear_thread_flag(TIF_SIGPENDING);
-
 	ispipe = format_corename(&cn, &cprm);
 
- 	if (ispipe) {
+	if (ispipe) {
 		int dump_count;
 		char **helper_argv;
+		struct subprocess_info *sub_info;
 
 		if (ispipe < 0) {
 			printk(KERN_WARNING "format_corename failed\n");
 			printk(KERN_WARNING "Aborting core\n");
-			goto fail_corename;
+			goto fail_unlock;
 		}
 
 		if (cprm.limit == 1) {
@@ -564,22 +582,27 @@ void do_coredump(siginfo_t *siginfo)
 			goto fail_dropcount;
 		}
 
-		helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
+		helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL);
 		if (!helper_argv) {
 			printk(KERN_WARNING "%s failed to allocate memory\n",
 			       __func__);
 			goto fail_dropcount;
 		}
 
-		retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
-					NULL, UMH_WAIT_EXEC, umh_pipe_setup,
-					NULL, &cprm);
+		retval = -ENOMEM;
+		sub_info = call_usermodehelper_setup(helper_argv[0],
+						helper_argv, NULL, GFP_KERNEL,
+						umh_pipe_setup, NULL, &cprm);
+		if (sub_info)
+			retval = call_usermodehelper_exec(sub_info,
+							  UMH_WAIT_EXEC);
+
 		argv_free(helper_argv);
 		if (retval) {
- 			printk(KERN_INFO "Core dump to %s pipe failed\n",
+			printk(KERN_INFO "Core dump to |%s pipe failed\n",
 			       cn.corename);
 			goto close_fail;
- 		}
+		}
 	} else {
 		struct inode *inode;
 
@@ -600,7 +623,7 @@ void do_coredump(siginfo_t *siginfo)
 		if (IS_ERR(cprm.file))
 			goto fail_unlock;
 
-		inode = cprm.file->f_path.dentry->d_inode;
+		inode = file_inode(cprm.file);
 		if (inode->i_nlink > 1)
 			goto close_fail;
 		if (d_unhashed(cprm.file->f_path.dentry))
@@ -629,10 +652,11 @@ void do_coredump(siginfo_t *siginfo)
 		goto close_fail;
 	if (displaced)
 		put_files_struct(displaced);
-	retval = binfmt->core_dump(&cprm);
-	if (retval)
-		current->signal->group_exit_code |= 0x80;
-
+	if (!dump_interrupted()) {
+		file_start_write(cprm.file);
+		core_dumped = binfmt->core_dump(&cprm);
+		file_end_write(cprm.file);
+	}
 	if (ispipe && core_pipe_limit)
 		wait_for_dump_helpers(cprm.file);
 close_fail:
@@ -643,8 +667,7 @@ fail_dropcount:
 		atomic_dec(&core_dump_count);
 fail_unlock:
 	kfree(cn.corename);
-fail_corename:
-	coredump_finish(mm);
+	coredump_finish(mm, core_dumped);
 	revert_creds(old_cred);
 fail_creds:
 	put_cred(cred);
@@ -659,7 +682,9 @@ fail:
  */
 int dump_write(struct file *file, const void *addr, int nr)
 {
-	return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
+	return !dump_interrupted() &&
+		access_ok(VERIFY_READ, addr, nr) &&
+		file->f_op->write(file, addr, nr, &file->f_pos) == nr;
 }
 EXPORT_SYMBOL(dump_write);
 
@@ -668,7 +693,8 @@ int dump_seek(struct file *file, loff_t off)
 	int ret = 1;
 
 	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
-		if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
+		if (dump_interrupted() ||
+		    file->f_op->llseek(file, off, SEEK_CUR) < 0)
 			return 0;
 	} else {
 		char *buf = (char *)get_zeroed_page(GFP_KERNEL);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index c6c3f91ecf0..e501ac3a49f 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -349,18 +349,17 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 /*
  * Read a cramfs directory entry.
  */
-static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int cramfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	char *buf;
 	unsigned int offset;
-	int copied;
 
 	/* Offset within the thing. */
-	offset = filp->f_pos;
-	if (offset >= inode->i_size)
+	if (ctx->pos >= inode->i_size)
 		return 0;
+	offset = ctx->pos;
 	/* Directory entries are always 4-byte aligned */
 	if (offset & 3)
 		return -EINVAL;
@@ -369,14 +368,13 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (!buf)
 		return -ENOMEM;
 
-	copied = 0;
 	while (offset < inode->i_size) {
 		struct cramfs_inode *de;
 		unsigned long nextoffset;
 		char *name;
 		ino_t ino;
 		umode_t mode;
-		int namelen, error;
+		int namelen;
 
 		mutex_lock(&read_mutex);
 		de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN);
@@ -402,13 +400,10 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				break;
 			namelen--;
 		}
-		error = filldir(dirent, buf, namelen, offset, ino, mode >> 12);
-		if (error)
+		if (!dir_emit(ctx, buf, namelen, ino, mode >> 12))
 			break;
 
-		offset = nextoffset;
-		filp->f_pos = offset;
-		copied++;
+		ctx->pos = offset = nextoffset;
 	}
 	kfree(buf);
 	return 0;
@@ -547,7 +542,7 @@ static const struct address_space_operations cramfs_aops = {
 static const struct file_operations cramfs_directory_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= cramfs_readdir,
+	.iterate	= cramfs_readdir,
 };
 
 static const struct inode_operations cramfs_dir_inode_operations = {
@@ -573,6 +568,7 @@ static struct file_system_type cramfs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("cramfs");
 
 static int __init init_cramfs_fs(void)
 {
diff --git a/fs/dcache.c b/fs/dcache.c
index 19153a0a810..87bdb5329c3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -337,23 +337,6 @@ static void dentry_lru_del(struct dentry *dentry)
 	}
 }
 
-/*
- * Remove a dentry that is unreferenced and about to be pruned
- * (unhashed and destroyed) from the LRU, and inform the file system.
- * This wrapper should be called _prior_ to unhashing a victim dentry.
- */
-static void dentry_lru_prune(struct dentry *dentry)
-{
-	if (!list_empty(&dentry->d_lru)) {
-		if (dentry->d_flags & DCACHE_OP_PRUNE)
-			dentry->d_op->d_prune(dentry);
-
-		spin_lock(&dcache_lru_lock);
-		__dentry_lru_del(dentry);
-		spin_unlock(&dcache_lru_lock);
-	}
-}
-
 static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
 {
 	spin_lock(&dcache_lru_lock);
@@ -486,11 +469,13 @@ relock:
 	if (ref)
 		dentry->d_count--;
 	/*
-	 * if dentry was on the d_lru list delete it from there.
 	 * inform the fs via d_prune that this dentry is about to be
 	 * unhashed and destroyed.
 	 */
-	dentry_lru_prune(dentry);
+	if (dentry->d_flags & DCACHE_OP_PRUNE)
+		dentry->d_op->d_prune(dentry);
+
+	dentry_lru_del(dentry);
 	/* if it was on the hash then remove it */
 	__d_drop(dentry);
 	return d_kill(dentry, parent);
@@ -675,11 +660,10 @@ EXPORT_SYMBOL(dget_parent);
 static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
 {
 	struct dentry *alias, *discon_alias;
-	struct hlist_node *p;
 
 again:
 	discon_alias = NULL;
-	hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) {
+	hlist_for_each_entry(alias, &inode->i_dentry, d_alias) {
 		spin_lock(&alias->d_lock);
  		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
 			if (IS_ROOT(alias) &&
@@ -730,10 +714,9 @@ EXPORT_SYMBOL(d_find_alias);
 void d_prune_aliases(struct inode *inode)
 {
 	struct dentry *dentry;
-	struct hlist_node *p;
 restart:
 	spin_lock(&inode->i_lock);
-	hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
+	hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		spin_lock(&dentry->d_lock);
 		if (!dentry->d_count) {
 			__dget_dlock(dentry);
@@ -921,11 +904,13 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 			struct inode *inode;
 
 			/*
-			 * remove the dentry from the lru, and inform
-			 * the fs that this dentry is about to be
+			 * inform the fs that this dentry is about to be
 			 * unhashed and destroyed.
 			 */
-			dentry_lru_prune(dentry);
+			if (dentry->d_flags & DCACHE_OP_PRUNE)
+				dentry->d_op->d_prune(dentry);
+
+			dentry_lru_del(dentry);
 			__d_shrink(dentry);
 
 			if (dentry->d_count != 0) {
@@ -1232,8 +1217,10 @@ void shrink_dcache_parent(struct dentry * parent)
 	LIST_HEAD(dispose);
 	int found;
 
-	while ((found = select_parent(parent, &dispose)) != 0)
+	while ((found = select_parent(parent, &dispose)) != 0) {
 		shrink_dentry_list(&dispose);
+		cond_resched();
+	}
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
 
@@ -1358,6 +1345,7 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 	WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH	|
 				DCACHE_OP_COMPARE	|
 				DCACHE_OP_REVALIDATE	|
+				DCACHE_OP_WEAK_REVALIDATE	|
 				DCACHE_OP_DELETE ));
 	dentry->d_op = op;
 	if (!op)
@@ -1368,6 +1356,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 		dentry->d_flags |= DCACHE_OP_COMPARE;
 	if (op->d_revalidate)
 		dentry->d_flags |= DCACHE_OP_REVALIDATE;
+	if (op->d_weak_revalidate)
+		dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE;
 	if (op->d_delete)
 		dentry->d_flags |= DCACHE_OP_DELETE;
 	if (op->d_prune)
@@ -1440,14 +1430,13 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
 	int len = entry->d_name.len;
 	const char *name = entry->d_name.name;
 	unsigned int hash = entry->d_name.hash;
-	struct hlist_node *p;
 
 	if (!inode) {
 		__d_instantiate(entry, NULL);
 		return NULL;
 	}
 
-	hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) {
+	hlist_for_each_entry(alias, &inode->i_dentry, d_alias) {
 		/*
 		 * Don't need alias->d_lock here, because aliases with
 		 * d_parent == entry->d_parent are not subject to name or
@@ -1623,6 +1612,10 @@ EXPORT_SYMBOL(d_obtain_alias);
  * If a dentry was found and moved, then it is returned.  Otherwise NULL
  * is returned.  This matches the expected return value of ->lookup.
  *
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
  */
 struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 {
@@ -1647,8 +1640,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 			security_d_instantiate(dentry, inode);
 			d_rehash(dentry);
 		}
-	} else
-		d_add(dentry, inode);
+	} else {
+		d_instantiate(dentry, inode);
+		if (d_unhashed(dentry))
+			d_rehash(dentry);
+	}
 	return new;
 }
 EXPORT_SYMBOL(d_splice_alias);
@@ -1672,7 +1668,6 @@ EXPORT_SYMBOL(d_splice_alias);
 struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 			struct qstr *name)
 {
-	int error;
 	struct dentry *found;
 	struct dentry *new;
 
@@ -1681,10 +1676,12 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 	 * if not go ahead and create it now.
 	 */
 	found = d_hash_and_lookup(dentry->d_parent, name);
+	if (unlikely(IS_ERR(found)))
+		goto err_out;
 	if (!found) {
 		new = d_alloc(dentry->d_parent, name);
 		if (!new) {
-			error = -ENOMEM;
+			found = ERR_PTR(-ENOMEM);
 			goto err_out;
 		}
 
@@ -1725,7 +1722,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 
 err_out:
 	iput(inode);
-	return ERR_PTR(error);
+	return found;
 }
 EXPORT_SYMBOL(d_add_ci);
 
@@ -1733,7 +1730,7 @@ EXPORT_SYMBOL(d_add_ci);
  * Do the slow-case of the dentry name compare.
  *
  * Unlike the dentry_cmp() function, we need to atomically
- * load the name, length and inode information, so that the
+ * load the name and length information, so that the
  * filesystem can rely on them, and can use the 'name' and
  * 'len' information without worrying about walking off the
  * end of memory etc.
@@ -1751,22 +1748,18 @@ enum slow_d_compare {
 
 static noinline enum slow_d_compare slow_dentry_cmp(
 		const struct dentry *parent,
-		struct inode *inode,
 		struct dentry *dentry,
 		unsigned int seq,
 		const struct qstr *name)
 {
 	int tlen = dentry->d_name.len;
 	const char *tname = dentry->d_name.name;
-	struct inode *i = dentry->d_inode;
 
 	if (read_seqcount_retry(&dentry->d_seq, seq)) {
 		cpu_relax();
 		return D_COMP_SEQRETRY;
 	}
-	if (parent->d_op->d_compare(parent, inode,
-				dentry, i,
-				tlen, tname, name))
+	if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
 		return D_COMP_NOMATCH;
 	return D_COMP_OK;
 }
@@ -1776,7 +1769,6 @@ static noinline enum slow_d_compare slow_dentry_cmp(
  * @parent: parent dentry
  * @name: qstr of name we wish to find
  * @seqp: returns d_seq value at the point where the dentry was found
- * @inode: returns dentry->d_inode when the inode was found valid.
  * Returns: dentry, or NULL
  *
  * __d_lookup_rcu is the dcache lookup function for rcu-walk name
@@ -1803,7 +1795,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
  */
 struct dentry *__d_lookup_rcu(const struct dentry *parent,
 				const struct qstr *name,
-				unsigned *seqp, struct inode *inode)
+				unsigned *seqp)
 {
 	u64 hashlen = name->hash_len;
 	const unsigned char *str = name->name;
@@ -1837,11 +1829,10 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
 seqretry:
 		/*
 		 * The dentry sequence count protects us from concurrent
-		 * renames, and thus protects inode, parent and name fields.
+		 * renames, and thus protects parent and name fields.
 		 *
 		 * The caller must perform a seqcount check in order
-		 * to do anything useful with the returned dentry,
-		 * including using the 'd_inode' pointer.
+		 * to do anything useful with the returned dentry.
 		 *
 		 * NOTE! We do a "raw" seqcount_begin here. That means that
 		 * we don't wait for the sequence count to stabilize if it
@@ -1855,12 +1846,12 @@ seqretry:
 			continue;
 		if (d_unhashed(dentry))
 			continue;
-		*seqp = seq;
 
 		if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
 			if (dentry->d_name.hash != hashlen_hash(hashlen))
 				continue;
-			switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) {
+			*seqp = seq;
+			switch (slow_dentry_cmp(parent, dentry, seq, name)) {
 			case D_COMP_OK:
 				return dentry;
 			case D_COMP_NOMATCH:
@@ -1872,6 +1863,7 @@ seqretry:
 
 		if (dentry->d_name.hash_len != hashlen)
 			continue;
+		*seqp = seq;
 		if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
 			return dentry;
 	}
@@ -1889,7 +1881,7 @@ seqretry:
  * dentry is returned. The caller must use dput to free the entry when it has
  * finished using it. %NULL is returned if the dentry does not exist.
  */
-struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
+struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
 {
 	struct dentry *dentry;
 	unsigned seq;
@@ -1919,7 +1911,7 @@ EXPORT_SYMBOL(d_lookup);
  *
  * __d_lookup callers must be commented.
  */
-struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
+struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
 {
 	unsigned int len = name->len;
 	unsigned int hash = name->hash;
@@ -1969,9 +1961,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
 		if (parent->d_flags & DCACHE_OP_COMPARE) {
 			int tlen = dentry->d_name.len;
 			const char *tname = dentry->d_name.name;
-			if (parent->d_op->d_compare(parent, parent->d_inode,
-						dentry, dentry->d_inode,
-						tlen, tname, name))
+			if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
 				goto next;
 		} else {
 			if (dentry->d_name.len != len)
@@ -1997,12 +1987,10 @@ next:
  * @dir: Directory to search in
  * @name: qstr of name we wish to find
  *
- * On hash failure or on lookup failure NULL is returned.
+ * On lookup failure NULL is returned; on bad name - ERR_PTR(-error)
  */
 struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
 {
-	struct dentry *dentry = NULL;
-
 	/*
 	 * Check for a fs-specific hash function. Note that we must
 	 * calculate the standard hash first, as the d_op->d_hash()
@@ -2010,13 +1998,13 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
 	 */
 	name->hash = full_name_hash(name->name, name->len);
 	if (dir->d_flags & DCACHE_OP_HASH) {
-		if (dir->d_op->d_hash(dir, dir->d_inode, name) < 0)
-			goto out;
+		int err = dir->d_op->d_hash(dir, name);
+		if (unlikely(err < 0))
+			return ERR_PTR(err);
 	}
-	dentry = d_lookup(dir, name);
-out:
-	return dentry;
+	return d_lookup(dir, name);
 }
+EXPORT_SYMBOL(d_hash_and_lookup);
 
 /**
  * d_validate - verify dentry provided from insecure source (deprecated)
@@ -2394,7 +2382,7 @@ out_err:
  */
 static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 {
-	struct dentry *dparent, *aparent;
+	struct dentry *dparent;
 
 	dentry_lock_for_move(anon, dentry);
 
@@ -2402,24 +2390,14 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 	write_seqcount_begin(&anon->d_seq);
 
 	dparent = dentry->d_parent;
-	aparent = anon->d_parent;
 
 	switch_names(dentry, anon);
 	swap(dentry->d_name.hash, anon->d_name.hash);
 
-	dentry->d_parent = (aparent == anon) ? dentry : aparent;
-	list_del(&dentry->d_u.d_child);
-	if (!IS_ROOT(dentry))
-		list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
-	else
-		INIT_LIST_HEAD(&dentry->d_u.d_child);
-
-	anon->d_parent = (dparent == dentry) ? anon : dparent;
-	list_del(&anon->d_u.d_child);
-	if (!IS_ROOT(anon))
-		list_add(&anon->d_u.d_child, &anon->d_parent->d_subdirs);
-	else
-		INIT_LIST_HEAD(&anon->d_u.d_child);
+	dentry->d_parent = dentry;
+	list_del_init(&dentry->d_u.d_child);
+	anon->d_parent = dparent;
+	list_move(&anon->d_u.d_child, &dparent->d_subdirs);
 
 	write_seqcount_end(&dentry->d_seq);
 	write_seqcount_end(&anon->d_seq);
@@ -2552,7 +2530,6 @@ static int prepend_path(const struct path *path,
 	bool slash = false;
 	int error = 0;
 
-	br_read_lock(&vfsmount_lock);
 	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
 
@@ -2582,8 +2559,6 @@ static int prepend_path(const struct path *path,
 	if (!error && !slash)
 		error = prepend(buffer, buflen, "/", 1);
 
-out:
-	br_read_unlock(&vfsmount_lock);
 	return error;
 
 global_root:
@@ -2600,7 +2575,7 @@ global_root:
 		error = prepend(buffer, buflen, "/", 1);
 	if (!error)
 		error = is_mounted(vfsmnt) ? 1 : 2;
-	goto out;
+	return error;
 }
 
 /**
@@ -2627,9 +2602,11 @@ char *__d_path(const struct path *path,
 	int error;
 
 	prepend(&res, &buflen, "\0", 1);
+	br_read_lock(&vfsmount_lock);
 	write_seqlock(&rename_lock);
 	error = prepend_path(path, root, &res, &buflen);
 	write_sequnlock(&rename_lock);
+	br_read_unlock(&vfsmount_lock);
 
 	if (error < 0)
 		return ERR_PTR(error);
@@ -2646,9 +2623,11 @@ char *d_absolute_path(const struct path *path,
 	int error;
 
 	prepend(&res, &buflen, "\0", 1);
+	br_read_lock(&vfsmount_lock);
 	write_seqlock(&rename_lock);
 	error = prepend_path(path, &root, &res, &buflen);
 	write_sequnlock(&rename_lock);
+	br_read_unlock(&vfsmount_lock);
 
 	if (error > 1)
 		error = -EINVAL;
@@ -2712,47 +2691,18 @@ char *d_path(const struct path *path, char *buf, int buflen)
 		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
 
 	get_fs_root(current->fs, &root);
+	br_read_lock(&vfsmount_lock);
 	write_seqlock(&rename_lock);
 	error = path_with_deleted(path, &root, &res, &buflen);
+	write_sequnlock(&rename_lock);
+	br_read_unlock(&vfsmount_lock);
 	if (error < 0)
 		res = ERR_PTR(error);
-	write_sequnlock(&rename_lock);
 	path_put(&root);
 	return res;
 }
 EXPORT_SYMBOL(d_path);
 
-/**
- * d_path_with_unreachable - return the path of a dentry
- * @path: path to report
- * @buf: buffer to return value in
- * @buflen: buffer length
- *
- * The difference from d_path() is that this prepends "(unreachable)"
- * to paths which are unreachable from the current process' root.
- */
-char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
-{
-	char *res = buf + buflen;
-	struct path root;
-	int error;
-
-	if (path->dentry->d_op && path->dentry->d_op->d_dname)
-		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
-
-	get_fs_root(current->fs, &root);
-	write_seqlock(&rename_lock);
-	error = path_with_deleted(path, &root, &res, &buflen);
-	if (error > 0)
-		error = prepend_unreachable(&res, &buflen);
-	write_sequnlock(&rename_lock);
-	path_put(&root);
-	if (error)
-		res =  ERR_PTR(error);
-
-	return res;
-}
-
 /*
  * Helper function for dentry_operations.d_dname() members
  */
@@ -2871,6 +2821,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 	get_fs_root_and_pwd(current->fs, &root, &pwd);
 
 	error = -ENOENT;
+	br_read_lock(&vfsmount_lock);
 	write_seqlock(&rename_lock);
 	if (!d_unlinked(pwd.dentry)) {
 		unsigned long len;
@@ -2880,6 +2831,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 		prepend(&cwd, &buflen, "\0", 1);
 		error = prepend_path(&pwd, &root, &cwd, &buflen);
 		write_sequnlock(&rename_lock);
+		br_read_unlock(&vfsmount_lock);
 
 		if (error < 0)
 			goto out;
@@ -2900,6 +2852,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 		}
 	} else {
 		write_sequnlock(&rename_lock);
+		br_read_unlock(&vfsmount_lock);
 	}
 
 out:
@@ -3015,34 +2968,21 @@ rename_retry:
 	goto again;
 }
 
-/**
- * find_inode_number - check for dentry with name
- * @dir: directory to check
- * @name: Name to find.
- *
- * Check whether a dentry already exists for the given name,
- * and return the inode number if it has an inode. Otherwise
- * 0 is returned.
- *
- * This routine is used to post-process directory listings for
- * filesystems using synthetic inode numbers, and is necessary
- * to keep getcwd() working.
- */
- 
-ino_t find_inode_number(struct dentry *dir, struct qstr *name)
+void d_tmpfile(struct dentry *dentry, struct inode *inode)
 {
-	struct dentry * dentry;
-	ino_t ino = 0;
-
-	dentry = d_hash_and_lookup(dir, name);
-	if (dentry) {
-		if (dentry->d_inode)
-			ino = dentry->d_inode->i_ino;
-		dput(dentry);
-	}
-	return ino;
+	inode_dec_link_count(inode);
+	BUG_ON(dentry->d_name.name != dentry->d_iname ||
+		!hlist_unhashed(&dentry->d_alias) ||
+		!d_unlinked(dentry));
+	spin_lock(&dentry->d_parent->d_lock);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+	dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
+				(unsigned long long)inode->i_ino);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&dentry->d_parent->d_lock);
+	d_instantiate(dentry, inode);
 }
-EXPORT_SYMBOL(find_inode_number);
+EXPORT_SYMBOL(d_tmpfile);
 
 static __initdata unsigned long dhash_entries;
 static int __init set_dhash_entries(char *str)
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 17c77996782..ab5954b5026 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -25,6 +25,7 @@
 #include <linux/dcookies.h>
 #include <linux/mutex.h>
 #include <linux/path.h>
+#include <linux/compat.h>
 #include <asm/uaccess.h>
 
 /* The dcookies are allocated from a kmem_cache and
@@ -145,7 +146,7 @@ out:
 /* And here is where the userspace process can look up the cookie value
  * to retrieve the path.
  */
-SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len)
+SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len)
 {
 	unsigned long cookie = (unsigned long)cookie64;
 	int err = -EINVAL;
@@ -201,12 +202,16 @@ out:
 	mutex_unlock(&dcookie_mutex);
 	return err;
 }
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_lookup_dcookie(u64 cookie64, long buf, long len)
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len)
 {
-	return SYSC_lookup_dcookie(cookie64, (char __user *) buf, (size_t) len);
+#ifdef __BIG_ENDIAN
+	return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len);
+#else
+	return sys_lookup_dcookie(((u64)w1 << 32) | w0, buf, len);
+#endif
 }
-SYSCALL_ALIAS(sys_lookup_dcookie, SyS_lookup_dcookie);
 #endif
 
 static int dcookie_init(void)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index c5ca6ae5a30..63146295153 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -21,6 +21,7 @@
 #include <linux/debugfs.h>
 #include <linux/io.h>
 #include <linux/slab.h>
+#include <linux/atomic.h>
 
 static ssize_t default_read_file(struct file *file, char __user *buf,
 				 size_t count, loff_t *ppos)
@@ -403,6 +404,47 @@ struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_size_t);
 
+static int debugfs_atomic_t_set(void *data, u64 val)
+{
+	atomic_set((atomic_t *)data, val);
+	return 0;
+}
+static int debugfs_atomic_t_get(void *data, u64 *val)
+{
+	*val = atomic_read((atomic_t *)data);
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
+			debugfs_atomic_t_set, "%lld\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
+
+/**
+ * debugfs_create_atomic_t - create a debugfs file that is used to read and
+ * write an atomic_t value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ *         from.
+ */
+struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
+				 struct dentry *parent, atomic_t *value)
+{
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value,
+					&fops_atomic_t_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value,
+					&fops_atomic_t_wo);
+
+	return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
 
 static ssize_t read_file_bool(struct file *file, char __user *user_buf,
 			      size_t count, loff_t *ppos)
@@ -431,6 +473,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
 	if (copy_from_user(buf, user_buf, buf_size))
 		return -EFAULT;
 
+	buf[buf_size] = '\0';
 	if (strtobool(buf, &bv) == 0)
 		*val = bv;
 
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index a5f12b7e228..4888cb3fdef 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -299,6 +299,7 @@ static struct file_system_type debug_fs_type = {
 	.mount =	debug_mount,
 	.kill_sb =	kill_litter_super,
 };
+MODULE_ALIAS_FS("debugfs");
 
 static struct dentry *__create_file(const char *name, umode_t mode,
 				    struct dentry *parent, void *data,
@@ -322,7 +323,6 @@ static struct dentry *__create_file(const char *name, umode_t mode,
 	if (!parent)
 		parent = debugfs_mount->mnt_root;
 
-	dentry = NULL;
 	mutex_lock(&parent->d_inode->i_mutex);
 	dentry = lookup_one_len(name, parent, strlen(name));
 	if (!IS_ERR(dentry)) {
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 472e6befc54..073d30b9d1a 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -243,6 +243,13 @@ static int mknod_ptmx(struct super_block *sb)
 	struct dentry *root = sb->s_root;
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 	struct pts_mount_opts *opts = &fsi->mount_opts;
+	kuid_t root_uid;
+	kgid_t root_gid;
+
+	root_uid = make_kuid(current_user_ns(), 0);
+	root_gid = make_kgid(current_user_ns(), 0);
+	if (!uid_valid(root_uid) || !gid_valid(root_gid))
+		return -EINVAL;
 
 	mutex_lock(&root->d_inode->i_mutex);
 
@@ -273,6 +280,8 @@ static int mknod_ptmx(struct super_block *sb)
 
 	mode = S_IFCHR|opts->ptmxmode;
 	init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
+	inode->i_uid = root_uid;
+	inode->i_gid = root_gid;
 
 	d_add(dentry, inode);
 
@@ -438,6 +447,12 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
 	if (error)
 		return ERR_PTR(error);
 
+	/* Require newinstance for all user namespace mounts to ensure
+	 * the mount options are not changed.
+	 */
+	if ((current_user_ns() != &init_user_ns) && !opts.newinstance)
+		return ERR_PTR(-EINVAL);
+
 	if (opts.newinstance)
 		s = sget(fs_type, NULL, set_anon_super, flags, NULL);
 	else
@@ -491,6 +506,9 @@ static struct file_system_type devpts_fs_type = {
 	.name		= "devpts",
 	.mount		= devpts_mount,
 	.kill_sb	= devpts_kill_sb,
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+	.fs_flags	= FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
+#endif
 };
 
 /*
diff --git a/fs/direct-io.c b/fs/direct-io.c
index cf5b44b10c6..7ab90f5081e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,6 +37,7 @@
 #include <linux/uio.h>
 #include <linux/atomic.h>
 #include <linux/prefetch.h>
+#include <linux/aio.h>
 
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
@@ -261,9 +262,9 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
 		dio->end_io(dio->iocb, offset, transferred,
 			    dio->private, ret, is_async);
 	} else {
+		inode_dio_done(dio->inode);
 		if (is_async)
 			aio_complete(dio->iocb, ret, 0);
-		inode_dio_done(dio->inode);
 	}
 
 	return ret;
@@ -441,8 +442,8 @@ static struct bio *dio_await_one(struct dio *dio)
 static int dio_bio_complete(struct dio *dio, struct bio *bio)
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec;
-	int page_no;
+	struct bio_vec *bvec;
+	unsigned i;
 
 	if (!uptodate)
 		dio->io_error = -EIO;
@@ -450,8 +451,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 	if (dio->is_async && dio->rw == READ) {
 		bio_check_pages_dirty(bio);	/* transfers ownership */
 	} else {
-		for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
-			struct page *page = bvec[page_no].bv_page;
+		bio_for_each_segment_all(bvec, bio, i) {
+			struct page *page = bvec->bv_page;
 
 			if (dio->rw == READ && !PageCompound(page))
 				set_page_dirty_lock(page);
@@ -672,12 +673,6 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
 		if (sdio->final_block_in_bio != sdio->cur_page_block ||
 		    cur_offset != bio_next_offset)
 			dio_bio_submit(dio, sdio);
-		/*
-		 * Submit now if the underlying fs is about to perform a
-		 * metadata read
-		 */
-		else if (sdio->boundary)
-			dio_bio_submit(dio, sdio);
 	}
 
 	if (sdio->bio == NULL) {
@@ -737,16 +732,6 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 	    sdio->cur_page_block +
 	    (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
 		sdio->cur_page_len += len;
-
-		/*
-		 * If sdio->boundary then we want to schedule the IO now to
-		 * avoid metadata seeks.
-		 */
-		if (sdio->boundary) {
-			ret = dio_send_cur_page(dio, sdio, map_bh);
-			page_cache_release(sdio->cur_page);
-			sdio->cur_page = NULL;
-		}
 		goto out;
 	}
 
@@ -758,7 +743,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 		page_cache_release(sdio->cur_page);
 		sdio->cur_page = NULL;
 		if (ret)
-			goto out;
+			return ret;
 	}
 
 	page_cache_get(page);		/* It is in dio */
@@ -768,6 +753,16 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 	sdio->cur_page_block = blocknr;
 	sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
 out:
+	/*
+	 * If sdio->boundary then we want to schedule the IO now to
+	 * avoid metadata seeks.
+	 */
+	if (sdio->boundary) {
+		ret = dio_send_cur_page(dio, sdio, map_bh);
+		dio_bio_submit(dio, sdio);
+		page_cache_release(sdio->cur_page);
+		sdio->cur_page = NULL;
+	}
 	return ret;
 }
 
@@ -969,7 +964,8 @@ do_holes:
 			this_chunk_bytes = this_chunk_blocks << blkbits;
 			BUG_ON(this_chunk_bytes == 0);
 
-			sdio->boundary = buffer_boundary(map_bh);
+			if (this_chunk_blocks == sdio->blocks_available)
+				sdio->boundary = buffer_boundary(map_bh);
 			ret = submit_page_section(dio, sdio, page,
 						  offset_in_page,
 						  this_chunk_bytes,
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index a0387dd8b1f..76feb4b60fa 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -138,8 +138,9 @@ static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
 static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
 					  const char *buf, size_t len)
 {
-	strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN);
-	strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN);
+	strlcpy(dlm_config.ci_cluster_name, buf,
+				sizeof(dlm_config.ci_cluster_name));
+	strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
 	return len;
 }
 
@@ -158,7 +159,7 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
 	unsigned int x;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
+		return -EPERM;
 
 	x = simple_strtoul(buf, NULL, 0);
 
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 77c0f70f8fe..e7665c31f7b 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -96,10 +96,13 @@ do { \
 }
 
 
+#define DLM_RTF_SHRINK		0x00000001
+
 struct dlm_rsbtable {
 	struct rb_root		keep;
 	struct rb_root		toss;
 	spinlock_t		lock;
+	uint32_t		flags;
 };
 
 
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index a579f30f237..e223a911a83 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1132,6 +1132,7 @@ static void toss_rsb(struct kref *kref)
 	rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep);
 	rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss);
 	r->res_toss_time = jiffies;
+	ls->ls_rsbtbl[r->res_bucket].flags |= DLM_RTF_SHRINK;
 	if (r->res_lvbptr) {
 		dlm_free_lvb(r->res_lvbptr);
 		r->res_lvbptr = NULL;
@@ -1182,7 +1183,7 @@ static void detach_lkb(struct dlm_lkb *lkb)
 static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
 {
 	struct dlm_lkb *lkb;
-	int rv, id;
+	int rv;
 
 	lkb = dlm_allocate_lkb(ls);
 	if (!lkb)
@@ -1198,19 +1199,13 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
 	mutex_init(&lkb->lkb_cb_mutex);
 	INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work);
 
- retry:
-	rv = idr_pre_get(&ls->ls_lkbidr, GFP_NOFS);
-	if (!rv)
-		return -ENOMEM;
-
+	idr_preload(GFP_NOFS);
 	spin_lock(&ls->ls_lkbidr_spin);
-	rv = idr_get_new_above(&ls->ls_lkbidr, lkb, 1, &id);
-	if (!rv)
-		lkb->lkb_id = id;
+	rv = idr_alloc(&ls->ls_lkbidr, lkb, 1, 0, GFP_NOWAIT);
+	if (rv >= 0)
+		lkb->lkb_id = rv;
 	spin_unlock(&ls->ls_lkbidr_spin);
-
-	if (rv == -EAGAIN)
-		goto retry;
+	idr_preload_end();
 
 	if (rv < 0) {
 		log_error(ls, "create_lkb idr error %d", rv);
@@ -1659,11 +1654,18 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
 	char *name;
 	int our_nodeid = dlm_our_nodeid();
 	int remote_count = 0;
+	int need_shrink = 0;
 	int i, len, rv;
 
 	memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX);
 
 	spin_lock(&ls->ls_rsbtbl[b].lock);
+
+	if (!(ls->ls_rsbtbl[b].flags & DLM_RTF_SHRINK)) {
+		spin_unlock(&ls->ls_rsbtbl[b].lock);
+		return;
+	}
+
 	for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) {
 		next = rb_next(n);
 		r = rb_entry(n, struct dlm_rsb, res_hashnode);
@@ -1679,6 +1681,8 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
 			continue;
 		}
 
+		need_shrink = 1;
+
 		if (!time_after_eq(jiffies, r->res_toss_time +
 				   dlm_config.ci_toss_secs * HZ)) {
 			continue;
@@ -1710,6 +1714,11 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
 		rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
 		dlm_free_rsb(r);
 	}
+
+	if (need_shrink)
+		ls->ls_rsbtbl[b].flags |= DLM_RTF_SHRINK;
+	else
+		ls->ls_rsbtbl[b].flags &= ~DLM_RTF_SHRINK;
 	spin_unlock(&ls->ls_rsbtbl[b].lock);
 
 	/*
@@ -2029,8 +2038,8 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
 	b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
 	if (b == 1) {
 		int len = receive_extralen(ms);
-		if (len > DLM_RESNAME_MAXLEN)
-			len = DLM_RESNAME_MAXLEN;
+		if (len > r->res_ls->ls_lvblen)
+			len = r->res_ls->ls_lvblen;
 		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
 		lkb->lkb_lvbseq = ms->m_lvbseq;
 	}
@@ -3884,8 +3893,8 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
 		if (!lkb->lkb_lvbptr)
 			return -ENOMEM;
 		len = receive_extralen(ms);
-		if (len > DLM_RESNAME_MAXLEN)
-			len = DLM_RESNAME_MAXLEN;
+		if (len > ls->ls_lvblen)
+			len = ls->ls_lvblen;
 		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
 	}
 	return 0;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 2e99fb0c973..88556dc0458 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -796,7 +796,6 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 	 */
 
 	idr_for_each(&ls->ls_lkbidr, lkb_idr_free, ls);
-	idr_remove_all(&ls->ls_lkbidr);
 	idr_destroy(&ls->ls_lkbidr);
 
 	/*
@@ -884,17 +883,24 @@ int dlm_release_lockspace(void *lockspace, int force)
 void dlm_stop_lockspaces(void)
 {
 	struct dlm_ls *ls;
+	int count;
 
  restart:
+	count = 0;
 	spin_lock(&lslist_lock);
 	list_for_each_entry(ls, &lslist, ls_list) {
-		if (!test_bit(LSFL_RUNNING, &ls->ls_flags))
+		if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) {
+			count++;
 			continue;
+		}
 		spin_unlock(&lslist_lock);
 		log_error(ls, "no userland control daemon, stopping lockspace");
 		dlm_ls_stop(ls);
 		goto restart;
 	}
 	spin_unlock(&lslist_lock);
+
+	if (count)
+		log_print("dlm user daemon left %d lockspaces", count);
 }
 
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index dd87a31bcc2..d90909ec6aa 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -53,7 +53,6 @@
 #include <linux/sctp.h>
 #include <linux/slab.h>
 #include <net/sctp/sctp.h>
-#include <net/sctp/user.h>
 #include <net/ipv6.h>
 
 #include "dlm_internal.h"
@@ -126,6 +125,7 @@ struct connection {
 	struct connection *othercon;
 	struct work_struct rwork; /* Receive workqueue */
 	struct work_struct swork; /* Send workqueue */
+	bool try_new_addr;
 };
 #define sock2con(x) ((struct connection *)(x)->sk_user_data)
 
@@ -144,6 +144,7 @@ struct dlm_node_addr {
 	struct list_head list;
 	int nodeid;
 	int addr_count;
+	int curr_addr_index;
 	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
 };
 
@@ -177,12 +178,11 @@ static inline int nodeid_hash(int nodeid)
 static struct connection *__find_con(int nodeid)
 {
 	int r;
-	struct hlist_node *h;
 	struct connection *con;
 
 	r = nodeid_hash(nodeid);
 
-	hlist_for_each_entry(con, h, &connection_hash[r], list) {
+	hlist_for_each_entry(con, &connection_hash[r], list) {
 		if (con->nodeid == nodeid)
 			return con;
 	}
@@ -232,13 +232,12 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
 static void foreach_conn(void (*conn_func)(struct connection *c))
 {
 	int i;
-	struct hlist_node *h, *n;
+	struct hlist_node *n;
 	struct connection *con;
 
 	for (i = 0; i < CONN_HASH_SIZE; i++) {
-		hlist_for_each_entry_safe(con, h, n, &connection_hash[i], list){
+		hlist_for_each_entry_safe(con, n, &connection_hash[i], list)
 			conn_func(con);
-		}
 	}
 }
 
@@ -257,13 +256,12 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation)
 static struct connection *assoc2con(int assoc_id)
 {
 	int i;
-	struct hlist_node *h;
 	struct connection *con;
 
 	mutex_lock(&connections_lock);
 
 	for (i = 0 ; i < CONN_HASH_SIZE; i++) {
-		hlist_for_each_entry(con, h, &connection_hash[i], list) {
+		hlist_for_each_entry(con, &connection_hash[i], list) {
 			if (con->sctp_assoc == assoc_id) {
 				mutex_unlock(&connections_lock);
 				return con;
@@ -313,7 +311,7 @@ static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
 }
 
 static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
-			  struct sockaddr *sa_out)
+			  struct sockaddr *sa_out, bool try_new_addr)
 {
 	struct sockaddr_storage sas;
 	struct dlm_node_addr *na;
@@ -323,8 +321,16 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
 
 	spin_lock(&dlm_node_addrs_spin);
 	na = find_node_addr(nodeid);
-	if (na && na->addr_count)
-		memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage));
+	if (na && na->addr_count) {
+		if (try_new_addr) {
+			na->curr_addr_index++;
+			if (na->curr_addr_index == na->addr_count)
+				na->curr_addr_index = 0;
+		}
+
+		memcpy(&sas, na->addr[na->curr_addr_index ],
+			sizeof(struct sockaddr_storage));
+	}
 	spin_unlock(&dlm_node_addrs_spin);
 
 	if (!na)
@@ -356,19 +362,22 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
 {
 	struct dlm_node_addr *na;
 	int rv = -EEXIST;
+	int addr_i;
 
 	spin_lock(&dlm_node_addrs_spin);
 	list_for_each_entry(na, &dlm_node_addrs, list) {
 		if (!na->addr_count)
 			continue;
 
-		if (!addr_compare(na->addr[0], addr))
-			continue;
-
-		*nodeid = na->nodeid;
-		rv = 0;
-		break;
+		for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
+			if (addr_compare(na->addr[addr_i], addr)) {
+				*nodeid = na->nodeid;
+				rv = 0;
+				goto unlock;
+			}
+		}
 	}
+unlock:
 	spin_unlock(&dlm_node_addrs_spin);
 	return rv;
 }
@@ -564,8 +573,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd)
 
 static void sctp_init_failed_foreach(struct connection *con)
 {
+
+	/*
+	 * Don't try to recover base con and handle race where the
+	 * other node's assoc init creates a assoc and we get that
+	 * notification, then we get a notification that our attempt
+	 * failed due. This happens when we are still trying the primary
+	 * address, but the other node has already tried secondary addrs
+	 * and found one that worked.
+	 */
+	if (!con->nodeid || con->sctp_assoc)
+		return;
+
+	log_print("Retrying SCTP association init for node %d\n", con->nodeid);
+
+	con->try_new_addr = true;
 	con->sctp_assoc = 0;
-	if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
+	if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) {
 		if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
 			queue_work(send_workqueue, &con->swork);
 	}
@@ -582,15 +606,56 @@ static void sctp_init_failed(void)
 	mutex_unlock(&connections_lock);
 }
 
+static void retry_failed_sctp_send(struct connection *recv_con,
+				   struct sctp_send_failed *sn_send_failed,
+				   char *buf)
+{
+	int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed);
+	struct dlm_mhandle *mh;
+	struct connection *con;
+	char *retry_buf;
+	int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
+
+	log_print("Retry sending %d bytes to node id %d", len, nodeid);
+
+	con = nodeid2con(nodeid, 0);
+	if (!con) {
+		log_print("Could not look up con for nodeid %d\n",
+			  nodeid);
+		return;
+	}
+
+	mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf);
+	if (!mh) {
+		log_print("Could not allocate buf for retry.");
+		return;
+	}
+	memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len);
+	dlm_lowcomms_commit_buffer(mh);
+
+	/*
+	 * If we got a assoc changed event before the send failed event then
+	 * we only need to retry the send.
+	 */
+	if (con->sctp_assoc) {
+		if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+			queue_work(send_workqueue, &con->swork);
+	} else
+		sctp_init_failed_foreach(con);
+}
+
 /* Something happened to an association */
 static void process_sctp_notification(struct connection *con,
 				      struct msghdr *msg, char *buf)
 {
 	union sctp_notification *sn = (union sctp_notification *)buf;
 
-	if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+	switch (sn->sn_header.sn_type) {
+	case SCTP_SEND_FAILED:
+		retry_failed_sctp_send(con, &sn->sn_send_failed, buf);
+		break;
+	case SCTP_ASSOC_CHANGE:
 		switch (sn->sn_assoc_change.sac_state) {
-
 		case SCTP_COMM_UP:
 		case SCTP_RESTART:
 		{
@@ -665,9 +730,11 @@ static void process_sctp_notification(struct connection *con,
 			log_print("connecting to %d sctp association %d",
 				 nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
 
+			new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id;
+			new_con->try_new_addr = false;
 			/* Send any pending writes */
 			clear_bit(CF_CONNECT_PENDING, &new_con->flags);
-			clear_bit(CF_INIT_PENDING, &con->flags);
+			clear_bit(CF_INIT_PENDING, &new_con->flags);
 			if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
 				queue_work(send_workqueue, &new_con->swork);
 			}
@@ -686,14 +753,10 @@ static void process_sctp_notification(struct connection *con,
 		}
 		break;
 
-		/* We don't know which INIT failed, so clear the PENDING flags
-		 * on them all.  if assoc_id is zero then it will then try
-		 * again */
-
 		case SCTP_CANT_STR_ASSOC:
 		{
+			/* Will retry init when we get the send failed notification */
 			log_print("Can't start SCTP association - retrying");
-			sctp_init_failed();
 		}
 		break;
 
@@ -702,6 +765,8 @@ static void process_sctp_notification(struct connection *con,
 				  (int)sn->sn_assoc_change.sac_assoc_id,
 				  sn->sn_assoc_change.sac_state);
 		}
+	default:
+		; /* fall through */
 	}
 }
 
@@ -961,6 +1026,24 @@ static void free_entry(struct writequeue_entry *e)
 	kfree(e);
 }
 
+/*
+ * writequeue_entry_complete - try to delete and free write queue entry
+ * @e: write queue entry to try to delete
+ * @completed: bytes completed
+ *
+ * writequeue_lock must be held.
+ */
+static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
+{
+	e->offset += completed;
+	e->len -= completed;
+
+	if (e->len == 0 && e->users == 0) {
+		list_del(&e->list);
+		free_entry(e);
+	}
+}
+
 /* Initiate an SCTP association.
    This is a special case of send_to_sock() in that we don't yet have a
    peeled-off socket for this association, so we use the listening socket
@@ -980,15 +1063,14 @@ static void sctp_init_assoc(struct connection *con)
 	int addrlen;
 	struct kvec iov[1];
 
+	mutex_lock(&con->sock_mutex);
 	if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
-		return;
-
-	if (con->retries++ > MAX_CONNECT_RETRIES)
-		return;
+		goto unlock;
 
-	if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) {
+	if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr,
+			   con->try_new_addr)) {
 		log_print("no address for nodeid %d", con->nodeid);
-		return;
+		goto unlock;
 	}
 	base_con = nodeid2con(0, 0);
 	BUG_ON(base_con == NULL);
@@ -1006,17 +1088,25 @@ static void sctp_init_assoc(struct connection *con)
 	if (list_empty(&con->writequeue)) {
 		spin_unlock(&con->writequeue_lock);
 		log_print("writequeue empty for nodeid %d", con->nodeid);
-		return;
+		goto unlock;
 	}
 
 	e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
 	len = e->len;
 	offset = e->offset;
-	spin_unlock(&con->writequeue_lock);
 
 	/* Send the first block off the write queue */
 	iov[0].iov_base = page_address(e->page)+offset;
 	iov[0].iov_len = len;
+	spin_unlock(&con->writequeue_lock);
+
+	if (rem_addr.ss_family == AF_INET) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr;
+		log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr);
+	} else {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr;
+		log_print("Trying to connect to %pI6", &sin6->sin6_addr);
+	}
 
 	cmsg = CMSG_FIRSTHDR(&outmessage);
 	cmsg->cmsg_level = IPPROTO_SCTP;
@@ -1024,8 +1114,9 @@ static void sctp_init_assoc(struct connection *con)
 	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
 	sinfo = CMSG_DATA(cmsg);
 	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
-	sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid());
+	sinfo->sinfo_ppid = cpu_to_le32(con->nodeid);
 	outmessage.msg_controllen = cmsg->cmsg_len;
+	sinfo->sinfo_flags |= SCTP_ADDR_OVER;
 
 	ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
 	if (ret < 0) {
@@ -1038,15 +1129,12 @@ static void sctp_init_assoc(struct connection *con)
 	}
 	else {
 		spin_lock(&con->writequeue_lock);
-		e->offset += ret;
-		e->len -= ret;
-
-		if (e->len == 0 && e->users == 0) {
-			list_del(&e->list);
-			free_entry(e);
-		}
+		writequeue_entry_complete(e, ret);
 		spin_unlock(&con->writequeue_lock);
 	}
+
+unlock:
+	mutex_unlock(&con->sock_mutex);
 }
 
 /* Connect a new socket to its peer */
@@ -1078,7 +1166,7 @@ static void tcp_connect_to_sock(struct connection *con)
 		goto out_err;
 
 	memset(&saddr, 0, sizeof(saddr));
-	result = nodeid_to_addr(con->nodeid, &saddr, NULL);
+	result = nodeid_to_addr(con->nodeid, &saddr, NULL, false);
 	if (result < 0) {
 		log_print("no address for nodeid %d", con->nodeid);
 		goto out_err;
@@ -1257,6 +1345,7 @@ static int sctp_listen_for_all(void)
 	int result = -EINVAL, num = 1, i, addr_len;
 	struct connection *con = nodeid2con(0, GFP_NOFS);
 	int bufsize = NEEDED_RMEM;
+	int one = 1;
 
 	if (!con)
 		return -ENOMEM;
@@ -1291,6 +1380,11 @@ static int sctp_listen_for_all(void)
 		goto create_delsock;
 	}
 
+	result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
+				   sizeof(one));
+	if (result < 0)
+		log_print("Could not set SCTP NODELAY error %d\n", result);
+
 	/* Init con struct */
 	sock->sk->sk_user_data = con;
 	con->sock = sock;
@@ -1496,13 +1590,7 @@ static void send_to_sock(struct connection *con)
 		}
 
 		spin_lock(&con->writequeue_lock);
-		e->offset += ret;
-		e->len -= ret;
-
-		if (e->len == 0 && e->users == 0) {
-			list_del(&e->list);
-			free_entry(e);
-		}
+		writequeue_entry_complete(e, ret);
 	}
 	spin_unlock(&con->writequeue_lock);
 out:
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 01fd5c11a7f..f704458ea5f 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -247,6 +247,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	struct dlm_ls *ls;
 	struct plock_op *op;
 	int rv;
+	unsigned char fl_flags = fl->fl_flags;
 
 	ls = dlm_find_lockspace_local(lockspace);
 	if (!ls)
@@ -258,9 +259,18 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 		goto out;
 	}
 
-	if (posix_lock_file_wait(file, fl) < 0)
-		log_error(ls, "dlm_posix_unlock: vfs unlock error %llx",
-			  (unsigned long long)number);
+	/* cause the vfs unlock to return ENOENT if lock is not found */
+	fl->fl_flags |= FL_EXISTS;
+
+	rv = posix_lock_file_wait(file, fl);
+	if (rv == -ENOENT) {
+		rv = 0;
+		goto out_free;
+	}
+	if (rv < 0) {
+		log_error(ls, "dlm_posix_unlock: vfs unlock error %d %llx",
+			  rv, (unsigned long long)number);
+	}
 
 	op->info.optype		= DLM_PLOCK_OP_UNLOCK;
 	op->info.pid		= fl->fl_pid;
@@ -296,9 +306,11 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	if (rv == -ENOENT)
 		rv = 0;
 
+out_free:
 	kfree(op);
 out:
 	dlm_put_lockspace(ls);
+	fl->fl_flags = fl_flags;
 	return rv;
 }
 EXPORT_SYMBOL_GPL(dlm_posix_unlock);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index aedea28a86a..a6bc63f6e31 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -305,27 +305,26 @@ static int recover_idr_empty(struct dlm_ls *ls)
 static int recover_idr_add(struct dlm_rsb *r)
 {
 	struct dlm_ls *ls = r->res_ls;
-	int rv, id;
-
-	rv = idr_pre_get(&ls->ls_recover_idr, GFP_NOFS);
-	if (!rv)
-		return -ENOMEM;
+	int rv;
 
+	idr_preload(GFP_NOFS);
 	spin_lock(&ls->ls_recover_idr_lock);
 	if (r->res_id) {
-		spin_unlock(&ls->ls_recover_idr_lock);
-		return -1;
-	}
-	rv = idr_get_new_above(&ls->ls_recover_idr, r, 1, &id);
-	if (rv) {
-		spin_unlock(&ls->ls_recover_idr_lock);
-		return rv;
+		rv = -1;
+		goto out_unlock;
 	}
-	r->res_id = id;
+	rv = idr_alloc(&ls->ls_recover_idr, r, 1, 0, GFP_NOWAIT);
+	if (rv < 0)
+		goto out_unlock;
+
+	r->res_id = rv;
 	ls->ls_recover_list_count++;
 	dlm_hold_rsb(r);
+	rv = 0;
+out_unlock:
 	spin_unlock(&ls->ls_recover_idr_lock);
-	return 0;
+	idr_preload_end();
+	return rv;
 }
 
 static void recover_idr_del(struct dlm_rsb *r)
@@ -351,24 +350,21 @@ static struct dlm_rsb *recover_idr_find(struct dlm_ls *ls, uint64_t id)
 	return r;
 }
 
-static int recover_idr_clear_rsb(int id, void *p, void *data)
+static void recover_idr_clear(struct dlm_ls *ls)
 {
-	struct dlm_ls *ls = data;
-	struct dlm_rsb *r = p;
+	struct dlm_rsb *r;
+	int id;
 
-	r->res_id = 0;
-	r->res_recover_locks_count = 0;
-	ls->ls_recover_list_count--;
+	spin_lock(&ls->ls_recover_idr_lock);
 
-	dlm_put_rsb(r);
-	return 0;
-}
+	idr_for_each_entry(&ls->ls_recover_idr, r, id) {
+		idr_remove(&ls->ls_recover_idr, id);
+		r->res_id = 0;
+		r->res_recover_locks_count = 0;
+		ls->ls_recover_list_count--;
 
-static void recover_idr_clear(struct dlm_ls *ls)
-{
-	spin_lock(&ls->ls_recover_idr_lock);
-	idr_for_each(&ls->ls_recover_idr, recover_idr_clear_rsb, ls);
-	idr_remove_all(&ls->ls_recover_idr);
+		dlm_put_rsb(r);
+	}
 
 	if (ls->ls_recover_list_count != 0) {
 		log_error(ls, "warning: recover_list_count %d",
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index cc16562654d..434aa313f07 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,6 +1,6 @@
 config ECRYPT_FS
-	tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
+	tristate "eCrypt filesystem layer support"
+	depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
 	select CRYPTO_ECB
 	select CRYPTO_CBC
 	select CRYPTO_MD5
@@ -12,3 +12,11 @@ config ECRYPT_FS
 
 	  To compile this file system support as a module, choose M here: the
 	  module will be called ecryptfs.
+
+config ECRYPT_FS_MESSAGING
+	bool "Enable notifications for userspace key wrap/unwrap"
+	depends on ECRYPT_FS
+	help
+	  Enables the /dev/ecryptfs entry for use by ecryptfsd. This allows
+	  for userspace to wrap/unwrap file encryption keys by other
+	  backends, like OpenSSL.
diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile
index 2cc9ee4ad2e..49678a69947 100644
--- a/fs/ecryptfs/Makefile
+++ b/fs/ecryptfs/Makefile
@@ -1,7 +1,10 @@
 #
-# Makefile for the Linux 2.6 eCryptfs
+# Makefile for the Linux eCryptfs
 #
 
 obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o
 
-ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o miscdev.o kthread.o debug.o
+ecryptfs-y := dentry.o file.o inode.o main.o super.o mmap.o read_write.o \
+	      crypto.o keystore.o kthread.o debug.o
+
+ecryptfs-$(CONFIG_ECRYPT_FS_MESSAGING) += messaging.o miscdev.o
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index a7b0c2dfb3d..cfa109a4d5a 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -243,7 +243,7 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
 	struct ecryptfs_key_sig *key_sig, *key_sig_tmp;
 
 	if (crypt_stat->tfm)
-		crypto_free_blkcipher(crypt_stat->tfm);
+		crypto_free_ablkcipher(crypt_stat->tfm);
 	if (crypt_stat->hash_tfm)
 		crypto_free_hash(crypt_stat->hash_tfm);
 	list_for_each_entry_safe(key_sig, key_sig_tmp,
@@ -301,17 +301,14 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
 	while (size > 0 && i < sg_size) {
 		pg = virt_to_page(addr);
 		offset = offset_in_page(addr);
-		if (sg)
-			sg_set_page(&sg[i], pg, 0, offset);
+		sg_set_page(&sg[i], pg, 0, offset);
 		remainder_of_page = PAGE_CACHE_SIZE - offset;
 		if (size >= remainder_of_page) {
-			if (sg)
-				sg[i].length = remainder_of_page;
+			sg[i].length = remainder_of_page;
 			addr += remainder_of_page;
 			size -= remainder_of_page;
 		} else {
-			if (sg)
-				sg[i].length = size;
+			sg[i].length = size;
 			addr += size;
 			size = 0;
 		}
@@ -322,6 +319,22 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
 	return i;
 }
 
+struct extent_crypt_result {
+	struct completion completion;
+	int rc;
+};
+
+static void extent_crypt_complete(struct crypto_async_request *req, int rc)
+{
+	struct extent_crypt_result *ecr = req->data;
+
+	if (rc == -EINPROGRESS)
+		return;
+
+	ecr->rc = rc;
+	complete(&ecr->completion);
+}
+
 /**
  * encrypt_scatterlist
  * @crypt_stat: Pointer to the crypt_stat struct to initialize.
@@ -337,11 +350,8 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
 			       struct scatterlist *src_sg, int size,
 			       unsigned char *iv)
 {
-	struct blkcipher_desc desc = {
-		.tfm = crypt_stat->tfm,
-		.info = iv,
-		.flags = CRYPTO_TFM_REQ_MAY_SLEEP
-	};
+	struct ablkcipher_request *req = NULL;
+	struct extent_crypt_result ecr;
 	int rc = 0;
 
 	BUG_ON(!crypt_stat || !crypt_stat->tfm
@@ -352,24 +362,47 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
 		ecryptfs_dump_hex(crypt_stat->key,
 				  crypt_stat->key_size);
 	}
-	/* Consider doing this once, when the file is opened */
+
+	init_completion(&ecr.completion);
+
 	mutex_lock(&crypt_stat->cs_tfm_mutex);
-	if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
-		rc = crypto_blkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
-					     crypt_stat->key_size);
-		crypt_stat->flags |= ECRYPTFS_KEY_SET;
-	}
-	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n",
-				rc);
+	req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
+	if (!req) {
 		mutex_unlock(&crypt_stat->cs_tfm_mutex);
-		rc = -EINVAL;
+		rc = -ENOMEM;
 		goto out;
 	}
-	ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size);
-	crypto_blkcipher_encrypt_iv(&desc, dest_sg, src_sg, size);
+
+	ablkcipher_request_set_callback(req,
+			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+			extent_crypt_complete, &ecr);
+	/* Consider doing this once, when the file is opened */
+	if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
+		rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
+					      crypt_stat->key_size);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR,
+					"Error setting key; rc = [%d]\n",
+					rc);
+			mutex_unlock(&crypt_stat->cs_tfm_mutex);
+			rc = -EINVAL;
+			goto out;
+		}
+		crypt_stat->flags |= ECRYPTFS_KEY_SET;
+	}
 	mutex_unlock(&crypt_stat->cs_tfm_mutex);
+	ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size);
+	ablkcipher_request_set_crypt(req, src_sg, dest_sg, size, iv);
+	rc = crypto_ablkcipher_encrypt(req);
+	if (rc == -EINPROGRESS || rc == -EBUSY) {
+		struct extent_crypt_result *ecr = req->base.data;
+
+		wait_for_completion(&ecr->completion);
+		rc = ecr->rc;
+		INIT_COMPLETION(ecr->completion);
+	}
 out:
+	ablkcipher_request_free(req);
 	return rc;
 }
 
@@ -627,35 +660,61 @@ static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
 			       struct scatterlist *src_sg, int size,
 			       unsigned char *iv)
 {
-	struct blkcipher_desc desc = {
-		.tfm = crypt_stat->tfm,
-		.info = iv,
-		.flags = CRYPTO_TFM_REQ_MAY_SLEEP
-	};
+	struct ablkcipher_request *req = NULL;
+	struct extent_crypt_result ecr;
 	int rc = 0;
 
-	/* Consider doing this once, when the file is opened */
+	BUG_ON(!crypt_stat || !crypt_stat->tfm
+	       || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
+				crypt_stat->key_size);
+		ecryptfs_dump_hex(crypt_stat->key,
+				  crypt_stat->key_size);
+	}
+
+	init_completion(&ecr.completion);
+
 	mutex_lock(&crypt_stat->cs_tfm_mutex);
-	rc = crypto_blkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
-				     crypt_stat->key_size);
-	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n",
-				rc);
+	req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
+	if (!req) {
 		mutex_unlock(&crypt_stat->cs_tfm_mutex);
-		rc = -EINVAL;
+		rc = -ENOMEM;
 		goto out;
 	}
-	ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size);
-	rc = crypto_blkcipher_decrypt_iv(&desc, dest_sg, src_sg, size);
+
+	ablkcipher_request_set_callback(req,
+			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+			extent_crypt_complete, &ecr);
+	/* Consider doing this once, when the file is opened */
+	if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
+		rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
+					      crypt_stat->key_size);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR,
+					"Error setting key; rc = [%d]\n",
+					rc);
+			mutex_unlock(&crypt_stat->cs_tfm_mutex);
+			rc = -EINVAL;
+			goto out;
+		}
+		crypt_stat->flags |= ECRYPTFS_KEY_SET;
+	}
 	mutex_unlock(&crypt_stat->cs_tfm_mutex);
-	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error decrypting; rc = [%d]\n",
-				rc);
-		goto out;
+	ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size);
+	ablkcipher_request_set_crypt(req, src_sg, dest_sg, size, iv);
+	rc = crypto_ablkcipher_decrypt(req);
+	if (rc == -EINPROGRESS || rc == -EBUSY) {
+		struct extent_crypt_result *ecr = req->base.data;
+
+		wait_for_completion(&ecr->completion);
+		rc = ecr->rc;
+		INIT_COMPLETION(ecr->completion);
 	}
-	rc = size;
 out:
+	ablkcipher_request_free(req);
 	return rc;
+
 }
 
 /**
@@ -749,8 +808,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
 						    crypt_stat->cipher, "cbc");
 	if (rc)
 		goto out_unlock;
-	crypt_stat->tfm = crypto_alloc_blkcipher(full_alg_name, 0,
-						 CRYPTO_ALG_ASYNC);
+	crypt_stat->tfm = crypto_alloc_ablkcipher(full_alg_name, 0, 0);
 	kfree(full_alg_name);
 	if (IS_ERR(crypt_stat->tfm)) {
 		rc = PTR_ERR(crypt_stat->tfm);
@@ -760,7 +818,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
 				crypt_stat->cipher);
 		goto out_unlock;
 	}
-	crypto_blkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+	crypto_ablkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
 	rc = 0;
 out_unlock:
 	mutex_unlock(&crypt_stat->cs_tfm_mutex);
@@ -2185,12 +2243,11 @@ out:
  */
 int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
 					 size_t *plaintext_name_size,
-					 struct dentry *ecryptfs_dir_dentry,
+					 struct super_block *sb,
 					 const char *name, size_t name_size)
 {
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
-		&ecryptfs_superblock_to_private(
-			ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
+		&ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
 	char *decoded_name;
 	size_t decoded_name_size;
 	size_t packet_size;
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 1b5d9af937d..bf12ba5dd22 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -45,14 +45,12 @@
 static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct dentry *lower_dentry;
-	struct vfsmount *lower_mnt;
 	int rc = 1;
 
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
 	if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
 		goto out;
 	rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index cfb4b9fed52..df19d34a033 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -38,6 +38,7 @@
 #include <linux/nsproxy.h>
 #include <linux/backing-dev.h>
 #include <linux/ecryptfs.h>
+#include <linux/crypto.h>
 
 #define ECRYPTFS_DEFAULT_IV_BYTES 16
 #define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096
@@ -172,6 +173,19 @@ ecryptfs_get_key_payload_data(struct key *key)
 #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24
 #define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32)
 
+#ifdef CONFIG_ECRYPT_FS_MESSAGING
+# define ECRYPTFS_VERSIONING_MASK_MESSAGING (ECRYPTFS_VERSIONING_DEVMISC \
+					     | ECRYPTFS_VERSIONING_PUBKEY)
+#else
+# define ECRYPTFS_VERSIONING_MASK_MESSAGING 0
+#endif
+
+#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
+				  | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \
+				  | ECRYPTFS_VERSIONING_XATTR \
+				  | ECRYPTFS_VERSIONING_MULTKEY \
+				  | ECRYPTFS_VERSIONING_MASK_MESSAGING \
+				  | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION)
 struct ecryptfs_key_sig {
 	struct list_head crypt_stat_list;
 	char keysig[ECRYPTFS_SIG_SIZE_HEX + 1];
@@ -220,7 +234,7 @@ struct ecryptfs_crypt_stat {
 	size_t extent_shift;
 	unsigned int extent_mask;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
-	struct crypto_blkcipher *tfm;
+	struct crypto_ablkcipher *tfm;
 	struct crypto_hash *hash_tfm; /* Crypto context for generating
 				       * the initialization vectors */
 	unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
@@ -399,7 +413,9 @@ struct ecryptfs_daemon {
 	struct hlist_node euid_chain;
 };
 
+#ifdef CONFIG_ECRYPT_FS_MESSAGING
 extern struct mutex ecryptfs_daemon_hash_mux;
+#endif
 
 static inline size_t
 ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
@@ -509,6 +525,12 @@ ecryptfs_dentry_to_lower_mnt(struct dentry *dentry)
 	return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt;
 }
 
+static inline struct path *
+ecryptfs_dentry_to_lower_path(struct dentry *dentry)
+{
+	return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path;
+}
+
 static inline void
 ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
 {
@@ -553,7 +575,7 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
 			     struct inode *ecryptfs_inode);
 int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
 					 size_t *decrypted_name_size,
-					 struct dentry *ecryptfs_dentry,
+					 struct super_block *sb,
 					 const char *name, size_t name_size);
 int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
 int ecryptfs_encrypt_and_encode_filename(
@@ -604,6 +626,7 @@ int
 ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		  size_t size, int flags);
 int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode);
+#ifdef CONFIG_ECRYPT_FS_MESSAGING
 int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
 			      struct ecryptfs_message *msg, u32 seq);
 int ecryptfs_send_message(char *data, int data_len,
@@ -612,6 +635,24 @@ int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx,
 			       struct ecryptfs_message **emsg);
 int ecryptfs_init_messaging(void);
 void ecryptfs_release_messaging(void);
+#else
+static inline int ecryptfs_init_messaging(void)
+{
+	return 0;
+}
+static inline void ecryptfs_release_messaging(void)
+{ }
+static inline int ecryptfs_send_message(char *data, int data_len,
+					struct ecryptfs_msg_ctx **msg_ctx)
+{
+	return -ENOTCONN;
+}
+static inline int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx,
+					     struct ecryptfs_message **emsg)
+{
+	return -ENOMSG;
+}
+#endif
 
 void
 ecryptfs_write_header_metadata(char *virt,
@@ -649,12 +690,11 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
 				     size_t offset_in_page, size_t size,
 				     struct inode *ecryptfs_inode);
 struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
-int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
-int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon);
 int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
 				 size_t *length_size);
 int ecryptfs_write_packet_length(char *dest, size_t size,
 				 size_t *packet_size_length);
+#ifdef CONFIG_ECRYPT_FS_MESSAGING
 int ecryptfs_init_ecryptfs_miscdev(void);
 void ecryptfs_destroy_ecryptfs_miscdev(void);
 int ecryptfs_send_miscdev(char *data, size_t data_size,
@@ -663,6 +703,9 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
 void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx);
 int
 ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file);
+int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
+int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon);
+#endif
 int ecryptfs_init_kthread(void);
 void ecryptfs_destroy_kthread(void);
 int ecryptfs_privileged_open(struct file **lower_file,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index d45ba456812..24f1105fda3 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,6 +31,7 @@
 #include <linux/security.h>
 #include <linux/compat.h>
 #include <linux/fs_stack.h>
+#include <linux/aio.h>
 #include "ecryptfs_kernel.h"
 
 /**
@@ -67,9 +68,9 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
 }
 
 struct ecryptfs_getdents_callback {
-	void *dirent;
-	struct dentry *dentry;
-	filldir_t filldir;
+	struct dir_context ctx;
+	struct dir_context *caller;
+	struct super_block *sb;
 	int filldir_called;
 	int entries_written;
 };
@@ -87,7 +88,7 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
 
 	buf->filldir_called++;
 	rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
-						  buf->dentry, lower_name,
+						  buf->sb, lower_name,
 						  lower_namelen);
 	if (rc) {
 		printk(KERN_ERR "%s: Error attempting to decode and decrypt "
@@ -95,9 +96,10 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
 		       rc);
 		goto out;
 	}
-	rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type);
+	buf->caller->pos = buf->ctx.pos;
+	rc = !dir_emit(buf->caller, name, name_size, ino, d_type);
 	kfree(name);
-	if (rc >= 0)
+	if (!rc)
 		buf->entries_written++;
 out:
 	return rc;
@@ -106,34 +108,29 @@ out:
 /**
  * ecryptfs_readdir
  * @file: The eCryptfs directory file
- * @dirent: Directory entry handle
- * @filldir: The filldir callback function
+ * @ctx: The actor to feed the entries to
  */
-static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	int rc;
 	struct file *lower_file;
-	struct inode *inode;
-	struct ecryptfs_getdents_callback buf;
-
+	struct inode *inode = file_inode(file);
+	struct ecryptfs_getdents_callback buf = {
+		.ctx.actor = ecryptfs_filldir,
+		.caller = ctx,
+		.sb = inode->i_sb,
+	};
 	lower_file = ecryptfs_file_to_lower(file);
-	lower_file->f_pos = file->f_pos;
-	inode = file->f_path.dentry->d_inode;
-	memset(&buf, 0, sizeof(buf));
-	buf.dirent = dirent;
-	buf.dentry = file->f_path.dentry;
-	buf.filldir = filldir;
-	buf.filldir_called = 0;
-	buf.entries_written = 0;
-	rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
-	file->f_pos = lower_file->f_pos;
+	lower_file->f_pos = ctx->pos;
+	rc = iterate_dir(lower_file, &buf.ctx);
+	ctx->pos = buf.ctx.pos;
 	if (rc < 0)
 		goto out;
 	if (buf.filldir_called && !buf.entries_written)
 		goto out;
 	if (rc >= 0)
 		fsstack_copy_attr_atime(inode,
-					lower_file->f_path.dentry->d_inode);
+					file_inode(lower_file));
 out:
 	return rc;
 }
@@ -199,7 +196,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 	struct dentry *ecryptfs_dentry = file->f_path.dentry;
 	/* Private value of ecryptfs_dentry allocated in
 	 * ecryptfs_lookup() */
-	struct dentry *lower_dentry;
 	struct ecryptfs_file_info *file_info;
 
 	mount_crypt_stat = &ecryptfs_superblock_to_private(
@@ -222,7 +218,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 		rc = -ENOMEM;
 		goto out;
 	}
-	lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
 	crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
 	mutex_lock(&crypt_stat->cs_mutex);
 	if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) {
@@ -296,6 +291,12 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
 static int
 ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
+	int rc;
+
+	rc = filemap_write_and_wait(file->f_mapping);
+	if (rc)
+		return rc;
+
 	return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
 }
 
@@ -339,7 +340,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 #endif
 
 const struct file_operations ecryptfs_dir_fops = {
-	.readdir = ecryptfs_readdir,
+	.iterate = ecryptfs_readdir,
 	.read = generic_read_dir,
 	.unlocked_ioctl = ecryptfs_unlocked_ioctl,
 #ifdef CONFIG_COMPAT
@@ -360,7 +361,7 @@ const struct file_operations ecryptfs_main_fops = {
 	.aio_read = ecryptfs_read_update_atime,
 	.write = do_sync_write,
 	.aio_write = generic_file_aio_write,
-	.readdir = ecryptfs_readdir,
+	.iterate = ecryptfs_readdir,
 	.unlocked_ioctl = ecryptfs_unlocked_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = ecryptfs_compat_ioctl,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index cc7709e7c50..67e9b633969 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -358,7 +358,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
 
 	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
 	fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
-	BUG_ON(!lower_dentry->d_count);
+	BUG_ON(!d_count(lower_dentry));
 
 	ecryptfs_set_dentry_private(dentry, dentry_info);
 	ecryptfs_set_dentry_lower(dentry, lower_dentry);
@@ -679,7 +679,7 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
 	set_fs(old_fs);
 	if (rc < 0)
 		goto out;
-	rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
+	rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb,
 						  lower_buf, rc);
 out:
 	kfree(lower_buf);
@@ -999,8 +999,8 @@ out:
 	return rc;
 }
 
-int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
-			  struct kstat *stat)
+static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
+				 struct kstat *stat)
 {
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
 	int rc = 0;
@@ -1021,14 +1021,13 @@ int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
 	return rc;
 }
 
-int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		     struct kstat *stat)
+static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			    struct kstat *stat)
 {
 	struct kstat lower_stat;
 	int rc;
 
-	rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
-			 ecryptfs_dentry_to_lower(dentry), &lower_stat);
+	rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat);
 	if (!rc) {
 		fsstack_copy_attr_all(dentry->d_inode,
 				      ecryptfs_inode_to_lower(dentry->d_inode));
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 2333203a120..7d52806c211 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1150,7 +1150,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 	struct ecryptfs_message *msg = NULL;
 	char *auth_tok_sig;
 	char *payload;
-	size_t payload_len;
+	size_t payload_len = 0;
 	int rc;
 
 	rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok);
@@ -1168,7 +1168,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 	rc = ecryptfs_send_message(payload, payload_len, &msg_ctx);
 	if (rc) {
 		ecryptfs_printk(KERN_ERR, "Error sending message to "
-				"ecryptfsd\n");
+				"ecryptfsd: %d\n", rc);
 		goto out;
 	}
 	rc = ecryptfs_wait_for_response(msg_ctx, &msg);
@@ -1202,8 +1202,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 				  crypt_stat->key_size);
 	}
 out:
-	if (msg)
-		kfree(msg);
+	kfree(msg);
 	return rc;
 }
 
@@ -1989,7 +1988,7 @@ pki_encrypt_session_key(struct key *auth_tok_key,
 	rc = ecryptfs_send_message(payload, payload_len, &msg_ctx);
 	if (rc) {
 		ecryptfs_printk(KERN_ERR, "Error sending message to "
-				"ecryptfsd\n");
+				"ecryptfsd: %d\n", rc);
 		goto out;
 	}
 	rc = ecryptfs_wait_for_response(msg_ctx, &msg);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 4e0886c9e5c..e924cf45aad 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -629,6 +629,7 @@ static struct file_system_type ecryptfs_fs_type = {
 	.kill_sb = ecryptfs_kill_block_super,
 	.fs_flags = 0
 };
+MODULE_ALIAS_FS("ecryptfs");
 
 /**
  * inode_info_init_once
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 5fa2471796c..49ff8ea08f1 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -97,8 +97,7 @@ static void ecryptfs_msg_ctx_free_to_alloc(struct ecryptfs_msg_ctx *msg_ctx)
 void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx)
 {
 	list_move(&(msg_ctx->node), &ecryptfs_msg_ctx_free_list);
-	if (msg_ctx->msg)
-		kfree(msg_ctx->msg);
+	kfree(msg_ctx->msg);
 	msg_ctx->msg = NULL;
 	msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_FREE;
 }
@@ -115,10 +114,9 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx)
  */
 int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon)
 {
-	struct hlist_node *elem;
 	int rc;
 
-	hlist_for_each_entry(*daemon, elem,
+	hlist_for_each_entry(*daemon,
 			    &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()],
 			    euid_chain) {
 		if (uid_eq((*daemon)->file->f_cred->euid, current_euid())) {
@@ -284,7 +282,7 @@ ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
 	int rc;
 
 	rc = ecryptfs_find_daemon_by_euid(&daemon);
-	if (rc || !daemon) {
+	if (rc) {
 		rc = -ENOTCONN;
 		goto out;
 	}
@@ -445,7 +443,6 @@ void ecryptfs_release_messaging(void)
 		mutex_unlock(&ecryptfs_msg_ctx_lists_mux);
 	}
 	if (ecryptfs_daemon_hash) {
-		struct hlist_node *elem;
 		struct ecryptfs_daemon *daemon;
 		int i;
 
@@ -453,7 +450,7 @@ void ecryptfs_release_messaging(void)
 		for (i = 0; i < (1 << ecryptfs_hash_bits); i++) {
 			int rc;
 
-			hlist_for_each_entry(daemon, elem,
+			hlist_for_each_entry(daemon,
 					     &ecryptfs_daemon_hash[i],
 					     euid_chain) {
 				rc = ecryptfs_exorcise_daemon(daemon);
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 412e6eda25f..e4141f25749 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -80,13 +80,6 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file)
 	int rc;
 
 	mutex_lock(&ecryptfs_daemon_hash_mux);
-	rc = try_module_get(THIS_MODULE);
-	if (rc == 0) {
-		rc = -EIO;
-		printk(KERN_ERR "%s: Error attempting to increment module use "
-		       "count; rc = [%d]\n", __func__, rc);
-		goto out_unlock_daemon_list;
-	}
 	rc = ecryptfs_find_daemon_by_euid(&daemon);
 	if (!rc) {
 		rc = -EINVAL;
@@ -96,7 +89,7 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file)
 	if (rc) {
 		printk(KERN_ERR "%s: Error attempting to spawn daemon; "
 		       "rc = [%d]\n", __func__, rc);
-		goto out_module_put_unlock_daemon_list;
+		goto out_unlock_daemon_list;
 	}
 	mutex_lock(&daemon->mux);
 	if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) {
@@ -108,9 +101,6 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file)
 	atomic_inc(&ecryptfs_num_miscdev_opens);
 out_unlock_daemon:
 	mutex_unlock(&daemon->mux);
-out_module_put_unlock_daemon_list:
-	if (rc)
-		module_put(THIS_MODULE);
 out_unlock_daemon_list:
 	mutex_unlock(&ecryptfs_daemon_hash_mux);
 	return rc;
@@ -147,7 +137,6 @@ ecryptfs_miscdev_release(struct inode *inode, struct file *file)
 		       "bug.\n", __func__, rc);
 		BUG();
 	}
-	module_put(THIS_MODULE);
 	return rc;
 }
 
@@ -471,6 +460,7 @@ out_free:
 
 
 static const struct file_operations ecryptfs_miscdev_fops = {
+	.owner   = THIS_MODULE,
 	.open    = ecryptfs_miscdev_open,
 	.poll    = ecryptfs_miscdev_poll,
 	.read    = ecryptfs_miscdev_read,
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index b2a34a192f4..09fe622274e 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -40,16 +40,12 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
 			 loff_t offset, size_t size)
 {
 	struct file *lower_file;
-	mm_segment_t fs_save;
 	ssize_t rc;
 
 	lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file;
 	if (!lower_file)
 		return -EIO;
-	fs_save = get_fs();
-	set_fs(get_ds());
-	rc = vfs_write(lower_file, data, size, &offset);
-	set_fs(fs_save);
+	rc = kernel_write(lower_file, data, size, offset);
 	mark_inode_dirty_sync(ecryptfs_inode);
 	return rc;
 }
@@ -236,17 +232,10 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
 			struct inode *ecryptfs_inode)
 {
 	struct file *lower_file;
-	mm_segment_t fs_save;
-	ssize_t rc;
-
 	lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file;
 	if (!lower_file)
 		return -EIO;
-	fs_save = get_fs();
-	set_fs(get_ds());
-	rc = vfs_read(lower_file, data, size, &offset);
-	set_fs(fs_save);
-	return rc;
+	return kernel_read(lower_file, offset, data, size);
 }
 
 /**
diff --git a/fs/efivarfs/Kconfig b/fs/efivarfs/Kconfig
new file mode 100644
index 00000000000..367bbb10c54
--- /dev/null
+++ b/fs/efivarfs/Kconfig
@@ -0,0 +1,12 @@
+config EFIVAR_FS
+	tristate "EFI Variable filesystem"
+	depends on EFI
+	help
+	  efivarfs is a replacement filesystem for the old EFI
+	  variable support via sysfs, as it doesn't suffer from the
+	  same 1024-byte variable size limit.
+
+	  To compile this file system support as a module, choose M
+	  here. The module will be called efivarfs.
+
+	  If unsure, say N.
diff --git a/fs/efivarfs/Makefile b/fs/efivarfs/Makefile
new file mode 100644
index 00000000000..955d478177d
--- /dev/null
+++ b/fs/efivarfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the efivarfs filesystem
+#
+
+obj-$(CONFIG_EFIVAR_FS)		+= efivarfs.o
+
+efivarfs-objs			:= inode.o file.o super.o
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
new file mode 100644
index 00000000000..8dd524f3228
--- /dev/null
+++ b/fs/efivarfs/file.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/efi.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+
+static ssize_t efivarfs_file_write(struct file *file,
+		const char __user *userbuf, size_t count, loff_t *ppos)
+{
+	struct efivar_entry *var = file->private_data;
+	void *data;
+	u32 attributes;
+	struct inode *inode = file->f_mapping->host;
+	unsigned long datasize = count - sizeof(attributes);
+	ssize_t bytes = 0;
+	bool set = false;
+
+	if (count < sizeof(attributes))
+		return -EINVAL;
+
+	if (copy_from_user(&attributes, userbuf, sizeof(attributes)))
+		return -EFAULT;
+
+	if (attributes & ~(EFI_VARIABLE_MASK))
+		return -EINVAL;
+
+	data = kmalloc(datasize, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	if (copy_from_user(data, userbuf + sizeof(attributes), datasize)) {
+		bytes = -EFAULT;
+		goto out;
+	}
+
+	bytes = efivar_entry_set_get_size(var, attributes, &datasize,
+					  data, &set);
+	if (!set && bytes) {
+		if (bytes == -ENOENT)
+			bytes = -EIO;
+		goto out;
+	}
+
+	if (bytes == -ENOENT) {
+		drop_nlink(inode);
+		d_delete(file->f_dentry);
+		dput(file->f_dentry);
+	} else {
+		mutex_lock(&inode->i_mutex);
+		i_size_write(inode, datasize + sizeof(attributes));
+		mutex_unlock(&inode->i_mutex);
+	}
+
+	bytes = count;
+
+out:
+	kfree(data);
+
+	return bytes;
+}
+
+static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
+		size_t count, loff_t *ppos)
+{
+	struct efivar_entry *var = file->private_data;
+	unsigned long datasize = 0;
+	u32 attributes;
+	void *data;
+	ssize_t size = 0;
+	int err;
+
+	err = efivar_entry_size(var, &datasize);
+
+	/*
+	 * efivarfs represents uncommitted variables with
+	 * zero-length files. Reading them should return EOF.
+	 */
+	if (err == -ENOENT)
+		return 0;
+	else if (err)
+		return err;
+
+	data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL);
+
+	if (!data)
+		return -ENOMEM;
+
+	size = efivar_entry_get(var, &attributes, &datasize,
+				data + sizeof(attributes));
+	if (size)
+		goto out_free;
+
+	memcpy(data, &attributes, sizeof(attributes));
+	size = simple_read_from_buffer(userbuf, count, ppos,
+				       data, datasize + sizeof(attributes));
+out_free:
+	kfree(data);
+
+	return size;
+}
+
+const struct file_operations efivarfs_file_operations = {
+	.open	= simple_open,
+	.read	= efivarfs_file_read,
+	.write	= efivarfs_file_write,
+	.llseek	= no_llseek,
+};
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
new file mode 100644
index 00000000000..7e787fb9029
--- /dev/null
+++ b/fs/efivarfs/inode.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/efi.h>
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+
+struct inode *efivarfs_get_inode(struct super_block *sb,
+				const struct inode *dir, int mode, dev_t dev)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (inode) {
+		inode->i_ino = get_next_ino();
+		inode->i_mode = mode;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		switch (mode & S_IFMT) {
+		case S_IFREG:
+			inode->i_fop = &efivarfs_file_operations;
+			break;
+		case S_IFDIR:
+			inode->i_op = &efivarfs_dir_inode_operations;
+			inode->i_fop = &simple_dir_operations;
+			inc_nlink(inode);
+			break;
+		}
+	}
+	return inode;
+}
+
+/*
+ * Return true if 'str' is a valid efivarfs filename of the form,
+ *
+ *	VariableName-12345678-1234-1234-1234-1234567891bc
+ */
+bool efivarfs_valid_name(const char *str, int len)
+{
+	static const char dashes[EFI_VARIABLE_GUID_LEN] = {
+		[8] = 1, [13] = 1, [18] = 1, [23] = 1
+	};
+	const char *s = str + len - EFI_VARIABLE_GUID_LEN;
+	int i;
+
+	/*
+	 * We need a GUID, plus at least one letter for the variable name,
+	 * plus the '-' separator
+	 */
+	if (len < EFI_VARIABLE_GUID_LEN + 2)
+		return false;
+
+	/* GUID must be preceded by a '-' */
+	if (*(s - 1) != '-')
+		return false;
+
+	/*
+	 * Validate that 's' is of the correct format, e.g.
+	 *
+	 *	12345678-1234-1234-1234-123456789abc
+	 */
+	for (i = 0; i < EFI_VARIABLE_GUID_LEN; i++) {
+		if (dashes[i]) {
+			if (*s++ != '-')
+				return false;
+		} else {
+			if (!isxdigit(*s++))
+				return false;
+		}
+	}
+
+	return true;
+}
+
+static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid)
+{
+	guid->b[0] = hex_to_bin(str[6]) << 4 | hex_to_bin(str[7]);
+	guid->b[1] = hex_to_bin(str[4]) << 4 | hex_to_bin(str[5]);
+	guid->b[2] = hex_to_bin(str[2]) << 4 | hex_to_bin(str[3]);
+	guid->b[3] = hex_to_bin(str[0]) << 4 | hex_to_bin(str[1]);
+	guid->b[4] = hex_to_bin(str[11]) << 4 | hex_to_bin(str[12]);
+	guid->b[5] = hex_to_bin(str[9]) << 4 | hex_to_bin(str[10]);
+	guid->b[6] = hex_to_bin(str[16]) << 4 | hex_to_bin(str[17]);
+	guid->b[7] = hex_to_bin(str[14]) << 4 | hex_to_bin(str[15]);
+	guid->b[8] = hex_to_bin(str[19]) << 4 | hex_to_bin(str[20]);
+	guid->b[9] = hex_to_bin(str[21]) << 4 | hex_to_bin(str[22]);
+	guid->b[10] = hex_to_bin(str[24]) << 4 | hex_to_bin(str[25]);
+	guid->b[11] = hex_to_bin(str[26]) << 4 | hex_to_bin(str[27]);
+	guid->b[12] = hex_to_bin(str[28]) << 4 | hex_to_bin(str[29]);
+	guid->b[13] = hex_to_bin(str[30]) << 4 | hex_to_bin(str[31]);
+	guid->b[14] = hex_to_bin(str[32]) << 4 | hex_to_bin(str[33]);
+	guid->b[15] = hex_to_bin(str[34]) << 4 | hex_to_bin(str[35]);
+}
+
+static int efivarfs_create(struct inode *dir, struct dentry *dentry,
+			  umode_t mode, bool excl)
+{
+	struct inode *inode;
+	struct efivar_entry *var;
+	int namelen, i = 0, err = 0;
+
+	if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len))
+		return -EINVAL;
+
+	inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0);
+	if (!inode)
+		return -ENOMEM;
+
+	var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL);
+	if (!var) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* length of the variable name itself: remove GUID and separator */
+	namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1;
+
+	efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1,
+			&var->var.VendorGuid);
+
+	for (i = 0; i < namelen; i++)
+		var->var.VariableName[i] = dentry->d_name.name[i];
+
+	var->var.VariableName[i] = '\0';
+
+	inode->i_private = var;
+
+	efivar_entry_add(var, &efivarfs_list);
+	d_instantiate(dentry, inode);
+	dget(dentry);
+out:
+	if (err) {
+		kfree(var);
+		iput(inode);
+	}
+	return err;
+}
+
+static int efivarfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct efivar_entry *var = dentry->d_inode->i_private;
+
+	if (efivar_entry_delete(var))
+		return -EINVAL;
+
+	drop_nlink(dentry->d_inode);
+	dput(dentry);
+	return 0;
+};
+
+/*
+ * Handle negative dentry.
+ */
+static struct dentry *efivarfs_lookup(struct inode *dir, struct dentry *dentry,
+				      unsigned int flags)
+{
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+	d_add(dentry, NULL);
+	return NULL;
+}
+
+const struct inode_operations efivarfs_dir_inode_operations = {
+	.lookup = efivarfs_lookup,
+	.unlink = efivarfs_unlink,
+	.create = efivarfs_create,
+};
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
new file mode 100644
index 00000000000..b5ff16addb7
--- /dev/null
+++ b/fs/efivarfs/internal.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef EFIVAR_FS_INTERNAL_H
+#define EFIVAR_FS_INTERNAL_H
+
+#include <linux/list.h>
+
+extern const struct file_operations efivarfs_file_operations;
+extern const struct inode_operations efivarfs_dir_inode_operations;
+extern bool efivarfs_valid_name(const char *str, int len);
+extern struct inode *efivarfs_get_inode(struct super_block *sb,
+			const struct inode *dir, int mode, dev_t dev);
+
+extern struct list_head efivarfs_list;
+
+#endif /* EFIVAR_FS_INTERNAL_H */
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
new file mode 100644
index 00000000000..a8766b880c0
--- /dev/null
+++ b/fs/efivarfs/super.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Copyright (C) 2012 Jeremy Kerr <jeremy.kerr@canonical.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ctype.h>
+#include <linux/efi.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/ucs2_string.h>
+#include <linux/slab.h>
+#include <linux/magic.h>
+
+#include "internal.h"
+
+LIST_HEAD(efivarfs_list);
+
+static void efivarfs_evict_inode(struct inode *inode)
+{
+	clear_inode(inode);
+}
+
+static const struct super_operations efivarfs_ops = {
+	.statfs = simple_statfs,
+	.drop_inode = generic_delete_inode,
+	.evict_inode = efivarfs_evict_inode,
+	.show_options = generic_show_options,
+};
+
+static struct super_block *efivarfs_sb;
+
+/*
+ * Compare two efivarfs file names.
+ *
+ * An efivarfs filename is composed of two parts,
+ *
+ *	1. A case-sensitive variable name
+ *	2. A case-insensitive GUID
+ *
+ * So we need to perform a case-sensitive match on part 1 and a
+ * case-insensitive match on part 2.
+ */
+static int efivarfs_d_compare(const struct dentry *parent,
+			      const struct dentry *dentry,
+			      unsigned int len, const char *str,
+			      const struct qstr *name)
+{
+	int guid = len - EFI_VARIABLE_GUID_LEN;
+
+	if (name->len != len)
+		return 1;
+
+	/* Case-sensitive compare for the variable name */
+	if (memcmp(str, name->name, guid))
+		return 1;
+
+	/* Case-insensitive compare for the GUID */
+	return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN);
+}
+
+static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
+{
+	unsigned long hash = init_name_hash();
+	const unsigned char *s = qstr->name;
+	unsigned int len = qstr->len;
+
+	if (!efivarfs_valid_name(s, len))
+		return -EINVAL;
+
+	while (len-- > EFI_VARIABLE_GUID_LEN)
+		hash = partial_name_hash(*s++, hash);
+
+	/* GUID is case-insensitive. */
+	while (len--)
+		hash = partial_name_hash(tolower(*s++), hash);
+
+	qstr->hash = end_name_hash(hash);
+	return 0;
+}
+
+/*
+ * Retaining negative dentries for an in-memory filesystem just wastes
+ * memory and lookup time: arrange for them to be deleted immediately.
+ */
+static int efivarfs_delete_dentry(const struct dentry *dentry)
+{
+	return 1;
+}
+
+static struct dentry_operations efivarfs_d_ops = {
+	.d_compare = efivarfs_d_compare,
+	.d_hash = efivarfs_d_hash,
+	.d_delete = efivarfs_delete_dentry,
+};
+
+static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
+{
+	struct dentry *d;
+	struct qstr q;
+	int err;
+
+	q.name = name;
+	q.len = strlen(name);
+
+	err = efivarfs_d_hash(NULL, &q);
+	if (err)
+		return ERR_PTR(err);
+
+	d = d_alloc(parent, &q);
+	if (d)
+		return d;
+
+	return ERR_PTR(-ENOMEM);
+}
+
+static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
+			     unsigned long name_size, void *data)
+{
+	struct super_block *sb = (struct super_block *)data;
+	struct efivar_entry *entry;
+	struct inode *inode = NULL;
+	struct dentry *dentry, *root = sb->s_root;
+	unsigned long size = 0;
+	char *name;
+	int len, i;
+	int err = -ENOMEM;
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return err;
+
+	memcpy(entry->var.VariableName, name16, name_size);
+	memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t));
+
+	len = ucs2_strlen(entry->var.VariableName);
+
+	/* name, plus '-', plus GUID, plus NUL*/
+	name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL);
+	if (!name)
+		goto fail;
+
+	for (i = 0; i < len; i++)
+		name[i] = entry->var.VariableName[i] & 0xFF;
+
+	name[len] = '-';
+
+	efi_guid_unparse(&entry->var.VendorGuid, name + len + 1);
+
+	name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
+
+	inode = efivarfs_get_inode(sb, root->d_inode, S_IFREG | 0644, 0);
+	if (!inode)
+		goto fail_name;
+
+	dentry = efivarfs_alloc_dentry(root, name);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto fail_inode;
+	}
+
+	/* copied by the above to local storage in the dentry. */
+	kfree(name);
+
+	efivar_entry_size(entry, &size);
+	efivar_entry_add(entry, &efivarfs_list);
+
+	mutex_lock(&inode->i_mutex);
+	inode->i_private = entry;
+	i_size_write(inode, size + sizeof(entry->var.Attributes));
+	mutex_unlock(&inode->i_mutex);
+	d_add(dentry, inode);
+
+	return 0;
+
+fail_inode:
+	iput(inode);
+fail_name:
+	kfree(name);
+fail:
+	kfree(entry);
+	return err;
+}
+
+static int efivarfs_destroy(struct efivar_entry *entry, void *data)
+{
+	efivar_entry_remove(entry);
+	kfree(entry);
+	return 0;
+}
+
+static int efivarfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode = NULL;
+	struct dentry *root;
+	int err;
+
+	efivarfs_sb = sb;
+
+	sb->s_maxbytes          = MAX_LFS_FILESIZE;
+	sb->s_blocksize         = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits    = PAGE_CACHE_SHIFT;
+	sb->s_magic             = EFIVARFS_MAGIC;
+	sb->s_op                = &efivarfs_ops;
+	sb->s_d_op		= &efivarfs_d_ops;
+	sb->s_time_gran         = 1;
+
+	inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0);
+	if (!inode)
+		return -ENOMEM;
+	inode->i_op = &efivarfs_dir_inode_operations;
+
+	root = d_make_root(inode);
+	sb->s_root = root;
+	if (!root)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&efivarfs_list);
+
+	err = efivar_init(efivarfs_callback, (void *)sb, false,
+			  true, &efivarfs_list);
+	if (err)
+		__efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
+
+	return err;
+}
+
+static struct dentry *efivarfs_mount(struct file_system_type *fs_type,
+				    int flags, const char *dev_name, void *data)
+{
+	return mount_single(fs_type, flags, data, efivarfs_fill_super);
+}
+
+static void efivarfs_kill_sb(struct super_block *sb)
+{
+	kill_litter_super(sb);
+	efivarfs_sb = NULL;
+
+	/* Remove all entries and destroy */
+	__efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL);
+}
+
+static struct file_system_type efivarfs_type = {
+	.name    = "efivarfs",
+	.mount   = efivarfs_mount,
+	.kill_sb = efivarfs_kill_sb,
+};
+
+static __init int efivarfs_init(void)
+{
+	if (!efi_enabled(EFI_RUNTIME_SERVICES))
+		return 0;
+
+	if (!efivars_kobject())
+		return 0;
+
+	return register_filesystem(&efivarfs_type);
+}
+
+MODULE_AUTHOR("Matthew Garrett, Jeremy Kerr");
+MODULE_DESCRIPTION("EFI Variable Filesystem");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_FS("efivarfs");
+
+module_init(efivarfs_init);
diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig
index 6ebfc1c207a..d020e3c30fe 100644
--- a/fs/efs/Kconfig
+++ b/fs/efs/Kconfig
@@ -1,6 +1,6 @@
 config EFS_FS
-	tristate "EFS file system support (read only) (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
+	tristate "EFS file system support (read only)"
+	depends on BLOCK
 	help
 	  EFS is an older file system used for non-ISO9660 CD-ROMs and hard
 	  disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index 7ee6f7e3a60..b72307ccdf7 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -7,40 +7,38 @@
 #include <linux/buffer_head.h>
 #include "efs.h"
 
-static int efs_readdir(struct file *, void *, filldir_t);
+static int efs_readdir(struct file *, struct dir_context *);
 
 const struct file_operations efs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= efs_readdir,
+	.iterate	= efs_readdir,
 };
 
 const struct inode_operations efs_dir_inode_operations = {
 	.lookup		= efs_lookup,
 };
 
-static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct buffer_head *bh;
-
-	struct efs_dir		*dirblock;
-	struct efs_dentry	*dirslot;
-	efs_ino_t		inodenum;
+static int efs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
 	efs_block_t		block;
-	int			slot, namelen;
-	char			*nameptr;
+	int			slot;
 
 	if (inode->i_size & (EFS_DIRBSIZE-1))
 		printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n");
 
 	/* work out where this entry can be found */
-	block = filp->f_pos >> EFS_DIRBSIZE_BITS;
+	block = ctx->pos >> EFS_DIRBSIZE_BITS;
 
 	/* each block contains at most 256 slots */
-	slot  = filp->f_pos & 0xff;
+	slot  = ctx->pos & 0xff;
 
 	/* look at all blocks */
 	while (block < inode->i_blocks) {
+		struct efs_dir		*dirblock;
+		struct buffer_head *bh;
+
 		/* read the dir block */
 		bh = sb_bread(inode->i_sb, efs_bmap(inode, block));
 
@@ -57,11 +55,14 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
 			break;
 		}
 
-		while (slot < dirblock->slots) {
-			if (dirblock->space[slot] == 0) {
-				slot++;
+		for (; slot < dirblock->slots; slot++) {
+			struct efs_dentry *dirslot;
+			efs_ino_t inodenum;
+			const char *nameptr;
+			int namelen;
+
+			if (dirblock->space[slot] == 0)
 				continue;
-			}
 
 			dirslot  = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot));
 
@@ -72,39 +73,29 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
 #ifdef DEBUG
 			printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen);
 #endif
-			if (namelen > 0) {
-				/* found the next entry */
-				filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-
-				/* copy filename and data in dirslot */
-				filldir(dirent, nameptr, namelen, filp->f_pos, inodenum, DT_UNKNOWN);
-
-				/* sanity check */
-				if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
-					printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot);
-					slot++;
-					continue;
-				}
-
-				/* store position of next slot */
-				if (++slot == dirblock->slots) {
-					slot = 0;
-					block++;
-				}
+			if (!namelen)
+				continue;
+			/* found the next entry */
+			ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
+
+			/* sanity check */
+			if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
+				printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot);
+				continue;
+			}
+
+			/* copy filename and data in dirslot */
+			if (!dir_emit(ctx, nameptr, namelen, inodenum, DT_UNKNOWN)) {
 				brelse(bh);
-				filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-				goto out;
+				return 0;
 			}
-			slot++;
 		}
 		brelse(bh);
 
 		slot = 0;
 		block++;
 	}
-
-	filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-out:
+	ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
 	return 0;
 }
 
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 2002431ef9a..c6f57a74a55 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -33,6 +33,7 @@ static struct file_system_type efs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("efs");
 
 static struct pt_types sgi_pt_types[] = {
 	{0x00,		"SGI vh"},
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9fec1836057..9ad17b15b45 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -34,12 +34,14 @@
 #include <linux/mutex.h>
 #include <linux/anon_inodes.h>
 #include <linux/device.h>
+#include <linux/freezer.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/mman.h>
 #include <linux/atomic.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/compat.h>
 
 /*
  * LOCKING:
@@ -104,7 +106,7 @@
 struct epoll_filefd {
 	struct file *file;
 	int fd;
-};
+} __packed;
 
 /*
  * Structure used to track possible nested calls, for too deep recursions
@@ -128,6 +130,8 @@ struct nested_calls {
 /*
  * Each file descriptor added to the eventpoll interface will
  * have an entry of this type linked to the "rbr" RB tree.
+ * Avoid increasing the size of this struct, there can be many thousands
+ * of these on a server and we do not want this to take another cache line.
  */
 struct epitem {
 	/* RB tree node used to link this structure to the eventpoll RB tree */
@@ -158,7 +162,7 @@ struct epitem {
 	struct list_head fllink;
 
 	/* wakeup_source used when EPOLLWAKEUP is set */
-	struct wakeup_source *ws;
+	struct wakeup_source __rcu *ws;
 
 	/* The structure that describe the interested events and the source fd */
 	struct epoll_event event;
@@ -536,6 +540,38 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 	}
 }
 
+/* call only when ep->mtx is held */
+static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
+{
+	return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
+}
+
+/* call only when ep->mtx is held */
+static inline void ep_pm_stay_awake(struct epitem *epi)
+{
+	struct wakeup_source *ws = ep_wakeup_source(epi);
+
+	if (ws)
+		__pm_stay_awake(ws);
+}
+
+static inline bool ep_has_wakeup_source(struct epitem *epi)
+{
+	return rcu_access_pointer(epi->ws) ? true : false;
+}
+
+/* call when ep->mtx cannot be held (ep_poll_callback) */
+static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
+{
+	struct wakeup_source *ws;
+
+	rcu_read_lock();
+	ws = rcu_dereference(epi->ws);
+	if (ws)
+		__pm_stay_awake(ws);
+	rcu_read_unlock();
+}
+
 /**
  * ep_scan_ready_list - Scans the ready list in a way that makes possible for
  *                      the scan code, to call f_op->poll(). Also allows for
@@ -599,7 +635,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 		 */
 		if (!ep_is_linked(&epi->rdllink)) {
 			list_add_tail(&epi->rdllink, &ep->rdllist);
-			__pm_stay_awake(epi->ws);
+			ep_pm_stay_awake(epi);
 		}
 	}
 	/*
@@ -668,7 +704,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 		list_del_init(&epi->rdllink);
 	spin_unlock_irqrestore(&ep->lock, flags);
 
-	wakeup_source_unregister(epi->ws);
+	wakeup_source_unregister(ep_wakeup_source(epi));
 
 	/* At this point it is safe to free the eventpoll item */
 	kmem_cache_free(epi_cache, epi);
@@ -711,11 +747,15 @@ static void ep_free(struct eventpoll *ep)
 	 * point we are sure no poll callbacks will be lingering around, and also by
 	 * holding "epmutex" we can be sure that no file cleanup code will hit
 	 * us during this operation. So we can avoid the lock on "ep->lock".
+	 * We do not need to lock ep->mtx, either, we only do it to prevent
+	 * a lockdep warning.
 	 */
+	mutex_lock(&ep->mtx);
 	while ((rbp = rb_first(&ep->rbr)) != NULL) {
 		epi = rb_entry(rbp, struct epitem, rbn);
 		ep_remove(ep, epi);
 	}
+	mutex_unlock(&ep->mtx);
 
 	mutex_unlock(&epmutex);
 	mutex_destroy(&ep->mtx);
@@ -734,6 +774,13 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
+{
+	pt->_key = epi->event.events;
+
+	return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
+}
+
 static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 			       void *priv)
 {
@@ -741,10 +788,9 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 	poll_table pt;
 
 	init_poll_funcptr(&pt, NULL);
+
 	list_for_each_entry_safe(epi, tmp, head, rdllink) {
-		pt._key = epi->event.events;
-		if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
-		    epi->event.events)
+		if (ep_item_poll(epi, &pt))
 			return POLLIN | POLLRDNORM;
 		else {
 			/*
@@ -752,7 +798,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 			 * callback, but it's not actually ready, as far as
 			 * caller requested events goes. We can remove it here.
 			 */
-			__pm_relax(epi->ws);
+			__pm_relax(ep_wakeup_source(epi));
 			list_del_init(&epi->rdllink);
 		}
 	}
@@ -984,7 +1030,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	/* If this file is already in the ready list we exit soon */
 	if (!ep_is_linked(&epi->rdllink)) {
 		list_add_tail(&epi->rdllink, &ep->rdllist);
-		__pm_stay_awake(epi->ws);
+		ep_pm_stay_awake_rcu(epi);
 	}
 
 	/*
@@ -1146,6 +1192,7 @@ static int reverse_path_check(void)
 static int ep_create_wakeup_source(struct epitem *epi)
 {
 	const char *name;
+	struct wakeup_source *ws;
 
 	if (!epi->ep->ws) {
 		epi->ep->ws = wakeup_source_register("eventpoll");
@@ -1154,17 +1201,29 @@ static int ep_create_wakeup_source(struct epitem *epi)
 	}
 
 	name = epi->ffd.file->f_path.dentry->d_name.name;
-	epi->ws = wakeup_source_register(name);
-	if (!epi->ws)
+	ws = wakeup_source_register(name);
+
+	if (!ws)
 		return -ENOMEM;
+	rcu_assign_pointer(epi->ws, ws);
 
 	return 0;
 }
 
-static void ep_destroy_wakeup_source(struct epitem *epi)
+/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
+static noinline void ep_destroy_wakeup_source(struct epitem *epi)
 {
-	wakeup_source_unregister(epi->ws);
-	epi->ws = NULL;
+	struct wakeup_source *ws = ep_wakeup_source(epi);
+
+	RCU_INIT_POINTER(epi->ws, NULL);
+
+	/*
+	 * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
+	 * used internally by wakeup_source_remove, too (called by
+	 * wakeup_source_unregister), so we cannot use call_rcu
+	 */
+	synchronize_rcu();
+	wakeup_source_unregister(ws);
 }
 
 /*
@@ -1199,13 +1258,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 		if (error)
 			goto error_create_wakeup_source;
 	} else {
-		epi->ws = NULL;
+		RCU_INIT_POINTER(epi->ws, NULL);
 	}
 
 	/* Initialize the poll table using the queue callback */
 	epq.epi = epi;
 	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
-	epq.pt._key = event->events;
 
 	/*
 	 * Attach the item to the poll hooks and get current event bits.
@@ -1214,7 +1272,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	 * this operation completes, the poll callback can start hitting
 	 * the new item.
 	 */
-	revents = tfile->f_op->poll(tfile, &epq.pt);
+	revents = ep_item_poll(epi, &epq.pt);
 
 	/*
 	 * We have to check if something went wrong during the poll wait queue
@@ -1247,7 +1305,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	/* If the file is already "ready" we drop it inside the ready list */
 	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
 		list_add_tail(&epi->rdllink, &ep->rdllist);
-		__pm_stay_awake(epi->ws);
+		ep_pm_stay_awake(epi);
 
 		/* Notify waiting tasks that events are available */
 		if (waitqueue_active(&ep->wq))
@@ -1288,7 +1346,7 @@ error_unregister:
 		list_del_init(&epi->rdllink);
 	spin_unlock_irqrestore(&ep->lock, flags);
 
-	wakeup_source_unregister(epi->ws);
+	wakeup_source_unregister(ep_wakeup_source(epi));
 
 error_create_wakeup_source:
 	kmem_cache_free(epi_cache, epi);
@@ -1314,12 +1372,11 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * f_op->poll() call and the new event set registering.
 	 */
 	epi->event.events = event->events; /* need barrier below */
-	pt._key = event->events;
 	epi->event.data = event->data; /* protected by mtx */
 	if (epi->event.events & EPOLLWAKEUP) {
-		if (!epi->ws)
+		if (!ep_has_wakeup_source(epi))
 			ep_create_wakeup_source(epi);
-	} else if (epi->ws) {
+	} else if (ep_has_wakeup_source(epi)) {
 		ep_destroy_wakeup_source(epi);
 	}
 
@@ -1347,7 +1404,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * Get current event bits. We can safely use the file* here because
 	 * its usage count has been increased by the caller of this function.
 	 */
-	revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
+	revents = ep_item_poll(epi, &pt);
 
 	/*
 	 * If the item is "hot" and it is not registered inside the ready
@@ -1357,7 +1414,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 		spin_lock_irq(&ep->lock);
 		if (!ep_is_linked(&epi->rdllink)) {
 			list_add_tail(&epi->rdllink, &ep->rdllist);
-			__pm_stay_awake(epi->ws);
+			ep_pm_stay_awake(epi);
 
 			/* Notify waiting tasks that events are available */
 			if (waitqueue_active(&ep->wq))
@@ -1383,6 +1440,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 	unsigned int revents;
 	struct epitem *epi;
 	struct epoll_event __user *uevent;
+	struct wakeup_source *ws;
 	poll_table pt;
 
 	init_poll_funcptr(&pt, NULL);
@@ -1405,14 +1463,16 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 		 * instead, but then epi->ws would temporarily be out of sync
 		 * with ep_is_linked().
 		 */
-		if (epi->ws && epi->ws->active)
-			__pm_stay_awake(ep->ws);
-		__pm_relax(epi->ws);
+		ws = ep_wakeup_source(epi);
+		if (ws) {
+			if (ws->active)
+				__pm_stay_awake(ep->ws);
+			__pm_relax(ws);
+		}
+
 		list_del_init(&epi->rdllink);
 
-		pt._key = epi->event.events;
-		revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
-			epi->event.events;
+		revents = ep_item_poll(epi, &pt);
 
 		/*
 		 * If the event mask intersect the caller-requested one,
@@ -1424,7 +1484,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 			if (__put_user(revents, &uevent->events) ||
 			    __put_user(epi->event.data, &uevent->data)) {
 				list_add(&epi->rdllink, head);
-				__pm_stay_awake(epi->ws);
+				ep_pm_stay_awake(epi);
 				return eventcnt ? eventcnt : -EFAULT;
 			}
 			eventcnt++;
@@ -1444,7 +1504,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 				 * poll callback will queue them in ep->ovflist.
 				 */
 				list_add_tail(&epi->rdllink, &ep->rdllist);
-				__pm_stay_awake(epi->ws);
+				ep_pm_stay_awake(epi);
 			}
 		}
 	}
@@ -1543,7 +1603,8 @@ fetch_events:
 			}
 
 			spin_unlock_irqrestore(&ep->lock, flags);
-			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+			if (!freezable_schedule_hrtimeout_range(to, slack,
+								HRTIMER_MODE_ABS))
 				timed_out = 1;
 
 			spin_lock_irqsave(&ep->lock, flags);
@@ -1916,8 +1977,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 			return -EINVAL;
 		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
 			return -EFAULT;
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+		sigsaved = current->blocked;
+		set_current_blocked(&ksigmask);
 	}
 
 	error = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -1934,12 +1995,58 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 			       sizeof(sigsaved));
 			set_restore_sigmask();
 		} else
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+			set_current_blocked(&sigsaved);
 	}
 
 	return error;
 }
 
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
+			struct epoll_event __user *, events,
+			int, maxevents, int, timeout,
+			const compat_sigset_t __user *, sigmask,
+			compat_size_t, sigsetsize)
+{
+	long err;
+	compat_sigset_t csigmask;
+	sigset_t ksigmask, sigsaved;
+
+	/*
+	 * If the caller wants a certain signal mask to be set during the wait,
+	 * we apply it here.
+	 */
+	if (sigmask) {
+		if (sigsetsize != sizeof(compat_sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
+			return -EFAULT;
+		sigset_from_compat(&ksigmask, &csigmask);
+		sigsaved = current->blocked;
+		set_current_blocked(&ksigmask);
+	}
+
+	err = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+	/*
+	 * If we changed the signal mask, we need to restore the original one.
+	 * In case we've got a signal while waiting, we do not restore the
+	 * signal mask yet, and we allow do_signal() to deliver the signal on
+	 * the way back to userspace, before the signal mask is restored.
+	 */
+	if (sigmask) {
+		if (err == -EINTR) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+			       sizeof(sigsaved));
+			set_restore_sigmask();
+		} else
+			set_current_blocked(&sigsaved);
+	}
+
+	return err;
+}
+#endif
+
 static int __init eventpoll_init(void)
 {
 	struct sysinfo si;
@@ -1964,6 +2071,12 @@ static int __init eventpoll_init(void)
 	/* Initialize the structure used to perform file's f_op->poll() calls */
 	ep_nested_calls_init(&poll_readywalk_ncalls);
 
+	/*
+	 * We can have many thousands of epitems, so prevent this from
+	 * using an extra cache line on 64-bit (and smaller) CPUs
+	 */
+	BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
+
 	/* Allocates slab cache used to allocate "struct epitem" items */
 	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
 			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
diff --git a/fs/exec.c b/fs/exec.c
index 20df02c1cc7..9c73def8764 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -110,20 +110,21 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 	static const struct open_flags uselib_flags = {
 		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 		.acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
-		.intent = LOOKUP_OPEN
+		.intent = LOOKUP_OPEN,
+		.lookup_flags = LOOKUP_FOLLOW,
 	};
 
 	if (IS_ERR(tmp))
 		goto out;
 
-	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
+	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
 	putname(tmp);
 	error = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto out;
 
 	error = -EINVAL;
-	if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
+	if (!S_ISREG(file_inode(file)->i_mode))
 		goto exit;
 
 	error = -EACCES;
@@ -355,7 +356,7 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
  * flags, permissions, and offset, so we use temporary values.  We'll update
  * them later in setup_arg_pages().
  */
-int bprm_mm_init(struct linux_binprm *bprm)
+static int bprm_mm_init(struct linux_binprm *bprm)
 {
 	int err;
 	struct mm_struct *mm = NULL;
@@ -613,7 +614,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		 * when the old and new regions overlap clear from new_end.
 		 */
 		free_pgd_range(&tlb, new_end, old_end, new_end,
-			vma->vm_next ? vma->vm_next->vm_start : 0);
+			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
 	} else {
 		/*
 		 * otherwise, clean from old_start; this is done to not touch
@@ -622,7 +623,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		 * for the others its just a little faster.
 		 */
 		free_pgd_range(&tlb, old_start, old_end, new_end,
-			vma->vm_next ? vma->vm_next->vm_start : 0);
+			vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
 	}
 	tlb_finish_mmu(&tlb, new_end, old_end);
 
@@ -756,15 +757,16 @@ struct file *open_exec(const char *name)
 	static const struct open_flags open_exec_flags = {
 		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 		.acc_mode = MAY_EXEC | MAY_OPEN,
-		.intent = LOOKUP_OPEN
+		.intent = LOOKUP_OPEN,
+		.lookup_flags = LOOKUP_FOLLOW,
 	};
 
-	file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags, LOOKUP_FOLLOW);
+	file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags);
 	if (IS_ERR(file))
 		goto out;
 
 	err = -EACCES;
-	if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
+	if (!S_ISREG(file_inode(file)->i_mode))
 		goto exit;
 
 	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
@@ -802,6 +804,15 @@ int kernel_read(struct file *file, loff_t offset,
 
 EXPORT_SYMBOL(kernel_read);
 
+ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
+{
+	ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos);
+	if (res > 0)
+		flush_icache_range(addr, addr + len);
+	return res;
+}
+EXPORT_SYMBOL(read_code);
+
 static int exec_mmap(struct mm_struct *mm)
 {
 	struct task_struct *tsk;
@@ -898,11 +909,13 @@ static int de_thread(struct task_struct *tsk)
 
 		sig->notify_count = -1;	/* for exit_notify() */
 		for (;;) {
+			threadgroup_change_begin(tsk);
 			write_lock_irq(&tasklist_lock);
 			if (likely(leader->exit_state))
 				break;
 			__set_current_state(TASK_KILLABLE);
 			write_unlock_irq(&tasklist_lock);
+			threadgroup_change_end(tsk);
 			schedule();
 			if (unlikely(__fatal_signal_pending(tsk)))
 				goto killed;
@@ -919,6 +932,7 @@ static int de_thread(struct task_struct *tsk)
 		 * also take its birthdate (always earlier than our own).
 		 */
 		tsk->start_time = leader->start_time;
+		tsk->real_start_time = leader->real_start_time;
 
 		BUG_ON(!same_thread_group(leader, tsk));
 		BUG_ON(has_group_leader_pid(tsk));
@@ -934,9 +948,8 @@ static int de_thread(struct task_struct *tsk)
 		 * Note: The old leader also uses this pid until release_task
 		 *       is called.  Odd but simple and correct.
 		 */
-		detach_pid(tsk, PIDTYPE_PID);
 		tsk->pid = leader->pid;
-		attach_pid(tsk, PIDTYPE_PID,  task_pid(leader));
+		change_pid(tsk, PIDTYPE_PID, task_pid(leader));
 		transfer_pid(leader, tsk, PIDTYPE_PGID);
 		transfer_pid(leader, tsk, PIDTYPE_SID);
 
@@ -960,6 +973,7 @@ static int de_thread(struct task_struct *tsk)
 		if (unlikely(leader->ptrace))
 			__wake_up_parent(leader, leader->parent);
 		write_unlock_irq(&tasklist_lock);
+		threadgroup_change_end(tsk);
 
 		release_task(leader);
 	}
@@ -1027,17 +1041,7 @@ EXPORT_SYMBOL_GPL(get_task_comm);
 void set_task_comm(struct task_struct *tsk, char *buf)
 {
 	task_lock(tsk);
-
 	trace_task_rename(tsk, buf);
-
-	/*
-	 * Threads may access current->comm without holding
-	 * the task lock, so write the string carefully.
-	 * Readers without a lock may see incomplete new
-	 * names but are safe from non-terminating string reads.
-	 */
-	memset(tsk->comm, 0, TASK_COMM_LEN);
-	wmb();
 	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
 	task_unlock(tsk);
 	perf_event_comm(tsk);
@@ -1098,7 +1102,7 @@ EXPORT_SYMBOL(flush_old_exec);
 
 void would_dump(struct linux_binprm *bprm, struct file *file)
 {
-	if (inode_permission(file->f_path.dentry->d_inode, MAY_READ) < 0)
+	if (inode_permission(file_inode(file), MAY_READ) < 0)
 		bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
 }
 EXPORT_SYMBOL(would_dump);
@@ -1111,7 +1115,7 @@ void setup_new_exec(struct linux_binprm * bprm)
 	current->sas_ss_sp = current->sas_ss_size = 0;
 
 	if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))
-		set_dumpable(current->mm, SUID_DUMPABLE_ENABLED);
+		set_dumpable(current->mm, SUID_DUMP_USER);
 	else
 		set_dumpable(current->mm, suid_dumpable);
 
@@ -1133,13 +1137,6 @@ void setup_new_exec(struct linux_binprm * bprm)
 			set_dumpable(current->mm, suid_dumpable);
 	}
 
-	/*
-	 * Flush performance counters when crossing a
-	 * security domain:
-	 */
-	if (!get_dumpable(current->mm))
-		perf_event_exit_task(current);
-
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 
@@ -1203,6 +1200,15 @@ void install_exec_creds(struct linux_binprm *bprm)
 
 	commit_creds(bprm->cred);
 	bprm->cred = NULL;
+
+	/*
+	 * Disable monitoring for regular users
+	 * when executing setuid binaries. Must
+	 * wait until new credentials are committed
+	 * by commit_creds() above
+	 */
+	if (get_dumpable(current->mm) != SUID_DUMP_USER)
+		perf_event_exit_task(current);
 	/*
 	 * cred_guard_mutex must be held at least to this point to prevent
 	 * ptrace_attach() from altering our determination of the task's
@@ -1270,7 +1276,7 @@ static int check_unsafe_exec(struct linux_binprm *bprm)
 int prepare_binprm(struct linux_binprm *bprm)
 {
 	umode_t mode;
-	struct inode * inode = bprm->file->f_path.dentry->d_inode;
+	struct inode * inode = file_inode(bprm->file);
 	int retval;
 
 	mode = inode->i_mode;
@@ -1459,7 +1465,6 @@ static int do_execve_common(const char *filename,
 	struct files_struct *displaced;
 	bool clear_in_exec;
 	int retval;
-	const struct cred *cred = current_cred();
 
 	/*
 	 * We move the actual failure in case of RLIMIT_NPROC excess from
@@ -1468,7 +1473,7 @@ static int do_execve_common(const char *filename,
 	 * whether NPROC limit is still exceeded.
 	 */
 	if ((current->flags & PF_NPROC_EXCEEDED) &&
-	    atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
+	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
 		retval = -EAGAIN;
 		goto out_ret;
 	}
@@ -1639,17 +1644,17 @@ EXPORT_SYMBOL(set_binfmt);
 void set_dumpable(struct mm_struct *mm, int value)
 {
 	switch (value) {
-	case SUID_DUMPABLE_DISABLED:
+	case SUID_DUMP_DISABLE:
 		clear_bit(MMF_DUMPABLE, &mm->flags);
 		smp_wmb();
 		clear_bit(MMF_DUMP_SECURELY, &mm->flags);
 		break;
-	case SUID_DUMPABLE_ENABLED:
+	case SUID_DUMP_USER:
 		set_bit(MMF_DUMPABLE, &mm->flags);
 		smp_wmb();
 		clear_bit(MMF_DUMP_SECURELY, &mm->flags);
 		break;
-	case SUID_DUMPABLE_SAFE:
+	case SUID_DUMP_ROOT:
 		set_bit(MMF_DUMP_SECURELY, &mm->flags);
 		smp_wmb();
 		set_bit(MMF_DUMPABLE, &mm->flags);
@@ -1662,7 +1667,7 @@ int __get_dumpable(unsigned long mm_flags)
 	int ret;
 
 	ret = mm_flags & MMF_DUMPABLE_MASK;
-	return (ret > SUID_DUMPABLE_ENABLED) ? SUID_DUMPABLE_SAFE : ret;
+	return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret;
 }
 
 int get_dumpable(struct mm_struct *mm)
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index c61e62ac231..49f51ab4caa 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -239,22 +239,19 @@ void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
 }
 
 static int
-exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+exofs_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
-	unsigned char *types = NULL;
-	int need_revalidate = (filp->f_version != inode->i_version);
+	int need_revalidate = (file->f_version != inode->i_version);
 
 	if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
 		return 0;
 
-	types = exofs_filetype_table;
-
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
 		struct exofs_dir_entry *de;
@@ -263,7 +260,7 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (IS_ERR(page)) {
 			EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
 				  inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
 			return PTR_ERR(page);
 		}
 		kaddr = page_address(page);
@@ -271,9 +268,9 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			if (offset) {
 				offset = exofs_validate_entry(kaddr, offset,
 								chunk_mask);
-				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
 			}
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 			need_revalidate = 0;
 		}
 		de = (struct exofs_dir_entry *)(kaddr + offset);
@@ -288,27 +285,24 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				return -EIO;
 			}
 			if (de->inode_no) {
-				int over;
-				unsigned char d_type = DT_UNKNOWN;
+				unsigned char t;
 
-				if (types && de->file_type < EXOFS_FT_MAX)
-					d_type = types[de->file_type];
+				if (de->file_type < EXOFS_FT_MAX)
+					t = exofs_filetype_table[de->file_type];
+				else
+					t = DT_UNKNOWN;
 
-				offset = (char *)de - kaddr;
-				over = filldir(dirent, de->name, de->name_len,
-						(n<<PAGE_CACHE_SHIFT) | offset,
+				if (!dir_emit(ctx, de->name, de->name_len,
 						le64_to_cpu(de->inode_no),
-						d_type);
-				if (over) {
+						t)) {
 					exofs_put_page(page);
 					return 0;
 				}
 			}
-			filp->f_pos += le16_to_cpu(de->rec_len);
+			ctx->pos += le16_to_cpu(de->rec_len);
 		}
 		exofs_put_page(page);
 	}
-
 	return 0;
 }
 
@@ -669,5 +663,5 @@ not_empty:
 const struct file_operations exofs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= exofs_readdir,
+	.iterate	= exofs_readdir,
 };
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d1f80abd882..2ec8eb1ab26 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -953,9 +953,11 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
 	return 0;
 }
 
-static void exofs_invalidatepage(struct page *page, unsigned long offset)
+static void exofs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
-	EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
+	EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
+		     page->index, offset, length);
 	WARN_ON(1);
 }
 
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index f936cb50dc0..b7442288860 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -401,7 +401,7 @@ static void _clear_bio(struct bio *bio)
 	struct bio_vec *bv;
 	unsigned i;
 
-	__bio_for_each_segment(bv, bio, i, 0) {
+	bio_for_each_segment_all(bv, bio, i) {
 		unsigned this_count = bv->bv_len;
 
 		if (likely(PAGE_SIZE == this_count))
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index b963f38ac29..7682b970d0f 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -432,7 +432,7 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
 		if (!bio)
 			continue;
 
-		__bio_for_each_segment(bv, bio, i, 0) {
+		bio_for_each_segment_all(bv, bio, i) {
 			struct page *page = bv->bv_page;
 
 			SetPageUptodate(page);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 5e59280d42d..9d976332873 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -1010,6 +1010,7 @@ static struct file_system_type exofs_type = {
 	.mount          = exofs_mount,
 	.kill_sb        = generic_shutdown_super,
 };
+MODULE_ALIAS_FS("exofs");
 
 static int __init init_exofs(void)
 {
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 5df4bb4aab1..293bc2e47a7 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -44,14 +44,13 @@ find_acceptable_alias(struct dentry *result,
 {
 	struct dentry *dentry, *toput = NULL;
 	struct inode *inode;
-	struct hlist_node *p;
 
 	if (acceptable(context, result))
 		return result;
 
 	inode = result->d_inode;
 	spin_lock(&inode->i_lock);
-	hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
+	hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		dget(dentry);
 		spin_unlock(&inode->i_lock);
 		if (toput)
@@ -213,6 +212,7 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 }
 
 struct getdents_callback {
+	struct dir_context ctx;
 	char *name;		/* name that was found. It already points to a
 				   buffer NAME_MAX+1 is size */
 	unsigned long ino;	/* the inum we are looking for */
@@ -255,7 +255,11 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
 	struct inode *dir = path->dentry->d_inode;
 	int error;
 	struct file *file;
-	struct getdents_callback buffer;
+	struct getdents_callback buffer = {
+		.ctx.actor = filldir_one,
+		.name = name,
+		.ino = child->d_inode->i_ino
+	};
 
 	error = -ENOTDIR;
 	if (!dir || !S_ISDIR(dir->i_mode))
@@ -272,17 +276,14 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
 		goto out;
 
 	error = -EINVAL;
-	if (!file->f_op->readdir)
+	if (!file->f_op->iterate)
 		goto out_close;
 
-	buffer.name = name;
-	buffer.ino = child->d_inode->i_ino;
-	buffer.found = 0;
 	buffer.sequence = 0;
 	while (1) {
 		int old_seq = buffer.sequence;
 
-		error = vfs_readdir(file, filldir_one, &buffer);
+		error = iterate_dir(file, &buffer.ctx);
 		if (buffer.found) {
 			error = 0;
 			break;
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 2616d0ea5c5..9f9992b3792 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -159,15 +159,6 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
 	return bh;
 }
 
-static void release_blocks(struct super_block *sb, int count)
-{
-	if (count) {
-		struct ext2_sb_info *sbi = EXT2_SB(sb);
-
-		percpu_counter_add(&sbi->s_freeblocks_counter, count);
-	}
-}
-
 static void group_adjust_blocks(struct super_block *sb, int group_no,
 	struct ext2_group_desc *desc, struct buffer_head *bh, int count)
 {
@@ -568,8 +559,11 @@ do_more:
 	}
 error_return:
 	brelse(bitmap_bh);
-	release_blocks(sb, freed);
-	dquot_free_block_nodirty(inode, freed);
+	if (freed) {
+		percpu_counter_add(&sbi->s_freeblocks_counter, freed);
+		dquot_free_block_nodirty(inode, freed);
+		mark_inode_dirty(inode);
+	}
 }
 
 /**
@@ -1239,10 +1233,6 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 
 	*errp = -ENOSPC;
 	sb = inode->i_sb;
-	if (!sb) {
-		printk("ext2_new_blocks: nonexistent device");
-		return 0;
-	}
 
 	/*
 	 * Check quota for allocation of this block.
@@ -1416,9 +1406,11 @@ allocated:
 
 	*errp = 0;
 	brelse(bitmap_bh);
-	dquot_free_block_nodirty(inode, *count-num);
-	mark_inode_dirty(inode);
-	*count = num;
+	if (num < *count) {
+		dquot_free_block_nodirty(inode, *count-num);
+		mark_inode_dirty(inode);
+		*count = num;
+	}
 	return ret_block;
 
 io_error:
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 0f4f5c92925..6e1d4ab09d7 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -287,17 +287,17 @@ static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
 }
 
 static int
-ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
+ext2_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
 	unsigned char *types = NULL;
-	int need_revalidate = filp->f_version != inode->i_version;
+	int need_revalidate = file->f_version != inode->i_version;
 
 	if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
 		return 0;
@@ -314,16 +314,16 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 			ext2_error(sb, __func__,
 				   "bad page in #%lu",
 				   inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
 			return PTR_ERR(page);
 		}
 		kaddr = page_address(page);
 		if (unlikely(need_revalidate)) {
 			if (offset) {
 				offset = ext2_validate_entry(kaddr, offset, chunk_mask);
-				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
 			}
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 			need_revalidate = 0;
 		}
 		de = (ext2_dirent *)(kaddr+offset);
@@ -336,22 +336,19 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 				return -EIO;
 			}
 			if (de->inode) {
-				int over;
 				unsigned char d_type = DT_UNKNOWN;
 
 				if (types && de->file_type < EXT2_FT_MAX)
 					d_type = types[de->file_type];
 
-				offset = (char *)de - kaddr;
-				over = filldir(dirent, de->name, de->name_len,
-						(n<<PAGE_CACHE_SHIFT) | offset,
-						le32_to_cpu(de->inode), d_type);
-				if (over) {
+				if (!dir_emit(ctx, de->name, de->name_len,
+						le32_to_cpu(de->inode),
+						d_type)) {
 					ext2_put_page(page);
 					return 0;
 				}
 			}
-			filp->f_pos += ext2_rec_len_from_disk(de->rec_len);
+			ctx->pos += ext2_rec_len_from_disk(de->rec_len);
 		}
 		ext2_put_page(page);
 	}
@@ -724,7 +721,7 @@ not_empty:
 const struct file_operations ext2_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ext2_readdir,
+	.iterate	= ext2_readdir,
 	.unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 8f370e012e6..7cadd823bb3 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -118,7 +118,6 @@ void ext2_free_inode (struct inode * inode)
 	 * as writing the quota to disk may need the lock as well.
 	 */
 	/* Quota is already initialized in iput() */
-	ext2_xattr_delete_inode(inode);
 	dquot_free_inode(inode);
 	dquot_drop(inode);
 
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 6363ac66faf..0a87bb10998 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,9 +31,11 @@
 #include <linux/mpage.h>
 #include <linux/fiemap.h>
 #include <linux/namei.h>
+#include <linux/aio.h>
 #include "ext2.h"
 #include "acl.h"
 #include "xip.h"
+#include "xattr.h"
 
 static int __ext2_write_inode(struct inode *inode, int do_sync);
 
@@ -88,6 +90,7 @@ void ext2_evict_inode(struct inode * inode)
 		inode->i_size = 0;
 		if (inode->i_blocks)
 			ext2_truncate_blocks(inode, 0);
+		ext2_xattr_delete_inode(inode);
 	}
 
 	invalidate_inode_buffers(inode);
@@ -495,6 +498,10 @@ static int ext2_alloc_branch(struct inode *inode,
 		 * parent to disk.
 		 */
 		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+		if (unlikely(!bh)) {
+			err = -ENOMEM;
+			goto failed;
+		}
 		branch[n].bh = bh;
 		lock_buffer(bh);
 		memset(bh->b_data, 0, blocksize);
@@ -523,6 +530,14 @@ static int ext2_alloc_branch(struct inode *inode,
 	}
 	*blks = num;
 	return err;
+
+failed:
+	for (i = 1; i < n; i++)
+		bforget(branch[i].bh);
+	for (i = 0; i < indirect_blks; i++)
+		ext2_free_blocks(inode, new_blocks[i], 1);
+	ext2_free_blocks(inode, new_blocks[i], num);
+	return err;
 }
 
 /**
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 2de655f5d62..5d46c09863f 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -19,7 +19,7 @@
 
 long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct ext2_inode_info *ei = EXT2_I(inode);
 	unsigned int flags;
 	unsigned short rsv_window_size;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 73b0d951983..256dd5f4c1c 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -119,6 +119,29 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
 	return ext2_add_nondir(dentry, inode);
 }
 
+static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode = ext2_new_inode(dir, mode, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &ext2_file_inode_operations;
+	if (ext2_use_xip(inode->i_sb)) {
+		inode->i_mapping->a_ops = &ext2_aops_xip;
+		inode->i_fop = &ext2_xip_file_operations;
+	} else if (test_opt(inode->i_sb, NOBH)) {
+		inode->i_mapping->a_ops = &ext2_nobh_aops;
+		inode->i_fop = &ext2_file_operations;
+	} else {
+		inode->i_mapping->a_ops = &ext2_aops;
+		inode->i_fop = &ext2_file_operations;
+	}
+	mark_inode_dirty(inode);
+	d_tmpfile(dentry, inode);
+	unlock_new_inode(inode);
+	return 0;
+}
+
 static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode * inode;
@@ -398,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = {
 #endif
 	.setattr	= ext2_setattr,
 	.get_acl	= ext2_get_acl,
+	.tmpfile	= ext2_tmpfile,
 };
 
 const struct inode_operations ext2_special_inode_operations = {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index fa04d023177..288534920fe 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1500,7 +1500,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
 			bh = sb_bread(sb, tmp_bh.b_blocknr);
 		else
 			bh = sb_getblk(sb, tmp_bh.b_blocknr);
-		if (!bh) {
+		if (unlikely(!bh)) {
 			err = -EIO;
 			goto out;
 		}
@@ -1536,6 +1536,7 @@ static struct file_system_type ext2_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("ext2");
 
 static int __init init_ext2_fs(void)
 {
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index b6754dbbce3..2d7557db3ae 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -662,10 +662,10 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 			ea_idebug(inode, "creating block %d", block);
 
 			new_bh = sb_getblk(sb, block);
-			if (!new_bh) {
+			if (unlikely(!new_bh)) {
 				ext2_free_blocks(inode, block, 1);
 				mark_inode_dirty(inode);
-				error = -EIO;
+				error = -ENOMEM;
 				goto cleanup;
 			}
 			lock_buffer(new_bh);
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index dd91264ba94..f522425aaa2 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -28,8 +28,7 @@ static unsigned char ext3_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
-static int ext3_dx_readdir(struct file * filp,
-			   void * dirent, filldir_t filldir);
+static int ext3_dx_readdir(struct file *, struct dir_context *);
 
 static unsigned char get_dtype(struct super_block *sb, int filetype)
 {
@@ -91,36 +90,30 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
 	return error_msg == NULL ? 1 : 0;
 }
 
-static int ext3_readdir(struct file * filp,
-			 void * dirent, filldir_t filldir)
+static int ext3_readdir(struct file *file, struct dir_context *ctx)
 {
-	int error = 0;
 	unsigned long offset;
-	int i, stored;
+	int i;
 	struct ext3_dir_entry_2 *de;
 	int err;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	int ret = 0;
 	int dir_has_error = 0;
 
 	if (is_dx_dir(inode)) {
-		err = ext3_dx_readdir(filp, dirent, filldir);
-		if (err != ERR_BAD_DX_DIR) {
-			ret = err;
-			goto out;
-		}
+		err = ext3_dx_readdir(file, ctx);
+		if (err != ERR_BAD_DX_DIR)
+			return err;
 		/*
 		 * We don't set the inode dirty flag since it's not
 		 * critical that it get flushed back to the disk.
 		 */
-		EXT3_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
+		EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
 	}
-	stored = 0;
-	offset = filp->f_pos & (sb->s_blocksize - 1);
+	offset = ctx->pos & (sb->s_blocksize - 1);
 
-	while (!error && !stored && filp->f_pos < inode->i_size) {
-		unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb);
+	while (ctx->pos < inode->i_size) {
+		unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb);
 		struct buffer_head map_bh;
 		struct buffer_head *bh = NULL;
 
@@ -129,12 +122,12 @@ static int ext3_readdir(struct file * filp,
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
-			if (!ra_has_index(&filp->f_ra, index))
+			if (!ra_has_index(&file->f_ra, index))
 				page_cache_sync_readahead(
 					sb->s_bdev->bd_inode->i_mapping,
-					&filp->f_ra, filp,
+					&file->f_ra, file,
 					index, 1);
-			filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 			bh = ext3_bread(NULL, inode, blk, 0, &err);
 		}
 
@@ -146,22 +139,21 @@ static int ext3_readdir(struct file * filp,
 			if (!dir_has_error) {
 				ext3_error(sb, __func__, "directory #%lu "
 					"contains a hole at offset %lld",
-					inode->i_ino, filp->f_pos);
+					inode->i_ino, ctx->pos);
 				dir_has_error = 1;
 			}
 			/* corrupt size?  Maybe no more blocks to read */
-			if (filp->f_pos > inode->i_blocks << 9)
+			if (ctx->pos > inode->i_blocks << 9)
 				break;
-			filp->f_pos += sb->s_blocksize - offset;
+			ctx->pos += sb->s_blocksize - offset;
 			continue;
 		}
 
-revalidate:
 		/* If the dir block has changed since the last call to
 		 * readdir(2), then we might be pointing to an invalid
 		 * dirent right now.  Scan from the start of the block
 		 * to make sure. */
-		if (filp->f_version != inode->i_version) {
+		if (offset && file->f_version != inode->i_version) {
 			for (i = 0; i < sb->s_blocksize && i < offset; ) {
 				de = (struct ext3_dir_entry_2 *)
 					(bh->b_data + i);
@@ -177,53 +169,40 @@ revalidate:
 				i += ext3_rec_len_from_disk(de->rec_len);
 			}
 			offset = i;
-			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
 				| offset;
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 		}
 
-		while (!error && filp->f_pos < inode->i_size
+		while (ctx->pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
 			if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
 						   bh, offset)) {
-				/* On error, skip the f_pos to the
+				/* On error, skip the to the
                                    next block. */
-				filp->f_pos = (filp->f_pos |
+				ctx->pos = (ctx->pos |
 						(sb->s_blocksize - 1)) + 1;
-				brelse (bh);
-				ret = stored;
-				goto out;
+				break;
 			}
 			offset += ext3_rec_len_from_disk(de->rec_len);
 			if (le32_to_cpu(de->inode)) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation.
-				 */
-				u64 version = filp->f_version;
-
-				error = filldir(dirent, de->name,
-						de->name_len,
-						filp->f_pos,
-						le32_to_cpu(de->inode),
-						get_dtype(sb, de->file_type));
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored ++;
+				if (!dir_emit(ctx, de->name, de->name_len,
+					      le32_to_cpu(de->inode),
+					      get_dtype(sb, de->file_type))) {
+					brelse(bh);
+					return 0;
+				}
 			}
-			filp->f_pos += ext3_rec_len_from_disk(de->rec_len);
+			ctx->pos += ext3_rec_len_from_disk(de->rec_len);
 		}
 		offset = 0;
 		brelse (bh);
+		if (ctx->pos < inode->i_size)
+			if (!dir_relax(inode))
+				return 0;
 	}
-out:
-	return ret;
+	return 0;
 }
 
 static inline int is_32bit_api(void)
@@ -452,62 +431,54 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
  * for all entres on the fname linked list.  (Normally there is only
  * one entry on the linked list, unless there are 62 bit hash collisions.)
  */
-static int call_filldir(struct file * filp, void * dirent,
-			filldir_t filldir, struct fname *fname)
+static bool call_filldir(struct file *file, struct dir_context *ctx,
+			struct fname *fname)
 {
-	struct dir_private_info *info = filp->private_data;
-	loff_t	curr_pos;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct super_block * sb;
-	int error;
-
-	sb = inode->i_sb;
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
 
 	if (!fname) {
 		printk("call_filldir: called with null fname?!?\n");
-		return 0;
+		return true;
 	}
-	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+	ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
 	while (fname) {
-		error = filldir(dirent, fname->name,
-				fname->name_len, curr_pos,
+		if (!dir_emit(ctx, fname->name, fname->name_len,
 				fname->inode,
-				get_dtype(sb, fname->file_type));
-		if (error) {
-			filp->f_pos = curr_pos;
+				get_dtype(sb, fname->file_type))) {
 			info->extra_fname = fname;
-			return error;
+			return false;
 		}
 		fname = fname->next;
 	}
-	return 0;
+	return true;
 }
 
-static int ext3_dx_readdir(struct file * filp,
-			 void * dirent, filldir_t filldir)
+static int ext3_dx_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dir_private_info *info = filp->private_data;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
 	struct fname *fname;
 	int	ret;
 
 	if (!info) {
-		info = ext3_htree_create_dir_info(filp, filp->f_pos);
+		info = ext3_htree_create_dir_info(file, ctx->pos);
 		if (!info)
 			return -ENOMEM;
-		filp->private_data = info;
+		file->private_data = info;
 	}
 
-	if (filp->f_pos == ext3_get_htree_eof(filp))
+	if (ctx->pos == ext3_get_htree_eof(file))
 		return 0;	/* EOF */
 
 	/* Some one has messed with f_pos; reset the world */
-	if (info->last_pos != filp->f_pos) {
+	if (info->last_pos != ctx->pos) {
 		free_rb_tree_fname(&info->root);
 		info->curr_node = NULL;
 		info->extra_fname = NULL;
-		info->curr_hash = pos2maj_hash(filp, filp->f_pos);
-		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+		info->curr_hash = pos2maj_hash(file, ctx->pos);
+		info->curr_minor_hash = pos2min_hash(file, ctx->pos);
 	}
 
 	/*
@@ -515,7 +486,7 @@ static int ext3_dx_readdir(struct file * filp,
 	 * chain, return them first.
 	 */
 	if (info->extra_fname) {
-		if (call_filldir(filp, dirent, filldir, info->extra_fname))
+		if (!call_filldir(file, ctx, info->extra_fname))
 			goto finished;
 		info->extra_fname = NULL;
 		goto next_node;
@@ -529,17 +500,17 @@ static int ext3_dx_readdir(struct file * filp,
 		 * cached entries.
 		 */
 		if ((!info->curr_node) ||
-		    (filp->f_version != inode->i_version)) {
+		    (file->f_version != inode->i_version)) {
 			info->curr_node = NULL;
 			free_rb_tree_fname(&info->root);
-			filp->f_version = inode->i_version;
-			ret = ext3_htree_fill_tree(filp, info->curr_hash,
+			file->f_version = inode->i_version;
+			ret = ext3_htree_fill_tree(file, info->curr_hash,
 						   info->curr_minor_hash,
 						   &info->next_hash);
 			if (ret < 0)
 				return ret;
 			if (ret == 0) {
-				filp->f_pos = ext3_get_htree_eof(filp);
+				ctx->pos = ext3_get_htree_eof(file);
 				break;
 			}
 			info->curr_node = rb_first(&info->root);
@@ -548,7 +519,7 @@ static int ext3_dx_readdir(struct file * filp,
 		fname = rb_entry(info->curr_node, struct fname, rb_hash);
 		info->curr_hash = fname->hash;
 		info->curr_minor_hash = fname->minor_hash;
-		if (call_filldir(filp, dirent, filldir, fname))
+		if (!call_filldir(file, ctx, fname))
 			break;
 	next_node:
 		info->curr_node = rb_next(info->curr_node);
@@ -559,7 +530,7 @@ static int ext3_dx_readdir(struct file * filp,
 			info->curr_minor_hash = fname->minor_hash;
 		} else {
 			if (info->next_hash == ~0) {
-				filp->f_pos = ext3_get_htree_eof(filp);
+				ctx->pos = ext3_get_htree_eof(file);
 				break;
 			}
 			info->curr_hash = info->next_hash;
@@ -567,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp,
 		}
 	}
 finished:
-	info->last_pos = filp->f_pos;
+	info->last_pos = ctx->pos;
 	return 0;
 }
 
@@ -582,7 +553,7 @@ static int ext3_release_dir (struct inode * inode, struct file * filp)
 const struct file_operations ext3_dir_operations = {
 	.llseek		= ext3_dir_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ext3_readdir,
+	.iterate	= ext3_readdir,
 	.unlocked_ioctl = ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index b31dbd4c46a..1cb9c7e10c6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -48,9 +48,13 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	trace_ext3_sync_file_enter(file, datasync);
 
-	if (inode->i_sb->s_flags & MS_RDONLY)
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read updated state */
+		smp_rmb();
+		if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
+			return -EROFS;
 		return 0;
-
+	}
 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	if (ret)
 		goto out;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b176d425354..2bd85486b87 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -27,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/mpage.h>
 #include <linux/namei.h>
+#include <linux/aio.h>
 #include "ext3.h"
 #include "xattr.h"
 #include "acl.h"
@@ -218,7 +219,8 @@ void ext3_evict_inode (struct inode *inode)
 	 */
 	if (inode->i_nlink && ext3_should_journal_data(inode) &&
 	    EXT3_SB(inode->i_sb)->s_journal &&
-	    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
+	    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+	    inode->i_ino != EXT3_JOURNAL_INO) {
 		tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
 		journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
 
@@ -676,6 +678,10 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
 		 * parent to disk.
 		 */
 		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+		if (unlikely(!bh)) {
+			err = -ENOMEM;
+			goto failed;
+		}
 		branch[n].bh = bh;
 		lock_buffer(bh);
 		BUFFER_TRACE(bh, "call get_create_access");
@@ -717,7 +723,7 @@ failed:
 		BUFFER_TRACE(branch[i].bh, "call journal_forget");
 		ext3_journal_forget(handle, branch[i].bh);
 	}
-	for (i = 0; i <indirect_blks; i++)
+	for (i = 0; i < indirect_blks; i++)
 		ext3_free_blocks(handle, inode, new_blocks[i], 1);
 
 	ext3_free_blocks(handle, inode, new_blocks[i], num);
@@ -1078,8 +1084,8 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
 	if (!err && buffer_mapped(&dummy)) {
 		struct buffer_head *bh;
 		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
-		if (!bh) {
-			*errp = -EIO;
+		if (unlikely(!bh)) {
+			*errp = -ENOMEM;
 			goto err;
 		}
 		if (buffer_new(&dummy)) {
@@ -1819,19 +1825,20 @@ ext3_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
 }
 
-static void ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
 	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
 
-	trace_ext3_invalidatepage(page, offset);
+	trace_ext3_invalidatepage(page, offset, length);
 
 	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		ClearPageChecked(page);
 
-	journal_invalidatepage(journal, page, offset);
+	journal_invalidatepage(journal, page, offset, length);
 }
 
 static int ext3_releasepage(struct page *page, gfp_t wait)
@@ -1978,6 +1985,7 @@ static const struct address_space_operations ext3_ordered_aops = {
 	.direct_IO		= ext3_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
 	.error_remove_page	= generic_error_remove_page,
 };
 
@@ -2729,12 +2737,12 @@ static int __ext3_get_inode_loc(struct inode *inode,
 		return -EIO;
 
 	bh = sb_getblk(inode->i_sb, block);
-	if (!bh) {
+	if (unlikely(!bh)) {
 		ext3_error (inode->i_sb, "ext3_get_inode_loc",
 				"unable to read inode block - "
 				"inode=%lu, block="E3FSBLK,
 				 inode->i_ino, block);
-		return -EIO;
+		return -ENOMEM;
 	}
 	if (!buffer_uptodate(bh)) {
 		lock_buffer(bh);
@@ -2783,7 +2791,7 @@ static int __ext3_get_inode_loc(struct inode *inode,
 
 			bitmap_bh = sb_getblk(inode->i_sb,
 					le32_to_cpu(desc->bg_inode_bitmap));
-			if (!bitmap_bh)
+			if (unlikely(!bitmap_bh))
 				goto make_io;
 
 			/*
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 677a5c27dc6..4d96e9a6453 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -14,7 +14,7 @@
 
 long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	unsigned int flags;
 	unsigned short rsv_window_size;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 890b8947c54..998ea111e53 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -36,7 +36,6 @@
 #define NAMEI_RA_CHUNKS  2
 #define NAMEI_RA_BLOCKS  4
 #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
 
 static struct buffer_head *ext3_append(handle_t *handle,
 					struct inode *inode,
@@ -577,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 		if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
 					(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
 						+((char *)de - bh->b_data))) {
-			/* On error, skip the f_pos to the next block. */
-			dir_file->f_pos = (dir_file->f_pos |
-					(dir->i_sb->s_blocksize - 1)) + 1;
-			brelse (bh);
-			return count;
+			/* silently ignore the rest of the block */
+			break;
 		}
 		ext3fs_dirhash(de->name, de->name_len, hinfo);
 		if ((hinfo->hash < start_hash) ||
@@ -624,7 +620,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 
 	dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
 		       start_minor_hash));
-	dir = dir_file->f_path.dentry->d_inode;
+	dir = file_inode(dir_file);
 	if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
 		hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
 		if (hinfo.hash_version <= DX_HASH_TEA)
@@ -638,7 +634,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	}
 	hinfo.hash = start_hash;
 	hinfo.minor_hash = 0;
-	frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err);
+	frame = dx_probe(NULL, file_inode(dir_file), &hinfo, frames, &err);
 	if (!frame)
 		return err;
 
@@ -1763,6 +1759,45 @@ retry:
 	return err;
 }
 
+static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int err, retries = 0;
+
+	dquot_initialize(dir);
+
+retry:
+	handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+			  4 + EXT3_XATTR_TRANS_BLOCKS);
+
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	inode = ext3_new_inode (handle, dir, NULL, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		inode->i_op = &ext3_file_inode_operations;
+		inode->i_fop = &ext3_file_operations;
+		ext3_set_aops(inode);
+		err = ext3_orphan_add(handle, inode);
+		if (err)
+			goto err_drop_inode;
+		mark_inode_dirty(inode);
+		d_tmpfile(dentry, inode);
+		unlock_new_inode(inode);
+	}
+	ext3_journal_stop(handle);
+	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+err_drop_inode:
+	ext3_journal_stop(handle);
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+
 static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 {
 	handle_t *handle;
@@ -2304,7 +2339,7 @@ static int ext3_link (struct dentry * old_dentry,
 
 retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-					EXT3_INDEX_EXTRA_TRANS_BLOCKS);
+					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -2318,6 +2353,11 @@ retry:
 	err = ext3_add_entry(handle, dentry, inode);
 	if (!err) {
 		ext3_mark_inode_dirty(handle, inode);
+		/* this can happen only for tmpfile being
+		 * linked the first time
+		 */
+		if (inode->i_nlink == 1)
+			ext3_orphan_del(handle, inode);
 		d_instantiate(dentry, inode);
 	} else {
 		drop_nlink(inode);
@@ -2520,6 +2560,7 @@ const struct inode_operations ext3_dir_inode_operations = {
 	.mkdir		= ext3_mkdir,
 	.rmdir		= ext3_rmdir,
 	.mknod		= ext3_mknod,
+	.tmpfile	= ext3_tmpfile,
 	.rename		= ext3_rename,
 	.setattr	= ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 0f814f3450d..27105655502 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -116,8 +116,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
 	int err;
 
 	bh = sb_getblk(sb, blk);
-	if (!bh)
-		return ERR_PTR(-EIO);
+	if (unlikely(!bh))
+		return ERR_PTR(-ENOMEM);
 	if ((err = ext3_journal_get_write_access(handle, bh))) {
 		brelse(bh);
 		bh = ERR_PTR(err);
@@ -234,8 +234,8 @@ static int setup_new_group_blocks(struct super_block *sb,
 			goto exit_bh;
 
 		gdb = sb_getblk(sb, block);
-		if (!gdb) {
-			err = -EIO;
+		if (unlikely(!gdb)) {
+			err = -ENOMEM;
 			goto exit_bh;
 		}
 		if ((err = ext3_journal_get_write_access(handle, gdb))) {
@@ -722,8 +722,8 @@ static void update_backups(struct super_block *sb,
 			break;
 
 		bh = sb_getblk(sb, group * bpg + blk_off);
-		if (!bh) {
-			err = -EIO;
+		if (unlikely(!bh)) {
+			err = -ENOMEM;
 			break;
 		}
 		ext3_debug("update metadata backup %#04lx\n",
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6e50223b329..c47f1475072 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -174,6 +174,11 @@ static void ext3_handle_error(struct super_block *sb)
 	if (test_opt (sb, ERRORS_RO)) {
 		ext3_msg(sb, KERN_CRIT,
 			"error: remounting filesystem read-only");
+		/*
+		 * Make sure updated value of ->s_mount_state will be visible
+		 * before ->s_flags update.
+		 */
+		smp_wmb();
 		sb->s_flags |= MS_RDONLY;
 	}
 	ext3_commit_super(sb, es, 1);
@@ -291,8 +296,14 @@ void ext3_abort(struct super_block *sb, const char *function,
 	ext3_msg(sb, KERN_CRIT,
 		"error: remounting filesystem read-only");
 	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
-	sb->s_flags |= MS_RDONLY;
 	set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
+	/*
+	 * Make sure updated value of ->s_mount_state will be visible
+	 * before ->s_flags update.
+	 */
+	smp_wmb();
+	sb->s_flags |= MS_RDONLY;
+
 	if (EXT3_SB(sb)->s_journal)
 		journal_abort(EXT3_SB(sb)->s_journal, -EIO);
 }
@@ -353,7 +364,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
 	return bdev;
 
 fail:
-	ext3_msg(sb, "error: failed to open journal device %s: %ld",
+	ext3_msg(sb, KERN_ERR, "error: failed to open journal device %s: %ld",
 		__bdevname(dev, b), PTR_ERR(bdev));
 
 	return NULL;
@@ -362,22 +373,19 @@ fail:
 /*
  * Release the journal device
  */
-static int ext3_blkdev_put(struct block_device *bdev)
+static void ext3_blkdev_put(struct block_device *bdev)
 {
-	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
-static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
+static void ext3_blkdev_remove(struct ext3_sb_info *sbi)
 {
 	struct block_device *bdev;
-	int ret = -ENODEV;
-
 	bdev = sbi->journal_bdev;
 	if (bdev) {
-		ret = ext3_blkdev_put(bdev);
+		ext3_blkdev_put(bdev);
 		sbi->journal_bdev = NULL;
 	}
-	return ret;
 }
 
 static inline struct inode *orphan_list_entry(struct list_head *l)
@@ -887,7 +895,7 @@ static ext3_fsblk_t get_sb_block(void **data, struct super_block *sb)
 	/*todo: use simple_strtoll with >32bit ext3 */
 	sb_block = simple_strtoul(options, &options, 0);
 	if (*options && *options != ',') {
-		ext3_msg(sb, "error: invalid sb specification: %s",
+		ext3_msg(sb, KERN_ERR, "error: invalid sb specification: %s",
 		       (char *) *data);
 		return 1;
 	}
@@ -916,21 +924,24 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
 			"Not enough memory for storing quotafile name");
 		return 0;
 	}
-	if (sbi->s_qf_names[qtype] &&
-		strcmp(sbi->s_qf_names[qtype], qname)) {
-		ext3_msg(sb, KERN_ERR,
-			"%s quota file already specified", QTYPE2NAME(qtype));
+	if (sbi->s_qf_names[qtype]) {
+		int same = !strcmp(sbi->s_qf_names[qtype], qname);
+
 		kfree(qname);
-		return 0;
+		if (!same) {
+			ext3_msg(sb, KERN_ERR,
+				 "%s quota file already specified",
+				 QTYPE2NAME(qtype));
+		}
+		return same;
 	}
-	sbi->s_qf_names[qtype] = qname;
-	if (strchr(sbi->s_qf_names[qtype], '/')) {
+	if (strchr(qname, '/')) {
 		ext3_msg(sb, KERN_ERR,
 			"quotafile must be on filesystem root");
-		kfree(sbi->s_qf_names[qtype]);
-		sbi->s_qf_names[qtype] = NULL;
+		kfree(qname);
 		return 0;
 	}
+	sbi->s_qf_names[qtype] = qname;
 	set_opt(sbi->s_mount_opt, QUOTA);
 	return 1;
 }
@@ -945,11 +956,10 @@ static int clear_qf_name(struct super_block *sb, int qtype) {
 			" when quota turned on");
 		return 0;
 	}
-	/*
-	 * The space will be released later when all options are confirmed
-	 * to be correct
-	 */
-	sbi->s_qf_names[qtype] = NULL;
+	if (sbi->s_qf_names[qtype]) {
+		kfree(sbi->s_qf_names[qtype]);
+		sbi->s_qf_names[qtype] = NULL;
+	}
 	return 1;
 }
 #endif
@@ -2605,7 +2615,18 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 #ifdef CONFIG_QUOTA
 	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
 	for (i = 0; i < MAXQUOTAS; i++)
-		old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+		if (sbi->s_qf_names[i]) {
+			old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+							 GFP_KERNEL);
+			if (!old_opts.s_qf_names[i]) {
+				int j;
+
+				for (j = 0; j < i; j++)
+					kfree(old_opts.s_qf_names[j]);
+				return -ENOMEM;
+			}
+		} else
+			old_opts.s_qf_names[i] = NULL;
 #endif
 
 	/*
@@ -2698,9 +2719,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 #ifdef CONFIG_QUOTA
 	/* Release old quota file names */
 	for (i = 0; i < MAXQUOTAS; i++)
-		if (old_opts.s_qf_names[i] &&
-		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
-			kfree(old_opts.s_qf_names[i]);
+		kfree(old_opts.s_qf_names[i]);
 #endif
 	if (enable_quota)
 		dquot_resume(sb, -1);
@@ -2714,9 +2733,7 @@ restore_opts:
 #ifdef CONFIG_QUOTA
 	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
 	for (i = 0; i < MAXQUOTAS; i++) {
-		if (sbi->s_qf_names[i] &&
-		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
-			kfree(sbi->s_qf_names[i]);
+		kfree(sbi->s_qf_names[i]);
 		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
 	}
 #endif
@@ -3058,6 +3075,7 @@ static struct file_system_type ext3_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("ext3");
 
 static int __init init_ext3_fs(void)
 {
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index d22ebb7a4f5..b1fc96383e0 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -813,10 +813,10 @@ inserted:
 			ea_idebug(inode, "creating block %d", block);
 
 			new_bh = sb_getblk(sb, block);
-			if (!new_bh) {
+			if (unlikely(!new_bh)) {
 getblk_failed:
 				ext3_free_blocks(handle, inode, block, 1);
-				error = -EIO;
+				error = -ENOMEM;
 				goto cleanup;
 			}
 			lock_buffer(new_bh);
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 987358740cb..efea5d5c44c 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -71,4 +71,5 @@ config EXT4_DEBUG
 	  Enables run-time debugging support for the ext4 filesystem.
 
 	  If you select Y here, then you will be able to turn on debugging
-	  with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
+	  with a command such as:
+		echo 1 > /sys/module/ext4/parameters/mballoc_debug
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index e6e0d988439..39a54a0e9fe 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -324,8 +324,8 @@ ext4_acl_chmod(struct inode *inode)
 	if (error)
 		return error;
 retry:
-	handle = ext4_journal_start(inode,
-			EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+	handle = ext4_journal_start(inode, EXT4_HT_XATTR,
+				    ext4_jbd2_credits_xattr(inode));
 	if (IS_ERR(handle)) {
 		error = PTR_ERR(handle);
 		ext4_std_error(inode->i_sb, error);
@@ -422,7 +422,8 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
 		acl = NULL;
 
 retry:
-	handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+	handle = ext4_journal_start(inode, EXT4_HT_XATTR,
+				    ext4_jbd2_credits_xattr(inode));
 	if (IS_ERR(handle)) {
 		error = PTR_ERR(handle);
 		goto release_and_out;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index cf1821784a1..58339393fa6 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -30,6 +30,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
  */
 
 /*
+ * Calculate block group number for a given block number
+ */
+ext4_group_t ext4_get_group_number(struct super_block *sb,
+				   ext4_fsblk_t block)
+{
+	ext4_group_t group;
+
+	if (test_opt2(sb, STD_GROUP_SIZE))
+		group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
+			 block) >>
+			(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
+	else
+		ext4_get_group_no_and_offset(sb, block, &group, NULL);
+	return group;
+}
+
+/*
  * Calculate the block group number and offset into the block/cluster
  * allocation bitmap, given a block number
  */
@@ -49,14 +66,18 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 
 }
 
-static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
-			ext4_group_t block_group)
+/*
+ * Check whether the 'block' lives within the 'block_group'. Returns 1 if so
+ * and 0 otherwise.
+ */
+static inline int ext4_block_in_group(struct super_block *sb,
+				      ext4_fsblk_t block,
+				      ext4_group_t block_group)
 {
 	ext4_group_t actual_group;
-	ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
-	if (actual_group == block_group)
-		return 1;
-	return 0;
+
+	actual_group = ext4_get_group_number(sb, block);
+	return (actual_group == block_group) ? 1 : 0;
 }
 
 /* Return the number of clusters used for file system metadata; this
@@ -358,7 +379,7 @@ void ext4_validate_block_bitmap(struct super_block *sb,
 }
 
 /**
- * ext4_read_block_bitmap()
+ * ext4_read_block_bitmap_nowait()
  * @sb:			super block
  * @block_group:	given block group
  *
@@ -420,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 	trace_ext4_read_block_bitmap_load(sb, block_group);
 	bh->b_end_io = ext4_end_bitmap_read;
 	get_bh(bh);
-	submit_bh(READ, bh);
+	submit_bh(READ | REQ_META | REQ_PRIO, bh);
 	return bh;
 verify:
 	ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -457,6 +478,8 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	struct buffer_head *bh;
 
 	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	if (!bh)
+		return NULL;
 	if (ext4_wait_block_bitmap(sb, block_group, bh)) {
 		put_bh(bh);
 		return NULL;
@@ -476,31 +499,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
 				  s64 nclusters, unsigned int flags)
 {
-	s64 free_clusters, dirty_clusters, root_clusters;
+	s64 free_clusters, dirty_clusters, rsv, resv_clusters;
 	struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
 	struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
 
 	free_clusters  = percpu_counter_read_positive(fcc);
 	dirty_clusters = percpu_counter_read_positive(dcc);
-	root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es));
+	resv_clusters = atomic64_read(&sbi->s_resv_clusters);
+
+	/*
+	 * r_blocks_count should always be multiple of the cluster ratio so
+	 * we are safe to do a plane bit shift only.
+	 */
+	rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
+	      resv_clusters;
 
-	if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+	if (free_clusters - (nclusters + rsv + dirty_clusters) <
 					EXT4_FREECLUSTERS_WATERMARK) {
-		free_clusters  = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc));
+		free_clusters  = percpu_counter_sum_positive(fcc);
 		dirty_clusters = percpu_counter_sum_positive(dcc);
 	}
 	/* Check whether we have space after accounting for current
 	 * dirty clusters & root reserved clusters.
 	 */
-	if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
+	if (free_clusters >= (rsv + nclusters + dirty_clusters))
 		return 1;
 
 	/* Hm, nope.  Are (enough) root reserved clusters available? */
 	if (uid_eq(sbi->s_resuid, current_fsuid()) ||
 	    (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
 	    capable(CAP_SYS_RESOURCE) ||
-		(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
 
+		if (free_clusters >= (nclusters + dirty_clusters +
+				      resv_clusters))
+			return 1;
+	}
+	/* No free blocks. Let's see if we can dip into reserved pool */
+	if (flags & EXT4_MB_USE_RESERVED) {
 		if (free_clusters >= (nclusters + dirty_clusters))
 			return 1;
 	}
@@ -628,7 +664,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 	brelse(bitmap_bh);
 	printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
 	       ", computed = %llu, %llu\n",
-	       EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
+	       EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
 	       desc_count, bitmap_count);
 	return bitmap_count;
 #else
@@ -646,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 
 static inline int test_root(ext4_group_t a, int b)
 {
-	int num = b;
-
-	while (a > num)
-		num *= b;
-	return num == a;
+	while (1) {
+		if (a < b)
+			return 0;
+		if (a == b)
+			return 1;
+		if ((a % b) != 0)
+			return 0;
+		a = a / b;
+	}
 }
 
 static int ext4_group_sparse(ext4_group_t group)
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 80a28b29727..3c7d288ae94 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -29,8 +29,7 @@
 #include "ext4.h"
 #include "xattr.h"
 
-static int ext4_dx_readdir(struct file *filp,
-			   void *dirent, filldir_t filldir);
+static int ext4_dx_readdir(struct file *, struct dir_context *);
 
 /**
  * Check if the given dir-inode refers to an htree-indexed directory
@@ -46,7 +45,8 @@ static int is_dx_dir(struct inode *inode)
 	if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
 		     EXT4_FEATURE_COMPAT_DIR_INDEX) &&
 	    ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
-	     ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+	     ((inode->i_size >> sb->s_blocksize_bits) == 1) ||
+	     ext4_has_inline_data(inode)))
 		return 1;
 
 	return 0;
@@ -102,59 +102,56 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 	return 1;
 }
 
-static int ext4_readdir(struct file *filp,
-			 void *dirent, filldir_t filldir)
+static int ext4_readdir(struct file *file, struct dir_context *ctx)
 {
-	int error = 0;
 	unsigned int offset;
 	int i, stored;
 	struct ext4_dir_entry_2 *de;
 	int err;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	int ret = 0;
 	int dir_has_error = 0;
 
-	if (ext4_has_inline_data(inode)) {
-		int has_inline_data = 1;
-		ret = ext4_read_inline_dir(filp, dirent, filldir,
-					   &has_inline_data);
-		if (has_inline_data)
-			return ret;
-	}
-
 	if (is_dx_dir(inode)) {
-		err = ext4_dx_readdir(filp, dirent, filldir);
+		err = ext4_dx_readdir(file, ctx);
 		if (err != ERR_BAD_DX_DIR) {
-			ret = err;
-			goto out;
+			return err;
 		}
 		/*
 		 * We don't set the inode dirty flag since it's not
 		 * critical that it get flushed back to the disk.
 		 */
-		ext4_clear_inode_flag(filp->f_path.dentry->d_inode,
+		ext4_clear_inode_flag(file_inode(file),
 				      EXT4_INODE_INDEX);
 	}
+
+	if (ext4_has_inline_data(inode)) {
+		int has_inline_data = 1;
+		int ret = ext4_read_inline_dir(file, ctx,
+					   &has_inline_data);
+		if (has_inline_data)
+			return ret;
+	}
+
 	stored = 0;
-	offset = filp->f_pos & (sb->s_blocksize - 1);
+	offset = ctx->pos & (sb->s_blocksize - 1);
 
-	while (!error && !stored && filp->f_pos < inode->i_size) {
+	while (ctx->pos < inode->i_size) {
 		struct ext4_map_blocks map;
 		struct buffer_head *bh = NULL;
 
-		map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+		map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
 		map.m_len = 1;
 		err = ext4_map_blocks(NULL, inode, &map, 0);
 		if (err > 0) {
 			pgoff_t index = map.m_pblk >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
-			if (!ra_has_index(&filp->f_ra, index))
+			if (!ra_has_index(&file->f_ra, index))
 				page_cache_sync_readahead(
 					sb->s_bdev->bd_inode->i_mapping,
-					&filp->f_ra, filp,
+					&file->f_ra, file,
 					index, 1);
-			filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 			bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
 		}
 
@@ -164,16 +161,16 @@ static int ext4_readdir(struct file *filp,
 		 */
 		if (!bh) {
 			if (!dir_has_error) {
-				EXT4_ERROR_FILE(filp, 0,
+				EXT4_ERROR_FILE(file, 0,
 						"directory contains a "
 						"hole at offset %llu",
-					   (unsigned long long) filp->f_pos);
+					   (unsigned long long) ctx->pos);
 				dir_has_error = 1;
 			}
 			/* corrupt size?  Maybe no more blocks to read */
-			if (filp->f_pos > inode->i_blocks << 9)
+			if (ctx->pos > inode->i_blocks << 9)
 				break;
-			filp->f_pos += sb->s_blocksize - offset;
+			ctx->pos += sb->s_blocksize - offset;
 			continue;
 		}
 
@@ -181,20 +178,20 @@ static int ext4_readdir(struct file *filp,
 		if (!buffer_verified(bh) &&
 		    !ext4_dirent_csum_verify(inode,
 				(struct ext4_dir_entry *)bh->b_data)) {
-			EXT4_ERROR_FILE(filp, 0, "directory fails checksum "
+			EXT4_ERROR_FILE(file, 0, "directory fails checksum "
 					"at offset %llu",
-					(unsigned long long)filp->f_pos);
-			filp->f_pos += sb->s_blocksize - offset;
+					(unsigned long long)ctx->pos);
+			ctx->pos += sb->s_blocksize - offset;
+			brelse(bh);
 			continue;
 		}
 		set_buffer_verified(bh);
 
-revalidate:
 		/* If the dir block has changed since the last call to
 		 * readdir(2), then we might be pointing to an invalid
 		 * dirent right now.  Scan from the start of the block
 		 * to make sure. */
-		if (filp->f_version != inode->i_version) {
+		if (file->f_version != inode->i_version) {
 			for (i = 0; i < sb->s_blocksize && i < offset; ) {
 				de = (struct ext4_dir_entry_2 *)
 					(bh->b_data + i);
@@ -211,57 +208,46 @@ revalidate:
 							    sb->s_blocksize);
 			}
 			offset = i;
-			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
 				| offset;
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 		}
 
-		while (!error && filp->f_pos < inode->i_size
+		while (ctx->pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-			if (ext4_check_dir_entry(inode, filp, de, bh,
+			if (ext4_check_dir_entry(inode, file, de, bh,
 						 bh->b_data, bh->b_size,
 						 offset)) {
 				/*
-				 * On error, skip the f_pos to the next block
+				 * On error, skip to the next block
 				 */
-				filp->f_pos = (filp->f_pos |
+				ctx->pos = (ctx->pos |
 						(sb->s_blocksize - 1)) + 1;
-				brelse(bh);
-				ret = stored;
-				goto out;
+				break;
 			}
 			offset += ext4_rec_len_from_disk(de->rec_len,
 					sb->s_blocksize);
 			if (le32_to_cpu(de->inode)) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation.
-				 */
-				u64 version = filp->f_version;
-
-				error = filldir(dirent, de->name,
+				if (!dir_emit(ctx, de->name,
 						de->name_len,
-						filp->f_pos,
 						le32_to_cpu(de->inode),
-						get_dtype(sb, de->file_type));
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored++;
+						get_dtype(sb, de->file_type))) {
+					brelse(bh);
+					return 0;
+				}
 			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+			ctx->pos += ext4_rec_len_from_disk(de->rec_len,
 						sb->s_blocksize);
 		}
 		offset = 0;
 		brelse(bh);
+		if (ctx->pos < inode->i_size) {
+			if (!dir_relax(inode))
+				return 0;
+		}
 	}
-out:
-	return ret;
+	return 0;
 }
 
 static inline int is_32bit_api(void)
@@ -333,7 +319,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp)
  *
  * For non-htree, ext4_llseek already chooses the proper max offset.
  */
-loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
+static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *inode = file->f_mapping->host;
 	int dx_dir = is_dx_dir(inode);
@@ -489,16 +475,12 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
  * for all entres on the fname linked list.  (Normally there is only
  * one entry on the linked list, unless there are 62 bit hash collisions.)
  */
-static int call_filldir(struct file *filp, void *dirent,
-			filldir_t filldir, struct fname *fname)
+static int call_filldir(struct file *file, struct dir_context *ctx,
+			struct fname *fname)
 {
-	struct dir_private_info *info = filp->private_data;
-	loff_t	curr_pos;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct super_block *sb;
-	int error;
-
-	sb = inode->i_sb;
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
 
 	if (!fname) {
 		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
@@ -506,47 +488,44 @@ static int call_filldir(struct file *filp, void *dirent,
 			 inode->i_ino, current->comm);
 		return 0;
 	}
-	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+	ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
 	while (fname) {
-		error = filldir(dirent, fname->name,
-				fname->name_len, curr_pos,
+		if (!dir_emit(ctx, fname->name,
+				fname->name_len,
 				fname->inode,
-				get_dtype(sb, fname->file_type));
-		if (error) {
-			filp->f_pos = curr_pos;
+				get_dtype(sb, fname->file_type))) {
 			info->extra_fname = fname;
-			return error;
+			return 1;
 		}
 		fname = fname->next;
 	}
 	return 0;
 }
 
-static int ext4_dx_readdir(struct file *filp,
-			 void *dirent, filldir_t filldir)
+static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dir_private_info *info = filp->private_data;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
 	struct fname *fname;
 	int	ret;
 
 	if (!info) {
-		info = ext4_htree_create_dir_info(filp, filp->f_pos);
+		info = ext4_htree_create_dir_info(file, ctx->pos);
 		if (!info)
 			return -ENOMEM;
-		filp->private_data = info;
+		file->private_data = info;
 	}
 
-	if (filp->f_pos == ext4_get_htree_eof(filp))
+	if (ctx->pos == ext4_get_htree_eof(file))
 		return 0;	/* EOF */
 
 	/* Some one has messed with f_pos; reset the world */
-	if (info->last_pos != filp->f_pos) {
+	if (info->last_pos != ctx->pos) {
 		free_rb_tree_fname(&info->root);
 		info->curr_node = NULL;
 		info->extra_fname = NULL;
-		info->curr_hash = pos2maj_hash(filp, filp->f_pos);
-		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+		info->curr_hash = pos2maj_hash(file, ctx->pos);
+		info->curr_minor_hash = pos2min_hash(file, ctx->pos);
 	}
 
 	/*
@@ -554,7 +533,7 @@ static int ext4_dx_readdir(struct file *filp,
 	 * chain, return them first.
 	 */
 	if (info->extra_fname) {
-		if (call_filldir(filp, dirent, filldir, info->extra_fname))
+		if (call_filldir(file, ctx, info->extra_fname))
 			goto finished;
 		info->extra_fname = NULL;
 		goto next_node;
@@ -568,17 +547,17 @@ static int ext4_dx_readdir(struct file *filp,
 		 * cached entries.
 		 */
 		if ((!info->curr_node) ||
-		    (filp->f_version != inode->i_version)) {
+		    (file->f_version != inode->i_version)) {
 			info->curr_node = NULL;
 			free_rb_tree_fname(&info->root);
-			filp->f_version = inode->i_version;
-			ret = ext4_htree_fill_tree(filp, info->curr_hash,
+			file->f_version = inode->i_version;
+			ret = ext4_htree_fill_tree(file, info->curr_hash,
 						   info->curr_minor_hash,
 						   &info->next_hash);
 			if (ret < 0)
 				return ret;
 			if (ret == 0) {
-				filp->f_pos = ext4_get_htree_eof(filp);
+				ctx->pos = ext4_get_htree_eof(file);
 				break;
 			}
 			info->curr_node = rb_first(&info->root);
@@ -587,7 +566,7 @@ static int ext4_dx_readdir(struct file *filp,
 		fname = rb_entry(info->curr_node, struct fname, rb_hash);
 		info->curr_hash = fname->hash;
 		info->curr_minor_hash = fname->minor_hash;
-		if (call_filldir(filp, dirent, filldir, fname))
+		if (call_filldir(file, ctx, fname))
 			break;
 	next_node:
 		info->curr_node = rb_next(info->curr_node);
@@ -598,7 +577,7 @@ static int ext4_dx_readdir(struct file *filp,
 			info->curr_minor_hash = fname->minor_hash;
 		} else {
 			if (info->next_hash == ~0) {
-				filp->f_pos = ext4_get_htree_eof(filp);
+				ctx->pos = ext4_get_htree_eof(file);
 				break;
 			}
 			info->curr_hash = info->next_hash;
@@ -606,7 +585,7 @@ static int ext4_dx_readdir(struct file *filp,
 		}
 	}
 finished:
-	info->last_pos = filp->f_pos;
+	info->last_pos = ctx->pos;
 	return 0;
 }
 
@@ -621,7 +600,7 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
 const struct file_operations ext4_dir_operations = {
 	.llseek		= ext4_dir_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ext4_readdir,
+	.iterate	= ext4_readdir,
 	.unlocked_ioctl = ext4_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext4_compat_ioctl,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8462eb3c33a..b577e45425b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_STREAM_ALLOC		0x0800
 /* Use reserved root blocks if needed */
 #define EXT4_MB_USE_ROOT_BLOCKS		0x1000
+/* Use blocks from reserved pool */
+#define EXT4_MB_USE_RESERVED		0x2000
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -175,61 +177,34 @@ struct ext4_map_blocks {
 };
 
 /*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
-	struct inode *inode;
-	sector_t b_blocknr;		/* start block number of extent */
-	size_t b_size;			/* size of extent */
-	unsigned long b_state;		/* state of the extent */
-	unsigned long first_page, next_page;	/* extent of pages */
-	struct writeback_control *wbc;
-	int io_done;
-	int pages_written;
-	int retval;
-};
-
-/*
  * Flags for ext4_io_end->flags
  */
 #define	EXT4_IO_END_UNWRITTEN	0x0001
-#define EXT4_IO_END_ERROR	0x0002
-#define EXT4_IO_END_QUEUED	0x0004
-#define EXT4_IO_END_DIRECT	0x0008
-
-struct ext4_io_page {
-	struct page	*p_page;
-	atomic_t	p_count;
-};
-
-#define MAX_IO_PAGES 128
+#define EXT4_IO_END_DIRECT	0x0002
 
 /*
- * For converting uninitialized extents on a work queue.
- *
- * 'page' is only used from the writepage() path; 'pages' is only used for
- * buffered writes; they are used to keep page references until conversion
- * takes place.  For AIO/DIO, neither field is filled in.
+ * For converting uninitialized extents on a work queue. 'handle' is used for
+ * buffered writeback.
  */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
+	handle_t		*handle;	/* handle reserved for extent
+						 * conversion */
 	struct inode		*inode;		/* file being written to */
+	struct bio		*bio;		/* Linked list of completed
+						 * bios covering the extent */
 	unsigned int		flag;		/* unwritten or not */
-	struct page		*page;		/* for writepage() path */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
-	struct work_struct	work;		/* data work queue */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
-	int			num_io_pages;   /* for writepages() */
-	struct ext4_io_page	*pages[MAX_IO_PAGES]; /* for writepages() */
+	atomic_t		count;		/* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
 	int			io_op;
 	struct bio		*io_bio;
 	ext4_io_end_t		*io_end;
-	struct ext4_io_page	*io_page;
 	sector_t		io_next_block;
 };
 
@@ -338,9 +313,9 @@ struct ext4_group_desc
  */
 
 struct flex_groups {
-	atomic_t free_inodes;
-	atomic_t free_clusters;
-	atomic_t used_dirs;
+	atomic64_t	free_clusters;
+	atomic_t	free_inodes;
+	atomic_t	used_dirs;
 };
 
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
@@ -406,7 +381,7 @@ struct flex_groups {
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE		0x004B80FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE		0x004380FF /* User modifiable flags */
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
@@ -560,9 +535,8 @@ enum {
 #define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
 						 EXT4_GET_BLOCKS_CREATE)
-	/* Caller is from the delayed allocation writeout path,
-	   so set the magic i_delalloc_reserve_flag after taking the
-	   inode allocation semaphore for */
+	/* Caller is from the delayed allocation writeout path
+	 * finally doing the actual allocation of delayed blocks */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
 	/* caller is from the direct IO path, request to creation of an
 	unitialized extents if not allocated, split the uninitialized
@@ -574,14 +548,17 @@ enum {
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
-	/* Punch out blocks of an extent */
-#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT		0x0020
+	/* Eventual metadata allocation (due to growing extent tree)
+	 * should not fail, so try to use reserved blocks for that.*/
+#define EXT4_GET_BLOCKS_METADATA_NOFAIL		0x0020
 	/* Don't normalize allocation size (used for fallocate) */
 #define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
 	/* Request will not result in inode size update (user for fallocate) */
 #define EXT4_GET_BLOCKS_KEEP_SIZE		0x0080
 	/* Do not take i_data_sem locking in ext4_map_blocks */
 #define EXT4_GET_BLOCKS_NO_LOCK			0x0100
+	/* Do not put hole in extent cache */
+#define EXT4_GET_BLOCKS_NO_PUT_HOLE		0x0200
 
 /*
  * Flags used by ext4_free_blocks
@@ -594,11 +571,6 @@ enum {
 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020
 
 /*
- * Flags used by ext4_discard_partial_page_buffers
- */
-#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED	0x0001
-
-/*
  * ioctl commands
  */
 #define	EXT4_IOC_GETFLAGS		FS_IOC_GETFLAGS
@@ -617,6 +589,7 @@ enum {
 #define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
 #define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
+#define EXT4_IOC_SWAP_BOOT		_IO('f', 17)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -810,17 +783,6 @@ do {									       \
 
 #endif /* defined(__KERNEL__) || defined(__linux__) */
 
-/*
- * storage for cached extent
- * If ec_len == 0, then the cache is invalid.
- * If ec_start == 0, then the cache represents a gap (null mapping)
- */
-struct ext4_ext_cache {
-	ext4_fsblk_t	ec_start;
-	ext4_lblk_t	ec_block;
-	__u32		ec_len; /* must be 32bit to return holes */
-};
-
 #include "extents_status.h"
 
 /*
@@ -887,7 +849,6 @@ struct ext4_inode_info {
 	struct inode vfs_inode;
 	struct jbd2_inode *jinode;
 
-	struct ext4_ext_cache i_cached_extent;
 	/*
 	 * File creation time. Its function is same as that of
 	 * struct timespec i_{a,c,m}time in the generic inode.
@@ -901,6 +862,9 @@ struct ext4_inode_info {
 	/* extents status tree */
 	struct ext4_es_tree i_es_tree;
 	rwlock_t i_es_lock;
+	struct list_head i_es_lru;
+	unsigned int i_es_lru_nr;	/* protected by i_es_lock */
+	unsigned long i_touch_when;	/* jiffies of last accessing */
 
 	/* ialloc */
 	ext4_group_t	i_last_alloc_group;
@@ -925,11 +889,22 @@ struct ext4_inode_info {
 	qsize_t i_reserved_quota;
 #endif
 
-	/* completed IOs that might need unwritten extents handling */
-	struct list_head i_completed_io_list;
+	/* Lock protecting lists below */
 	spinlock_t i_completed_io_lock;
+	/*
+	 * Completed IOs that need unwritten extents handling and have
+	 * transaction reserved
+	 */
+	struct list_head i_rsv_conversion_list;
+	/*
+	 * Completed IOs that need unwritten extents handling and don't have
+	 * transaction reserved
+	 */
+	struct list_head i_unrsv_conversion_list;
 	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 	atomic_t i_unwritten; /* Nr. of inflight conversions pending */
+	struct work_struct i_rsv_conversion_work;
+	struct work_struct i_unrsv_conversion_work;
 
 	spinlock_t i_block_reservation_lock;
 
@@ -959,7 +934,7 @@ struct ext4_inode_info {
 #define EXT2_FLAGS_TEST_FILESYS		0x0004	/* to test development code */
 
 /*
- * Mount flags
+ * Mount flags set via mount options or defaults
  */
 #define EXT4_MOUNT_GRPID		0x00004	/* Create files with directory's group */
 #define EXT4_MOUNT_DEBUG		0x00008	/* Some debugging messages */
@@ -985,15 +960,22 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
-#define EXT4_MOUNT_MBLK_IO_SUBMIT	0x4000000 /* multi-block io submits */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
 #define EXT4_MOUNT_DISCARD		0x40000000 /* Issue DISCARD requests */
 #define EXT4_MOUNT_INIT_INODE_TABLE	0x80000000 /* Initialize uninitialized itables */
 
+/*
+ * Mount flags set either automatically (could not be set by mount option)
+ * based on per file system feature or property or in special cases such as
+ * distinguishing between explicit mount option definition and default.
+ */
 #define EXT4_MOUNT2_EXPLICIT_DELALLOC	0x00000001 /* User explicitly
 						      specified delalloc */
+#define EXT4_MOUNT2_STD_GROUP_SIZE	0x00000002 /* We have standard group
+						      size of blocksize * 8
+						      blocks */
 
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
@@ -1190,6 +1172,7 @@ struct ext4_sb_info {
 	unsigned int s_mount_flags;
 	unsigned int s_def_mount_opt;
 	ext4_fsblk_t s_sb_block;
+	atomic64_t s_resv_clusters;
 	kuid_t s_resuid;
 	kgid_t s_resgid;
 	unsigned short s_mount_state;
@@ -1258,7 +1241,6 @@ struct ext4_sb_info {
 	unsigned int s_mb_stats;
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
-	unsigned int s_max_writeback_mb_bump;
 	unsigned int s_max_dir_size_kb;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
@@ -1294,8 +1276,10 @@ struct ext4_sb_info {
 	struct flex_groups *s_flex_groups;
 	ext4_group_t s_flex_groups_allocated;
 
-	/* workqueue for dio unwritten */
-	struct workqueue_struct *dio_unwritten_wq;
+	/* workqueue for unreserved extent convertions (dio) */
+	struct workqueue_struct *unrsv_conversion_wq;
+	/* workqueue for reserved extent conversions (buffered io) */
+	struct workqueue_struct *rsv_conversion_wq;
 
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
@@ -1316,6 +1300,13 @@ struct ext4_sb_info {
 
 	/* Precomputed FS UUID checksum for seeding other checksums */
 	__u32 s_csum_seed;
+
+	/* Reclaim extents from extent status tree */
+	struct shrinker s_es_shrinker;
+	struct list_head s_es_lru;
+	unsigned long s_es_last_sorted;
+	struct percpu_counter s_extent_cache_cnt;
+	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1338,6 +1329,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 	return ino == EXT4_ROOT_INO ||
 		ino == EXT4_USR_QUOTA_INO ||
 		ino == EXT4_GRP_QUOTA_INO ||
+		ino == EXT4_BOOT_LOADER_INO ||
 		ino == EXT4_JOURNAL_INO ||
 		ino == EXT4_RESIZE_INO ||
 		(ino >= EXT4_FIRST_INO(sb) &&
@@ -1348,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
 					      struct ext4_io_end *io_end)
 {
 	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		/* Writeback has to have coversion transaction reserved */
+		WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
+			!(io_end->flag & EXT4_IO_END_DIRECT));
 		io_end->flag |= EXT4_IO_END_UNWRITTEN;
 		atomic_inc(&EXT4_I(inode)->i_unwritten);
 	}
@@ -1379,6 +1374,7 @@ enum {
 	EXT4_STATE_DIOREAD_LOCK,	/* Disable support for dio read
 					   nolocking */
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
+	EXT4_STATE_ORDERED_MODE,	/* data=ordered mode */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)				\
@@ -1789,9 +1785,6 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
  */
 #define ERR_BAD_DX_DIR	-75000
 
-void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
-			ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
-
 /*
  * Timeout and state flag for lazy initialization inode thread.
  */
@@ -1913,6 +1906,13 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
 				  struct buffer_head *bh);
 
 /* balloc.c */
+extern void ext4_get_group_no_and_offset(struct super_block *sb,
+					 ext4_fsblk_t blocknr,
+					 ext4_group_t *blockgrpp,
+					 ext4_grpblk_t *offsetp);
+extern ext4_group_t ext4_get_group_number(struct super_block *sb,
+					  ext4_fsblk_t block);
+
 extern void ext4_validate_block_bitmap(struct super_block *sb,
 				       struct ext4_group_desc *desc,
 				       unsigned int block_group,
@@ -2000,16 +2000,26 @@ static inline  unsigned char get_dtype(struct super_block *sb, int filetype)
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_unwritten_io(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
 			  dx_hash_info *hinfo);
 
 /* ialloc.c */
-extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t,
-				    const struct qstr *qstr, __u32 goal,
-				    uid_t *owner);
+extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
+				      const struct qstr *qstr, __u32 goal,
+				      uid_t *owner, int handle_type,
+				      unsigned int line_no, int nblocks);
+
+#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \
+	__ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \
+			 0, 0, 0)
+#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \
+				    type, nblocks)		    \
+	__ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \
+			 (type), __LINE__, (nblocks))
+
+
 extern void ext4_free_inode(handle_t *, struct inode *);
 extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
@@ -2078,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
-extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -2086,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_discard_partial_page_buffers(handle_t *handle,
-		struct address_space *mapping, loff_t from,
-		loff_t length, int flags);
+extern int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+			     loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -2101,8 +2114,10 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 				const struct iovec *iov, loff_t offset,
 				unsigned long nr_segs);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
-extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
-extern void ext4_ind_truncate(struct inode *inode);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
+extern void ext4_ind_truncate(handle_t *, struct inode *inode);
+extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+				 ext4_lblk_t first, ext4_lblk_t stop);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -2110,6 +2125,7 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
+extern int ext4_ind_migrate(struct inode *inode);
 
 /* namei.c */
 extern int ext4_dirent_csum_verify(struct inode *inode,
@@ -2151,42 +2167,98 @@ extern void *ext4_kvzalloc(size_t size, gfp_t flags);
 extern void ext4_kvfree(void *ptr);
 extern int ext4_alloc_flex_bg_array(struct super_block *sb,
 				    ext4_group_t ngroup);
+extern const char *ext4_decode_error(struct super_block *sb, int errno,
+				     char nbuf[16]);
+
 extern __printf(4, 5)
 void __ext4_error(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_error(sb, message...)	__ext4_error(sb, __func__,	\
-						     __LINE__, ## message)
 extern __printf(5, 6)
-void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
 		      const char *, ...);
 extern __printf(5, 6)
-void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
 		     const char *, ...);
 extern void __ext4_std_error(struct super_block *, const char *,
 			     unsigned int, int);
 extern __printf(4, 5)
 void __ext4_abort(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_abort(sb, message...)	__ext4_abort(sb, __func__, \
-						       __LINE__, ## message)
 extern __printf(4, 5)
 void __ext4_warning(struct super_block *, const char *, unsigned int,
 		    const char *, ...);
-#define ext4_warning(sb, message...)	__ext4_warning(sb, __func__, \
-						       __LINE__, ## message)
 extern __printf(3, 4)
-void ext4_msg(struct super_block *, const char *, const char *, ...);
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
 extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
 			   const char *, unsigned int, const char *);
-#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
-						       __LINE__, msg)
 extern __printf(7, 8)
 void __ext4_grp_locked_error(const char *, unsigned int,
 			     struct super_block *, ext4_group_t,
 			     unsigned long, ext4_fsblk_t,
 			     const char *, ...);
-#define ext4_grp_locked_error(sb, grp, message...) \
-	__ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+	__ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+	__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...)					\
+	__ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...)					\
+	__ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...)					\
+	__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...)				\
+	__ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+	__ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+				fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_inode(inode, "", 0, block, " ");			\
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_file(file, "", 0, block, " ");			\
+} while (0)
+#define ext4_error(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error(sb, "", 0, " ");					\
+} while (0)
+#define ext4_abort(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_abort(sb, "", 0, " ");					\
+} while (0)
+#define ext4_warning(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_warning(sb, "", 0, " ");					\
+} while (0)
+#define ext4_msg(sb, level, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_msg(sb, "", " ");					\
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);				\
+	__ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");	\
+} while (0)
+
+#endif
+
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 					__u32 compat);
@@ -2227,6 +2299,8 @@ extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
 				       struct ext4_group_desc *gdp);
 extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
 				     struct ext4_group_desc *gdp);
+extern int ext4_register_li_request(struct super_block *sb,
+				    ext4_group_t first_not_zeroed);
 
 static inline int ext4_has_group_desc_csum(struct super_block *sb)
 {
@@ -2295,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 {
 	 struct ext4_group_info ***grp_info;
 	 long indexv, indexh;
+	 BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
 	 grp_info = EXT4_SB(sb)->s_group_info;
 	 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
 	 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
@@ -2454,6 +2529,80 @@ extern const struct file_operations ext4_file_operations;
 extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
 extern void ext4_unwritten_wait(struct inode *inode);
 
+/* inline.c */
+extern int ext4_has_inline_data(struct inode *inode);
+extern int ext4_get_inline_size(struct inode *inode);
+extern int ext4_get_max_inline_size(struct inode *inode);
+extern int ext4_find_inline_data_nolock(struct inode *inode);
+extern void ext4_write_inline_data(struct inode *inode,
+				   struct ext4_iloc *iloc,
+				   void *buffer, loff_t pos,
+				   unsigned int len);
+extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
+				    unsigned int len);
+extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
+				 unsigned int len);
+extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
+
+extern int ext4_readpage_inline(struct inode *inode, struct page *page);
+extern int ext4_try_to_write_inline_data(struct address_space *mapping,
+					 struct inode *inode,
+					 loff_t pos, unsigned len,
+					 unsigned flags,
+					 struct page **pagep);
+extern int ext4_write_inline_data_end(struct inode *inode,
+				      loff_t pos, unsigned len,
+				      unsigned copied,
+				      struct page *page);
+extern struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+				  unsigned len,
+				  struct page *page);
+extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
+					   struct inode *inode,
+					   loff_t pos, unsigned len,
+					   unsigned flags,
+					   struct page **pagep,
+					   void **fsdata);
+extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+					 unsigned len, unsigned copied,
+					 struct page *page);
+extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+				     struct inode *inode);
+extern int ext4_try_create_inline_dir(handle_t *handle,
+				      struct inode *parent,
+				      struct inode *inode);
+extern int ext4_read_inline_dir(struct file *filp,
+				struct dir_context *ctx,
+				int *has_inline_data);
+extern int htree_inlinedir_to_tree(struct file *dir_file,
+				   struct inode *dir, ext4_lblk_t block,
+				   struct dx_hash_info *hinfo,
+				   __u32 start_hash, __u32 start_minor_hash,
+				   int *has_inline_data);
+extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+					const struct qstr *d_name,
+					struct ext4_dir_entry_2 **res_dir,
+					int *has_inline_data);
+extern int ext4_delete_inline_entry(handle_t *handle,
+				    struct inode *dir,
+				    struct ext4_dir_entry_2 *de_del,
+				    struct buffer_head *bh,
+				    int *has_inline_data);
+extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
+extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+					struct ext4_dir_entry_2 **parent_de,
+					int *retval);
+extern int ext4_inline_data_fiemap(struct inode *inode,
+				   struct fiemap_extent_info *fieinfo,
+				   int *has_inline);
+extern int ext4_try_to_evict_inline_data(handle_t *handle,
+					 struct inode *inode,
+					 int needed);
+extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
+
+extern int ext4_convert_inline_data(struct inode *inode);
+
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
 extern const struct inode_operations ext4_special_inode_operations;
@@ -2467,6 +2616,24 @@ extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
 extern int ext4_handle_dirty_dirent_node(handle_t *handle,
 					 struct inode *inode,
 					 struct buffer_head *bh);
+#define S_SHIFT 12
+static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= EXT4_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= EXT4_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= EXT4_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= EXT4_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= EXT4_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= EXT4_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= EXT4_FT_SYMLINK,
+};
+
+static inline void ext4_set_de_type(struct super_block *sb,
+				struct ext4_dir_entry_2 *de,
+				umode_t mode) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+		de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
 
 /* symlink.c */
 extern const struct inode_operations ext4_symlink_inode_operations;
@@ -2489,19 +2656,18 @@ struct ext4_extent;
 
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
-				       int chunk);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
-extern void ext4_ext_truncate(struct inode *);
-extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
-				loff_t length);
+extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				 ext4_lblk_t end);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
 			  loff_t len);
-extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-			  ssize_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+					  loff_t offset, ssize_t len);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map, int flags);
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -2520,23 +2686,36 @@ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
 						  struct ext4_ext_path *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_range(struct inode *inode,
+				    ext4_lblk_t lblk_start,
+				    ext4_lblk_t lblk_end);
 extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 
 
 /* move_extent.c */
+extern void ext4_double_down_write_data_sem(struct inode *first,
+					    struct inode *second);
+extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
+					  struct inode *donor_inode);
+void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2);
+void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
 			     __u64 len, __u64 *moved_len);
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
-extern void ext4_ioend_wait(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+				struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
+extern void ext4_end_io_unrsv_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			       struct page *page,
@@ -2549,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
 extern int ext4_mmp_csum_verify(struct super_block *sb,
 				struct mmp_struct *mmp);
 
-/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
-	  = BH_JBDPrivateStart,
+	 = BH_JBDPrivateStart,
 	BH_AllocFromCluster,	/* allocated blocks were part of already
-				 * allocated cluster. Note that this flag will
-				 * never, ever appear in a buffer_head's state
-				 * flag. See EXT4_MAP_FROM_CLUSTER to see where
-				 * this is used. */
+				 * allocated cluster. */
 };
 
-BUFFER_FNS(Uninit, uninit)
-TAS_BUFFER_FNS(Uninit, uninit)
-
 /*
  * Add new method to test whether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 487fda12bc0..51bc821ade9 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -193,12 +193,6 @@ static inline unsigned short ext_depth(struct inode *inode)
 	return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
 }
 
-static inline void
-ext4_ext_invalidate_cache(struct inode *inode)
-{
-	EXT4_I(inode)->i_cached_extent.ec_len = 0;
-}
-
 static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
 {
 	/* We can not have an uninitialized extent of zero length! */
@@ -276,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
 				     0xffff);
 }
 
+#define ext4_ext_dirty(handle, inode, path) \
+		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+		     struct inode *inode, struct ext4_ext_path *path);
+
 #endif /* _EXT4_EXTENTS */
 
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b4323ba846b..72a3600aedb 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,11 +6,153 @@
 
 #include <trace/events/ext4.h>
 
+/* Just increment the non-pointer handle value */
+static handle_t *ext4_get_nojournal(void)
+{
+	handle_t *handle = current->journal_info;
+	unsigned long ref_cnt = (unsigned long)handle;
+
+	BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
+
+	ref_cnt++;
+	handle = (handle_t *)ref_cnt;
+
+	current->journal_info = handle;
+	return handle;
+}
+
+
+/* Decrement the non-pointer handle value */
+static void ext4_put_nojournal(handle_t *handle)
+{
+	unsigned long ref_cnt = (unsigned long)handle;
+
+	BUG_ON(ref_cnt == 0);
+
+	ref_cnt--;
+	handle = (handle_t *)ref_cnt;
+
+	current->journal_info = handle;
+}
+
+/*
+ * Wrappers for jbd2_journal_start/end.
+ */
+static int ext4_journal_check_start(struct super_block *sb)
+{
+	journal_t *journal;
+
+	might_sleep();
+	if (sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
+	journal = EXT4_SB(sb)->s_journal;
+	/*
+	 * Special case here: if the journal has aborted behind our
+	 * backs (eg. EIO in the commit thread), then we still need to
+	 * take the FS itself readonly cleanly.
+	 */
+	if (journal && is_journal_aborted(journal)) {
+		ext4_abort(sb, "Detected aborted journal");
+		return -EROFS;
+	}
+	return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+				  int type, int blocks, int rsv_blocks)
+{
+	journal_t *journal;
+	int err;
+
+	trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	journal = EXT4_SB(sb)->s_journal;
+	if (!journal)
+		return ext4_get_nojournal();
+	return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+				   type, line);
+}
+
+int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
+{
+	struct super_block *sb;
+	int err;
+	int rc;
+
+	if (!ext4_handle_valid(handle)) {
+		ext4_put_nojournal(handle);
+		return 0;
+	}
+	sb = handle->h_transaction->t_journal->j_private;
+	err = handle->h_err;
+	rc = jbd2_journal_stop(handle);
+
+	if (!err)
+		err = rc;
+	if (err)
+		__ext4_std_error(sb, where, line, err);
+	return err;
+}
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type)
+{
+	struct super_block *sb;
+	int err;
+
+	if (!ext4_handle_valid(handle))
+		return ext4_get_nojournal();
+
+	sb = handle->h_journal->j_private;
+	trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+					  _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0) {
+		jbd2_journal_free_reserved(handle);
+		return ERR_PTR(err);
+	}
+
+	err = jbd2_journal_start_reserved(handle, type, line);
+	if (err < 0)
+		return ERR_PTR(err);
+	return handle;
+}
+
+void ext4_journal_abort_handle(const char *caller, unsigned int line,
+			       const char *err_fn, struct buffer_head *bh,
+			       handle_t *handle, int err)
+{
+	char nbuf[16];
+	const char *errstr = ext4_decode_error(NULL, err, nbuf);
+
+	BUG_ON(!ext4_handle_valid(handle));
+
+	if (bh)
+		BUFFER_TRACE(bh, "abort");
+
+	if (!handle->h_err)
+		handle->h_err = err;
+
+	if (is_handle_aborted(handle))
+		return;
+
+	printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
+	       caller, line, errstr, err_fn);
+
+	jbd2_journal_abort_handle(handle);
+}
+
 int __ext4_journal_get_write_access(const char *where, unsigned int line,
 				    handle_t *handle, struct buffer_head *bh)
 {
 	int err = 0;
 
+	might_sleep();
+
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_get_write_access(handle, bh);
 		if (err)
@@ -107,6 +249,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 {
 	int err = 0;
 
+	might_sleep();
+
+	set_buffer_meta(bh);
+	set_buffer_prio(bh);
 	if (ext4_handle_valid(handle)) {
 		err = jbd2_journal_dirty_metadata(handle, bh);
 		if (err) {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 7177f9b21cb..2877258d949 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -29,11 +29,13 @@
  * block to complete the transaction.
  *
  * For extents-enabled fs we may have to allocate and modify up to
- * 5 levels of tree + root which are stored in the inode. */
+ * 5 levels of tree, data block (for each of these we need bitmap + group
+ * summaries), root which is stored in the inode, sb
+ */
 
 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)				\
 	(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)   \
-	 ? 27U : 8U)
+	 ? 20U : 8U)
 
 /* Extended attribute operations touch at most two data buffers,
  * two bitmap buffers, and two group summaries, in addition to the inode
@@ -59,12 +61,6 @@
 #define EXT4_META_TRANS_BLOCKS(sb)	(EXT4_XATTR_TRANS_BLOCKS + \
 					EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
 
-/* Delete operations potentially hit one directory's namespace plus an
- * entire inode, plus arbitrary amounts of bitmap/indirection data.  Be
- * generous.  We can grow the delete transaction later if necessary. */
-
-#define EXT4_DELETE_TRANS_BLOCKS(sb)	(2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64)
-
 /* Define an arbitrary limit for the amount of data we will anticipate
  * writing to any given transaction.  For unbounded transactions such as
  * write(2) and truncate(2) we can write more than this, but we always
@@ -110,6 +106,37 @@
 #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
 #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
 
+static inline int ext4_jbd2_credits_xattr(struct inode *inode)
+{
+	int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+
+	/*
+	 * In case of inline data, we may push out the data to a block,
+	 * so we need to reserve credits for this eventuality
+	 */
+	if (ext4_has_inline_data(inode))
+		credits += ext4_writepage_trans_blocks(inode) + 1;
+	return credits;
+}
+
+
+/*
+ * Ext4 handle operation types -- for logging purposes
+ */
+#define EXT4_HT_MISC             0
+#define EXT4_HT_INODE            1
+#define EXT4_HT_WRITE_PAGE       2
+#define EXT4_HT_MAP_BLOCKS       3
+#define EXT4_HT_DIR              4
+#define EXT4_HT_TRUNCATE         5
+#define EXT4_HT_QUOTA            6
+#define EXT4_HT_RESIZE           7
+#define EXT4_HT_MIGRATE          8
+#define EXT4_HT_MOVE_EXTENTS     9
+#define EXT4_HT_XATTR           10
+#define EXT4_HT_EXT_CONVERT     11
+#define EXT4_HT_MAX             12
+
 /**
  *   struct ext4_journal_cb_entry - Base structure for callback information.
  *
@@ -170,16 +197,20 @@ static inline void ext4_journal_callback_add(handle_t *handle,
  * ext4_journal_callback_del: delete a registered callback
  * @handle: active journal transaction handle on which callback was registered
  * @jce: registered journal callback entry to unregister
+ * Return true if object was sucessfully removed
  */
-static inline void ext4_journal_callback_del(handle_t *handle,
+static inline bool ext4_journal_callback_try_del(handle_t *handle,
 					     struct ext4_journal_cb_entry *jce)
 {
+	bool deleted;
 	struct ext4_sb_info *sbi =
 			EXT4_SB(handle->h_transaction->t_journal->j_private);
 
 	spin_lock(&sbi->s_md_lock);
+	deleted = !list_empty(&jce->jce_list);
 	list_del_init(&jce->jce_list);
 	spin_unlock(&sbi->s_md_lock);
+	return deleted;
 }
 
 int
@@ -234,7 +265,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
 #define ext4_handle_dirty_super(handle, sb) \
 	__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
 
-handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+				  int type, int blocks, int rsv_blocks);
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
 
 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -268,14 +300,38 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
 	return 1;
 }
 
-static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
+#define ext4_journal_start_sb(sb, type, nblocks)			\
+	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start(inode, type, nblocks)			\
+	__ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+	__ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
+
+static inline handle_t *__ext4_journal_start(struct inode *inode,
+					     unsigned int line, int type,
+					     int blocks, int rsv_blocks)
 {
-	return ext4_journal_start_sb(inode->i_sb, nblocks);
+	return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+				       rsv_blocks);
 }
 
 #define ext4_journal_stop(handle) \
 	__ext4_journal_stop(__func__, __LINE__, (handle))
 
+#define ext4_journal_start_reserved(handle, type) \
+	__ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		jbd2_journal_free_reserved(handle);
+}
+
 static inline handle_t *ext4_journal_current_handle(void)
 {
 	return journal_current_handle();
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5ae1674ec12..7097b0f680e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -112,7 +112,7 @@ static int ext4_split_extent_at(handle_t *handle,
 			     int flags);
 
 static int ext4_find_delayed_extent(struct inode *inode,
-				    struct ext4_ext_cache *newex);
+				    struct extent_status *newes);
 
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
@@ -157,11 +157,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
  *  - ENOMEM
  *  - EIO
  */
-#define ext4_ext_dirty(handle, inode, path) \
-		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
-static int __ext4_ext_dirty(const char *where, unsigned int line,
-			    handle_t *handle, struct inode *inode,
-			    struct ext4_ext_path *path)
+int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
+		     struct inode *inode, struct ext4_ext_path *path)
 {
 	int err;
 	if (path->p_bh) {
@@ -714,7 +711,6 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 	eh->eh_magic = EXT4_EXT_MAGIC;
 	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
 	ext4_mark_inode_dirty(handle, inode);
-	ext4_ext_invalidate_cache(inode);
 	return 0;
 }
 
@@ -725,6 +721,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 	struct ext4_extent_header *eh;
 	struct buffer_head *bh;
 	short int depth, i, ppos = 0, alloc = 0;
+	int ret;
 
 	eh = ext_inode_hdr(inode);
 	depth = ext_depth(inode);
@@ -752,12 +749,15 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		path[ppos].p_ext = NULL;
 
 		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
-		if (unlikely(!bh))
+		if (unlikely(!bh)) {
+			ret = -ENOMEM;
 			goto err;
+		}
 		if (!bh_uptodate_or_lock(bh)) {
 			trace_ext4_ext_load_extent(inode, block,
 						path[ppos].p_block);
-			if (bh_submit_read(bh) < 0) {
+			ret = bh_submit_read(bh);
+			if (ret < 0) {
 				put_bh(bh);
 				goto err;
 			}
@@ -768,13 +768,15 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 			put_bh(bh);
 			EXT4_ERROR_INODE(inode,
 					 "ppos %d > depth %d", ppos, depth);
+			ret = -EIO;
 			goto err;
 		}
 		path[ppos].p_bh = bh;
 		path[ppos].p_hdr = eh;
 		i--;
 
-		if (ext4_ext_check_block(inode, eh, i, bh))
+		ret = ext4_ext_check_block(inode, eh, i, bh);
+		if (ret < 0)
 			goto err;
 	}
 
@@ -796,7 +798,7 @@ err:
 	ext4_ext_drop_refs(path);
 	if (alloc)
 		kfree(path);
-	return ERR_PTR(-EIO);
+	return ERR_PTR(ret);
 }
 
 /*
@@ -950,8 +952,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		goto cleanup;
 	}
 	bh = sb_getblk(inode->i_sb, newblock);
-	if (!bh) {
-		err = -EIO;
+	if (unlikely(!bh)) {
+		err = -ENOMEM;
 		goto cleanup;
 	}
 	lock_buffer(bh);
@@ -1023,8 +1025,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		oldblock = newblock;
 		newblock = ablocks[--a];
 		bh = sb_getblk(inode->i_sb, newblock);
-		if (!bh) {
-			err = -EIO;
+		if (unlikely(!bh)) {
+			err = -ENOMEM;
 			goto cleanup;
 		}
 		lock_buffer(bh);
@@ -1136,11 +1138,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 		return err;
 
 	bh = sb_getblk(inode->i_sb, newblock);
-	if (!bh) {
-		err = -EIO;
-		ext4_std_error(inode->i_sb, err);
-		return err;
-	}
+	if (unlikely(!bh))
+		return -ENOMEM;
 	lock_buffer(bh);
 
 	err = ext4_journal_get_create_access(handle, bh);
@@ -1582,10 +1581,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 	unsigned short ext1_ee_len, ext2_ee_len, max_len;
 
 	/*
-	 * Make sure that either both extents are uninitialized, or
-	 * both are _not_.
+	 * Make sure that both extents are initialized. We don't merge
+	 * uninitialized extents so that we can be sure that end_io code has
+	 * the extent that was written properly split out and conversion to
+	 * initialized is trivial.
 	 */
-	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
+	if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2))
 		return 0;
 
 	if (ext4_ext_is_uninitialized(ex1))
@@ -1809,39 +1810,101 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	}
 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
+	eh = path[depth].p_hdr;
 	if (unlikely(path[depth].p_hdr == NULL)) {
 		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
 		return -EIO;
 	}
 
 	/* try to insert block into found extent and return */
-	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
-		&& ext4_can_extents_be_merged(inode, ex, newext)) {
-		ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
-			  ext4_ext_is_uninitialized(newext),
-			  ext4_ext_get_actual_len(newext),
-			  le32_to_cpu(ex->ee_block),
-			  ext4_ext_is_uninitialized(ex),
-			  ext4_ext_get_actual_len(ex),
-			  ext4_ext_pblock(ex));
-		err = ext4_ext_get_access(handle, inode, path + depth);
-		if (err)
-			return err;
+	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)) {
 
 		/*
-		 * ext4_can_extents_be_merged should have checked that either
-		 * both extents are uninitialized, or both aren't. Thus we
-		 * need to check only one of them here.
+		 * Try to see whether we should rather test the extent on
+		 * right from ex, or from the left of ex. This is because
+		 * ext4_ext_find_extent() can return either extent on the
+		 * left, or on the right from the searched position. This
+		 * will make merging more effective.
 		 */
-		if (ext4_ext_is_uninitialized(ex))
-			uninitialized = 1;
-		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+		if (ex < EXT_LAST_EXTENT(eh) &&
+		    (le32_to_cpu(ex->ee_block) +
+		    ext4_ext_get_actual_len(ex) <
+		    le32_to_cpu(newext->ee_block))) {
+			ex += 1;
+			goto prepend;
+		} else if ((ex > EXT_FIRST_EXTENT(eh)) &&
+			   (le32_to_cpu(newext->ee_block) +
+			   ext4_ext_get_actual_len(newext) <
+			   le32_to_cpu(ex->ee_block)))
+			ex -= 1;
+
+		/* Try to append newex to the ex */
+		if (ext4_can_extents_be_merged(inode, ex, newext)) {
+			ext_debug("append [%d]%d block to %u:[%d]%d"
+				  "(from %llu)\n",
+				  ext4_ext_is_uninitialized(newext),
+				  ext4_ext_get_actual_len(newext),
+				  le32_to_cpu(ex->ee_block),
+				  ext4_ext_is_uninitialized(ex),
+				  ext4_ext_get_actual_len(ex),
+				  ext4_ext_pblock(ex));
+			err = ext4_ext_get_access(handle, inode,
+						  path + depth);
+			if (err)
+				return err;
+
+			/*
+			 * ext4_can_extents_be_merged should have checked
+			 * that either both extents are uninitialized, or
+			 * both aren't. Thus we need to check only one of
+			 * them here.
+			 */
+			if (ext4_ext_is_uninitialized(ex))
+				uninitialized = 1;
+			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
 					+ ext4_ext_get_actual_len(newext));
-		if (uninitialized)
-			ext4_ext_mark_uninitialized(ex);
-		eh = path[depth].p_hdr;
-		nearex = ex;
-		goto merge;
+			if (uninitialized)
+				ext4_ext_mark_uninitialized(ex);
+			eh = path[depth].p_hdr;
+			nearex = ex;
+			goto merge;
+		}
+
+prepend:
+		/* Try to prepend newex to the ex */
+		if (ext4_can_extents_be_merged(inode, newext, ex)) {
+			ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
+				  "(from %llu)\n",
+				  le32_to_cpu(newext->ee_block),
+				  ext4_ext_is_uninitialized(newext),
+				  ext4_ext_get_actual_len(newext),
+				  le32_to_cpu(ex->ee_block),
+				  ext4_ext_is_uninitialized(ex),
+				  ext4_ext_get_actual_len(ex),
+				  ext4_ext_pblock(ex));
+			err = ext4_ext_get_access(handle, inode,
+						  path + depth);
+			if (err)
+				return err;
+
+			/*
+			 * ext4_can_extents_be_merged should have checked
+			 * that either both extents are uninitialized, or
+			 * both aren't. Thus we need to check only one of
+			 * them here.
+			 */
+			if (ext4_ext_is_uninitialized(ex))
+				uninitialized = 1;
+			ex->ee_block = newext->ee_block;
+			ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
+			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+					+ ext4_ext_get_actual_len(newext));
+			if (uninitialized)
+				ext4_ext_mark_uninitialized(ex);
+			eh = path[depth].p_hdr;
+			nearex = ex;
+			goto merge;
+		}
 	}
 
 	depth = ext_depth(inode);
@@ -1876,8 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	 * There is no free space in the found leaf.
 	 * We're gonna add a new leaf in the tree.
 	 */
-	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
-		flags = EXT4_MB_USE_ROOT_BLOCKS;
+	if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+		flags = EXT4_MB_USE_RESERVED;
 	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
 	if (err)
 		goto cleanup;
@@ -1960,7 +2023,6 @@ cleanup:
 		ext4_ext_drop_refs(npath);
 		kfree(npath);
 	}
-	ext4_ext_invalidate_cache(inode);
 	return err;
 }
 
@@ -1969,8 +2031,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 				    struct fiemap_extent_info *fieinfo)
 {
 	struct ext4_ext_path *path = NULL;
-	struct ext4_ext_cache newex;
 	struct ext4_extent *ex;
+	struct extent_status es;
 	ext4_lblk_t next, next_del, start = 0, end = 0;
 	ext4_lblk_t last = block + num;
 	int exists, depth = 0, err = 0;
@@ -2044,37 +2106,48 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 		BUG_ON(end <= start);
 
 		if (!exists) {
-			newex.ec_block = start;
-			newex.ec_len = end - start;
-			newex.ec_start = 0;
+			es.es_lblk = start;
+			es.es_len = end - start;
+			es.es_pblk = 0;
 		} else {
-			newex.ec_block = le32_to_cpu(ex->ee_block);
-			newex.ec_len = ext4_ext_get_actual_len(ex);
-			newex.ec_start = ext4_ext_pblock(ex);
+			es.es_lblk = le32_to_cpu(ex->ee_block);
+			es.es_len = ext4_ext_get_actual_len(ex);
+			es.es_pblk = ext4_ext_pblock(ex);
 			if (ext4_ext_is_uninitialized(ex))
 				flags |= FIEMAP_EXTENT_UNWRITTEN;
 		}
 
 		/*
-		 * Find delayed extent and update newex accordingly. We call
-		 * it even in !exists case to find out whether newex is the
+		 * Find delayed extent and update es accordingly. We call
+		 * it even in !exists case to find out whether es is the
 		 * last existing extent or not.
 		 */
-		next_del = ext4_find_delayed_extent(inode, &newex);
+		next_del = ext4_find_delayed_extent(inode, &es);
 		if (!exists && next_del) {
 			exists = 1;
-			flags |= FIEMAP_EXTENT_DELALLOC;
+			flags |= (FIEMAP_EXTENT_DELALLOC |
+				  FIEMAP_EXTENT_UNKNOWN);
 		}
 		up_read(&EXT4_I(inode)->i_data_sem);
 
-		if (unlikely(newex.ec_len == 0)) {
-			EXT4_ERROR_INODE(inode, "newex.ec_len == 0");
+		if (unlikely(es.es_len == 0)) {
+			EXT4_ERROR_INODE(inode, "es.es_len == 0");
 			err = -EIO;
 			break;
 		}
 
-		/* This is possible iff next == next_del == EXT_MAX_BLOCKS */
-		if (next == next_del) {
+		/*
+		 * This is possible iff next == next_del == EXT_MAX_BLOCKS.
+		 * we need to check next == EXT_MAX_BLOCKS because it is
+		 * possible that an extent is with unwritten and delayed
+		 * status due to when an extent is delayed allocated and
+		 * is allocated by fallocate status tree will track both of
+		 * them in a extent.
+		 *
+		 * So we could return a unwritten and delayed extent, and
+		 * its block is equal to 'next'.
+		 */
+		if (next == next_del && next == EXT_MAX_BLOCKS) {
 			flags |= FIEMAP_EXTENT_LAST;
 			if (unlikely(next_del != EXT_MAX_BLOCKS ||
 				     next != EXT_MAX_BLOCKS)) {
@@ -2089,9 +2162,9 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 
 		if (exists) {
 			err = fiemap_fill_next_extent(fieinfo,
-				(__u64)newex.ec_block << blksize_bits,
-				(__u64)newex.ec_start << blksize_bits,
-				(__u64)newex.ec_len << blksize_bits,
+				(__u64)es.es_lblk << blksize_bits,
+				(__u64)es.es_pblk << blksize_bits,
+				(__u64)es.es_len << blksize_bits,
 				flags);
 			if (err < 0)
 				break;
@@ -2101,7 +2174,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 			}
 		}
 
-		block = newex.ec_block + newex.ec_len;
+		block = es.es_lblk + es.es_len;
 	}
 
 	if (path) {
@@ -2112,21 +2185,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 	return err;
 }
 
-static void
-ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
-			__u32 len, ext4_fsblk_t start)
-{
-	struct ext4_ext_cache *cex;
-	BUG_ON(len == 0);
-	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-	trace_ext4_ext_put_in_cache(inode, block, len, start);
-	cex = &EXT4_I(inode)->i_cached_extent;
-	cex->ec_block = block;
-	cex->ec_len = len;
-	cex->ec_start = start;
-	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-}
-
 /*
  * ext4_ext_put_gap_in_cache:
  * calculate boundaries of the gap that the requested block fits into
@@ -2143,9 +2201,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 
 	ex = path[depth].p_ext;
 	if (ex == NULL) {
-		/* there is no extent yet, so gap is [0;-] */
-		lblock = 0;
-		len = EXT_MAX_BLOCKS;
+		/*
+		 * there is no extent yet, so gap is [0;-] and we
+		 * don't cache it
+		 */
 		ext_debug("cache gap(whole file):");
 	} else if (block < le32_to_cpu(ex->ee_block)) {
 		lblock = block;
@@ -2154,6 +2213,9 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 				block,
 				le32_to_cpu(ex->ee_block),
 				 ext4_ext_get_actual_len(ex));
+		if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
+			ext4_es_insert_extent(inode, lblock, len, ~0,
+					      EXTENT_STATUS_HOLE);
 	} else if (block >= le32_to_cpu(ex->ee_block)
 			+ ext4_ext_get_actual_len(ex)) {
 		ext4_lblk_t next;
@@ -2167,58 +2229,15 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 				block);
 		BUG_ON(next == lblock);
 		len = next - lblock;
+		if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1))
+			ext4_es_insert_extent(inode, lblock, len, ~0,
+					      EXTENT_STATUS_HOLE);
 	} else {
 		lblock = len = 0;
 		BUG();
 	}
 
 	ext_debug(" -> %u:%lu\n", lblock, len);
-	ext4_ext_put_in_cache(inode, lblock, len, 0);
-}
-
-/*
- * ext4_ext_in_cache()
- * Checks to see if the given block is in the cache.
- * If it is, the cached extent is stored in the given
- * cache extent pointer.
- *
- * @inode: The files inode
- * @block: The block to look for in the cache
- * @ex:    Pointer where the cached extent will be stored
- *         if it contains block
- *
- * Return 0 if cache is invalid; 1 if the cache is valid
- */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
-		  struct ext4_extent *ex)
-{
-	struct ext4_ext_cache *cex;
-	int ret = 0;
-
-	/*
-	 * We borrow i_block_reservation_lock to protect i_cached_extent
-	 */
-	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-	cex = &EXT4_I(inode)->i_cached_extent;
-
-	/* has cache valid data? */
-	if (cex->ec_len == 0)
-		goto errout;
-
-	if (in_range(block, cex->ec_block, cex->ec_len)) {
-		ex->ee_block = cpu_to_le32(cex->ec_block);
-		ext4_ext_store_pblock(ex, cex->ec_start);
-		ex->ee_len = cpu_to_le16(cex->ec_len);
-		ext_debug("%u cached by %u:%u:%llu\n",
-				block,
-				cex->ec_block, cex->ec_len, cex->ec_start);
-		ret = 1;
-	}
-errout:
-	trace_ext4_ext_in_cache(inode, block, ret);
-	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-	return ret;
 }
 
 /*
@@ -2310,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
 }
 
 /*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
  *
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worse case, each tree level
+ * index/leaf need to be changed in case of the tree split.
  *
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
  */
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
 {
 	int index;
 	int depth;
@@ -2331,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 
 	depth = ext_depth(inode);
 
-	if (chunk)
+	if (extents <= 1)
 		index = depth * 2;
 	else
 		index = depth * 3;
@@ -2339,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 	return index;
 }
 
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+	else if (ext4_should_journal_data(inode))
+		return EXT4_FREE_BLOCKS_FORGET;
+	return 0;
+}
+
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 			      struct ext4_extent *ex,
-			      ext4_fsblk_t *partial_cluster,
+			      long long *partial_cluster,
 			      ext4_lblk_t from, ext4_lblk_t to)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
 	ext4_fsblk_t pblk;
-	int flags = 0;
-
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
-	else if (ext4_should_journal_data(inode))
-		flags |= EXT4_FREE_BLOCKS_FORGET;
+	int flags = get_default_free_blocks_flags(inode);
 
 	/*
 	 * For bigalloc file systems, we never free a partial cluster
@@ -2370,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	 * partial cluster here.
 	 */
 	pblk = ext4_ext_pblock(ex) + ee_len - 1;
-	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+	if ((*partial_cluster > 0) &&
+	    (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
 				 sbi->s_cluster_ratio, flags);
@@ -2396,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* tail removal */
 		ext4_lblk_t num;
+		unsigned int unaligned;
 
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		pblk = ext4_ext_pblock(ex) + ee_len - num;
-		ext_debug("free last %u blocks starting %llu\n", num, pblk);
+		/*
+		 * Usually we want to free partial cluster at the end of the
+		 * extent, except for the situation when the cluster is still
+		 * used by any other extent (partial_cluster is negative).
+		 */
+		if (*partial_cluster < 0 &&
+		    -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+		ext_debug("free last %u blocks starting %llu partial %lld\n",
+			  num, pblk, *partial_cluster);
 		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
 		/*
 		 * If the block range to be freed didn't start at the
 		 * beginning of a cluster, and we removed the entire
-		 * extent, save the partial cluster here, since we
-		 * might need to delete if we determine that the
-		 * truncate operation has removed all of the blocks in
-		 * the cluster.
+		 * extent and the cluster is not used by any other extent,
+		 * save the partial cluster here, since we might need to
+		 * delete if we determine that the truncate operation has
+		 * removed all of the blocks in the cluster.
+		 *
+		 * On the other hand, if we did not manage to free the whole
+		 * extent, we have to mark the cluster as used (store negative
+		 * cluster number in partial_cluster).
 		 */
-		if (pblk & (sbi->s_cluster_ratio - 1) &&
-		    (ee_len == num))
+		unaligned = pblk & (sbi->s_cluster_ratio - 1);
+		if (unaligned && (ee_len == num) &&
+		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
 			*partial_cluster = EXT4_B2C(sbi, pblk);
-		else
+		else if (unaligned)
+			*partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+		else if (*partial_cluster > 0)
 			*partial_cluster = 0;
-	} else if (from == le32_to_cpu(ex->ee_block)
-		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		/* head removal */
-		ext4_lblk_t num;
-		ext4_fsblk_t start;
-
-		num = to - from;
-		start = ext4_ext_pblock(ex);
-
-		ext_debug("free first %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, NULL, start, num, flags);
-
-	} else {
-		printk(KERN_INFO "strange request: removal(2) "
-				"%u-%u from %u:%u\n",
-				from, to, le32_to_cpu(ex->ee_block), ee_len);
-	}
+	} else
+		ext4_error(sbi->s_sb, "strange request: removal(2) "
+			   "%u-%u from %u:%u\n",
+			   from, to, le32_to_cpu(ex->ee_block), ee_len);
 	return 0;
 }
 
@@ -2443,12 +2470,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
  * @handle: The journal handle
  * @inode:  The files inode
  * @path:   The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ *                   has been released from it. It gets negative in case
+ *                   that the cluster is still used.
  * @start:  The first block to remove
  * @end:   The last block to remove
  */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+		 struct ext4_ext_path *path,
+		 long long *partial_cluster,
 		 ext4_lblk_t start, ext4_lblk_t end)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2461,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	ext4_fsblk_t pblk;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf to %u\n", start, end);
@@ -2472,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		return -EIO;
 	}
 	/* find where to start removing */
-	ex = EXT_LAST_EXTENT(eh);
+	ex = path[depth].p_ext;
+	if (!ex)
+		ex = EXT_LAST_EXTENT(eh);
 
 	ex_ee_block = le32_to_cpu(ex->ee_block);
 	ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2499,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 
 		/* If this extent is beyond the end of the hole, skip it */
 		if (end < ex_ee_block) {
+			/*
+			 * We're going to skip this extent and move to another,
+			 * so if this extent is not cluster aligned we have
+			 * to mark the current cluster as used to avoid
+			 * accidentally freeing it later on
+			 */
+			pblk = ext4_ext_pblock(ex);
+			if (pblk & (sbi->s_cluster_ratio - 1))
+				*partial_cluster =
+					-((long long)EXT4_B2C(sbi, pblk));
 			ex--;
 			ex_ee_block = le32_to_cpu(ex->ee_block);
 			ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2574,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 					sizeof(struct ext4_extent));
 			}
 			le16_add_cpu(&eh->eh_entries, -1);
-		} else
+		} else if (*partial_cluster > 0)
 			*partial_cluster = 0;
 
 		err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2592,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		err = ext4_ext_correct_indexes(handle, inode, path);
 
 	/*
-	 * If there is still a entry in the leaf node, check to see if
-	 * it references the partial cluster.  This is the only place
-	 * where it could; if it doesn't, we can free the cluster.
+	 * Free the partial cluster only if the current extent does not
+	 * reference it. Otherwise we might free used cluster.
 	 */
-	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+	if (*partial_cluster > 0 &&
 	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
 	     *partial_cluster)) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
@@ -2640,27 +2680,25 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
 	return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
-				 ext4_lblk_t end)
+int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+			  ext4_lblk_t end)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path = NULL;
-	ext4_fsblk_t partial_cluster = 0;
+	long long partial_cluster = 0;
 	handle_t *handle;
 	int i = 0, err = 0;
 
 	ext_debug("truncate since %u to %u\n", start, end);
 
 	/* probably first extent we're gonna free will be last in block */
-	handle = ext4_journal_start(inode, depth + 1);
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
 again:
-	ext4_ext_invalidate_cache(inode);
-
-	trace_ext4_ext_remove_space(inode, start, depth);
+	trace_ext4_ext_remove_space(inode, start, end, depth);
 
 	/*
 	 * Check if we are removing extents inside the extent tree. If that
@@ -2710,12 +2748,14 @@ again:
 
 			/*
 			 * Split the extent in two so that 'end' is the last
-			 * block in the first new extent
+			 * block in the first new extent. Also we should not
+			 * fail removing space due to ENOSPC so try to use
+			 * reserved block if that happens.
 			 */
 			err = ext4_split_extent_at(handle, inode, path,
-						end + 1, split_flag,
-						EXT4_GET_BLOCKS_PRE_IO |
-						EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+					end + 1, split_flag,
+					EXT4_GET_BLOCKS_PRE_IO |
+					EXT4_GET_BLOCKS_METADATA_NOFAIL);
 
 			if (err < 0)
 				goto out;
@@ -2826,17 +2866,14 @@ again:
 		}
 	}
 
-	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
-			path->p_hdr->eh_entries);
+	trace_ext4_ext_remove_space_done(inode, start, end, depth,
+			partial_cluster, path->p_hdr->eh_entries);
 
 	/* If we still have something in the partial cluster and we have removed
 	 * even the first extent, then we should free the blocks in the partial
 	 * cluster as well. */
-	if (partial_cluster && path->p_hdr->eh_entries == 0) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+	if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
@@ -2968,7 +3005,7 @@ static int ext4_split_extent_at(handle_t *handle,
 {
 	ext4_fsblk_t newblock;
 	ext4_lblk_t ee_block;
-	struct ext4_extent *ex, newex, orig_ex;
+	struct ext4_extent *ex, newex, orig_ex, zero_ex;
 	struct ext4_extent *ex2 = NULL;
 	unsigned int ee_len, depth;
 	int err = 0;
@@ -2988,6 +3025,10 @@ static int ext4_split_extent_at(handle_t *handle,
 	newblock = split - ee_block + ext4_ext_pblock(ex);
 
 	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
+	BUG_ON(!ext4_ext_is_uninitialized(ex) &&
+	       split_flag & (EXT4_EXT_MAY_ZEROOUT |
+			     EXT4_EXT_MARK_UNINIT1 |
+			     EXT4_EXT_MARK_UNINIT2));
 
 	err = ext4_ext_get_access(handle, inode, path + depth);
 	if (err)
@@ -3035,12 +3076,29 @@ static int ext4_split_extent_at(handle_t *handle,
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
 		if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
-			if (split_flag & EXT4_EXT_DATA_VALID1)
+			if (split_flag & EXT4_EXT_DATA_VALID1) {
 				err = ext4_ext_zeroout(inode, ex2);
-			else
+				zero_ex.ee_block = ex2->ee_block;
+				zero_ex.ee_len = cpu_to_le16(
+						ext4_ext_get_actual_len(ex2));
+				ext4_ext_store_pblock(&zero_ex,
+						      ext4_ext_pblock(ex2));
+			} else {
 				err = ext4_ext_zeroout(inode, ex);
-		} else
+				zero_ex.ee_block = ex->ee_block;
+				zero_ex.ee_len = cpu_to_le16(
+						ext4_ext_get_actual_len(ex));
+				ext4_ext_store_pblock(&zero_ex,
+						      ext4_ext_pblock(ex));
+			}
+		} else {
 			err = ext4_ext_zeroout(inode, &orig_ex);
+			zero_ex.ee_block = orig_ex.ee_block;
+			zero_ex.ee_len = cpu_to_le16(
+						ext4_ext_get_actual_len(&orig_ex));
+			ext4_ext_store_pblock(&zero_ex,
+					      ext4_ext_pblock(&orig_ex));
+		}
 
 		if (err)
 			goto fix_extent_len;
@@ -3048,6 +3106,12 @@ static int ext4_split_extent_at(handle_t *handle,
 		ex->ee_len = cpu_to_le16(ee_len);
 		ext4_ext_try_to_merge(handle, inode, path, ex);
 		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+		if (err)
+			goto fix_extent_len;
+
+		/* update extent status tree */
+		err = ext4_es_zeroout(inode, &zero_ex);
+
 		goto out;
 	} else if (err)
 		goto fix_extent_len;
@@ -3086,6 +3150,7 @@ static int ext4_split_extent(handle_t *handle,
 	int err = 0;
 	int uninitialized;
 	int split_flag1, flags1;
+	int allocated = map->m_len;
 
 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
@@ -3105,20 +3170,29 @@ static int ext4_split_extent(handle_t *handle,
 				map->m_lblk + map->m_len, split_flag1, flags1);
 		if (err)
 			goto out;
+	} else {
+		allocated = ee_len - (map->m_lblk - ee_block);
 	}
-
+	/*
+	 * Update path is required because previous ext4_split_extent_at() may
+	 * result in split of original leaf or extent zeroout.
+	 */
 	ext4_ext_drop_refs(path);
 	path = ext4_ext_find_extent(inode, map->m_lblk, path);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	uninitialized = ext4_ext_is_uninitialized(ex);
+	split_flag1 = 0;
 
 	if (map->m_lblk >= ee_block) {
-		split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT |
-					    EXT4_EXT_DATA_VALID2);
-		if (uninitialized)
+		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
+		if (uninitialized) {
 			split_flag1 |= EXT4_EXT_MARK_UNINIT1;
-		if (split_flag & EXT4_EXT_MARK_UNINIT2)
-			split_flag1 |= EXT4_EXT_MARK_UNINIT2;
+			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
+						     EXT4_EXT_MARK_UNINIT2);
+		}
 		err = ext4_split_extent_at(handle, inode, path,
 				map->m_lblk, split_flag1, flags);
 		if (err)
@@ -3127,7 +3201,7 @@ static int ext4_split_extent(handle_t *handle,
 
 	ext4_ext_show_leaf(inode, path);
 out:
-	return err ? err : map->m_len;
+	return err ? err : allocated;
 }
 
 /*
@@ -3153,35 +3227,36 @@ out:
 static int ext4_ext_convert_to_initialized(handle_t *handle,
 					   struct inode *inode,
 					   struct ext4_map_blocks *map,
-					   struct ext4_ext_path *path)
+					   struct ext4_ext_path *path,
+					   int flags)
 {
 	struct ext4_sb_info *sbi;
 	struct ext4_extent_header *eh;
 	struct ext4_map_blocks split_map;
 	struct ext4_extent zero_ex;
-	struct ext4_extent *ex;
+	struct ext4_extent *ex, *abut_ex;
 	ext4_lblk_t ee_block, eof_block;
-	unsigned int ee_len, depth;
-	int allocated, max_zeroout = 0;
+	unsigned int ee_len, depth, map_len = map->m_len;
+	int allocated = 0, max_zeroout = 0;
 	int err = 0;
 	int split_flag = 0;
 
 	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
 		"block %llu, max_blocks %u\n", inode->i_ino,
-		(unsigned long long)map->m_lblk, map->m_len);
+		(unsigned long long)map->m_lblk, map_len);
 
 	sbi = EXT4_SB(inode->i_sb);
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
-	if (eof_block < map->m_lblk + map->m_len)
-		eof_block = map->m_lblk + map->m_len;
+	if (eof_block < map->m_lblk + map_len)
+		eof_block = map->m_lblk + map_len;
 
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
-	allocated = ee_len - (map->m_lblk - ee_block);
+	zero_ex.ee_len = 0;
 
 	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
 
@@ -3191,77 +3266,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 
 	/*
 	 * Attempt to transfer newly initialized blocks from the currently
-	 * uninitialized extent to its left neighbor. This is much cheaper
+	 * uninitialized extent to its neighbor. This is much cheaper
 	 * than an insertion followed by a merge as those involve costly
-	 * memmove() calls. This is the common case in steady state for
-	 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
-	 * writes.
+	 * memmove() calls. Transferring to the left is the common case in
+	 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
+	 * followed by append writes.
 	 *
 	 * Limitations of the current logic:
-	 *  - L1: we only deal with writes at the start of the extent.
-	 *    The approach could be extended to writes at the end
-	 *    of the extent but this scenario was deemed less common.
-	 *  - L2: we do not deal with writes covering the whole extent.
+	 *  - L1: we do not deal with writes covering the whole extent.
 	 *    This would require removing the extent if the transfer
 	 *    is possible.
-	 *  - L3: we only attempt to merge with an extent stored in the
+	 *  - L2: we only attempt to merge with an extent stored in the
 	 *    same extent tree node.
 	 */
-	if ((map->m_lblk == ee_block) &&	/*L1*/
-		(map->m_len < ee_len) &&	/*L2*/
-		(ex > EXT_FIRST_EXTENT(eh))) {	/*L3*/
-		struct ext4_extent *prev_ex;
+	if ((map->m_lblk == ee_block) &&
+		/* See if we can merge left */
+		(map_len < ee_len) &&		/*L1*/
+		(ex > EXT_FIRST_EXTENT(eh))) {	/*L2*/
 		ext4_lblk_t prev_lblk;
 		ext4_fsblk_t prev_pblk, ee_pblk;
-		unsigned int prev_len, write_len;
+		unsigned int prev_len;
 
-		prev_ex = ex - 1;
-		prev_lblk = le32_to_cpu(prev_ex->ee_block);
-		prev_len = ext4_ext_get_actual_len(prev_ex);
-		prev_pblk = ext4_ext_pblock(prev_ex);
+		abut_ex = ex - 1;
+		prev_lblk = le32_to_cpu(abut_ex->ee_block);
+		prev_len = ext4_ext_get_actual_len(abut_ex);
+		prev_pblk = ext4_ext_pblock(abut_ex);
 		ee_pblk = ext4_ext_pblock(ex);
-		write_len = map->m_len;
 
 		/*
-		 * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
 		 * upon those conditions:
-		 * - C1: prev_ex is initialized,
-		 * - C2: prev_ex is logically abutting ex,
-		 * - C3: prev_ex is physically abutting ex,
-		 * - C4: prev_ex can receive the additional blocks without
+		 * - C1: abut_ex is initialized,
+		 * - C2: abut_ex is logically abutting ex,
+		 * - C3: abut_ex is physically abutting ex,
+		 * - C4: abut_ex can receive the additional blocks without
 		 *   overflowing the (initialized) length limit.
 		 */
-		if ((!ext4_ext_is_uninitialized(prev_ex)) &&		/*C1*/
+		if ((!ext4_ext_is_uninitialized(abut_ex)) &&		/*C1*/
 			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
 			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
-			(prev_len < (EXT_INIT_MAX_LEN - write_len))) {	/*C4*/
+			(prev_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
 			err = ext4_ext_get_access(handle, inode, path + depth);
 			if (err)
 				goto out;
 
 			trace_ext4_ext_convert_to_initialized_fastpath(inode,
-				map, ex, prev_ex);
+				map, ex, abut_ex);
 
-			/* Shift the start of ex by 'write_len' blocks */
-			ex->ee_block = cpu_to_le32(ee_block + write_len);
-			ext4_ext_store_pblock(ex, ee_pblk + write_len);
-			ex->ee_len = cpu_to_le16(ee_len - write_len);
+			/* Shift the start of ex by 'map_len' blocks */
+			ex->ee_block = cpu_to_le32(ee_block + map_len);
+			ext4_ext_store_pblock(ex, ee_pblk + map_len);
+			ex->ee_len = cpu_to_le16(ee_len - map_len);
 			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
 
-			/* Extend prev_ex by 'write_len' blocks */
-			prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+			/* Extend abut_ex by 'map_len' blocks */
+			abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
 
-			/* Mark the block containing both extents as dirty */
-			ext4_ext_dirty(handle, inode, path + depth);
+			/* Result: number of initialized blocks past m_lblk */
+			allocated = map_len;
+		}
+	} else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
+		   (map_len < ee_len) &&	/*L1*/
+		   ex < EXT_LAST_EXTENT(eh)) {	/*L2*/
+		/* See if we can merge right */
+		ext4_lblk_t next_lblk;
+		ext4_fsblk_t next_pblk, ee_pblk;
+		unsigned int next_len;
+
+		abut_ex = ex + 1;
+		next_lblk = le32_to_cpu(abut_ex->ee_block);
+		next_len = ext4_ext_get_actual_len(abut_ex);
+		next_pblk = ext4_ext_pblock(abut_ex);
+		ee_pblk = ext4_ext_pblock(ex);
 
-			/* Update path to point to the right extent */
-			path[depth].p_ext = prev_ex;
+		/*
+		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+		 * upon those conditions:
+		 * - C1: abut_ex is initialized,
+		 * - C2: abut_ex is logically abutting ex,
+		 * - C3: abut_ex is physically abutting ex,
+		 * - C4: abut_ex can receive the additional blocks without
+		 *   overflowing the (initialized) length limit.
+		 */
+		if ((!ext4_ext_is_uninitialized(abut_ex)) &&		/*C1*/
+		    ((map->m_lblk + map_len) == next_lblk) &&		/*C2*/
+		    ((ee_pblk + ee_len) == next_pblk) &&		/*C3*/
+		    (next_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
+			err = ext4_ext_get_access(handle, inode, path + depth);
+			if (err)
+				goto out;
+
+			trace_ext4_ext_convert_to_initialized_fastpath(inode,
+				map, ex, abut_ex);
+
+			/* Shift the start of abut_ex by 'map_len' blocks */
+			abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
+			ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
+			ex->ee_len = cpu_to_le16(ee_len - map_len);
+			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+			/* Extend abut_ex by 'map_len' blocks */
+			abut_ex->ee_len = cpu_to_le16(next_len + map_len);
 
 			/* Result: number of initialized blocks past m_lblk */
-			allocated = write_len;
-			goto out;
+			allocated = map_len;
 		}
 	}
+	if (allocated) {
+		/* Mark the block containing both extents as dirty */
+		ext4_ext_dirty(handle, inode, path + depth);
+
+		/* Update path to point to the right extent */
+		path[depth].p_ext = abut_ex;
+		goto out;
+	} else
+		allocated = ee_len - (map->m_lblk - ee_block);
 
 	WARN_ON(map->m_lblk < ee_block);
 	/*
@@ -3272,13 +3391,16 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 
 	if (EXT4_EXT_MAY_ZEROOUT & split_flag)
 		max_zeroout = sbi->s_extent_max_zeroout_kb >>
-			inode->i_sb->s_blocksize_bits;
+			(inode->i_sb->s_blocksize_bits - 10);
 
 	/* If extent is less than s_max_zeroout_kb, zeroout directly */
 	if (max_zeroout && (ee_len <= max_zeroout)) {
 		err = ext4_ext_zeroout(inode, ex);
 		if (err)
 			goto out;
+		zero_ex.ee_block = ex->ee_block;
+		zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
+		ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
 
 		err = ext4_ext_get_access(handle, inode, path + depth);
 		if (err)
@@ -3332,11 +3454,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	}
 
 	allocated = ext4_split_extent(handle, inode, path,
-				      &split_map, split_flag, 0);
+				      &split_map, split_flag, flags);
 	if (allocated < 0)
 		err = allocated;
 
 out:
+	/* If we have gotten a failure, don't zero out status tree */
+	if (!err)
+		err = ext4_es_zeroout(inode, &zero_ex);
 	return err ? err : allocated;
 }
 
@@ -3419,8 +3544,19 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 		"block %llu, max_blocks %u\n", inode->i_ino,
 		  (unsigned long long)ee_block, ee_len);
 
-	/* If extent is larger than requested then split is required */
+	/* If extent is larger than requested it is a clear sign that we still
+	 * have some extent state machine issues left. So extent_split is still
+	 * required.
+	 * TODO: Once all related issues will be fixed this situation should be
+	 * illegal.
+	 */
 	if (ee_block != map->m_lblk || ee_len > map->m_len) {
+#ifdef EXT4_DEBUG
+		ext4_warning("Inode (%ld) finished: extent logical block %llu,"
+			     " len %u; IO logical block %llu, len %u\n",
+			     inode->i_ino, (unsigned long long)ee_block, ee_len,
+			     (unsigned long long)map->m_lblk, map->m_len);
+#endif
 		err = ext4_split_unwritten_extents(handle, inode, map, path,
 						   EXT4_GET_BLOCKS_CONVERT);
 		if (err < 0)
@@ -3519,19 +3655,19 @@ out:
  *
  * Return 1 if there is a delalloc block in the range, otherwise 0.
  */
-static int ext4_find_delalloc_range(struct inode *inode,
-				    ext4_lblk_t lblk_start,
-				    ext4_lblk_t lblk_end)
+int ext4_find_delalloc_range(struct inode *inode,
+			     ext4_lblk_t lblk_start,
+			     ext4_lblk_t lblk_end)
 {
 	struct extent_status es;
 
-	es.start = lblk_start;
-	ext4_es_find_extent(inode, &es);
-	if (es.len == 0)
+	ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
+	if (es.es_len == 0)
 		return 0; /* there is no delay extent in this tree */
-	else if (es.start <= lblk_start && lblk_start < es.start + es.len)
+	else if (es.es_lblk <= lblk_start &&
+		 lblk_start < es.es_lblk + es.es_len)
 		return 1;
-	else if (lblk_start <= es.start && es.start <= lblk_end)
+	else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
 		return 1;
 	else
 		return 0;
@@ -3638,6 +3774,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		  flags, allocated);
 	ext4_ext_show_leaf(inode, path);
 
+	/*
+	 * When writing into uninitialized space, we should not fail to
+	 * allocate metadata blocks for the new extent block if needed.
+	 */
+	flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
+
 	trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
 						    allocated, newblock);
 
@@ -3656,6 +3798,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 			ext4_set_io_unwritten_flag(inode, io);
 		else
 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+		map->m_flags |= EXT4_MAP_UNWRITTEN;
 		if (ext4_should_dioread_nolock(inode))
 			map->m_flags |= EXT4_MAP_UNINIT;
 		goto out;
@@ -3670,6 +3813,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 						 path, map->m_len);
 		} else
 			err = ret;
+		map->m_flags |= EXT4_MAP_MAPPED;
+		if (allocated > map->m_len)
+			allocated = map->m_len;
+		map->m_len = allocated;
 		goto out2;
 	}
 	/* buffered IO case */
@@ -3677,8 +3824,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	 * repeat fallocate creation request
 	 * we already have an unwritten extent
 	 */
-	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
+		map->m_flags |= EXT4_MAP_UNWRITTEN;
 		goto map_out;
+	}
 
 	/* buffered READ or buffered write_begin() lookup */
 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3694,7 +3843,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	}
 
 	/* buffered write, writepage time, convert*/
-	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
+	ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
 	if (ret >= 0)
 		ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
@@ -3717,6 +3866,7 @@ out:
 					allocated - map->m_len);
 		allocated = map->m_len;
 	}
+	map->m_len = allocated;
 
 	/*
 	 * If we have done fallocate with the offset that is already
@@ -3898,35 +4048,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		  map->m_lblk, map->m_len, inode->i_ino);
 	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 
-	/* check in cache */
-	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
-		if (!newex.ee_start_lo && !newex.ee_start_hi) {
-			if ((sbi->s_cluster_ratio > 1) &&
-			    ext4_find_delalloc_cluster(inode, map->m_lblk))
-				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
-
-			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
-				/*
-				 * block isn't allocated yet and
-				 * user doesn't want to allocate it
-				 */
-				goto out2;
-			}
-			/* we should allocate requested block */
-		} else {
-			/* block is already allocated */
-			if (sbi->s_cluster_ratio > 1)
-				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
-			newblock = map->m_lblk
-				   - le32_to_cpu(newex.ee_block)
-				   + ext4_ext_pblock(&newex);
-			/* number of remaining blocks in the extent */
-			allocated = ext4_ext_get_actual_len(&newex) -
-				(map->m_lblk - le32_to_cpu(newex.ee_block));
-			goto out;
-		}
-	}
-
 	/* find extent for this block */
 	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
 	if (IS_ERR(path)) {
@@ -3973,15 +4094,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
 				  ee_block, ee_len, newblock);
 
-			/*
-			 * Do not put uninitialized extent
-			 * in the cache
-			 */
-			if (!ext4_ext_is_uninitialized(ex)) {
-				ext4_ext_put_in_cache(inode, ee_block,
-					ee_len, ee_start);
+			if (!ext4_ext_is_uninitialized(ex))
 				goto out;
-			}
+
 			allocated = ext4_ext_handle_uninitialized_extents(
 				handle, inode, map, path, flags,
 				allocated, newblock);
@@ -4002,7 +4117,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		 * put just found gap into cache to speed up
 		 * subsequent requests
 		 */
-		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+		if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0)
+			ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
 		goto out2;
 	}
 
@@ -4108,6 +4224,7 @@ got_allocated_blocks:
 	/* Mark uninitialized */
 	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
 		ext4_ext_mark_uninitialized(&newex);
+		map->m_flags |= EXT4_MAP_UNWRITTEN;
 		/*
 		 * io_end structure was created for every IO write to an
 		 * uninitialized extent. To avoid unnecessary conversion,
@@ -4181,9 +4298,6 @@ got_allocated_blocks:
 			}
 		} else {
 			BUG_ON(allocated_clusters < reserved_clusters);
-			/* We will claim quota for all newly allocated blocks.*/
-			ext4_da_update_reserve_space(inode, allocated_clusters,
-							1);
 			if (reserved_clusters < allocated_clusters) {
 				struct ext4_inode_info *ei = EXT4_I(inode);
 				int reservation = allocated_clusters -
@@ -4234,6 +4348,15 @@ got_allocated_blocks:
 				ei->i_reserved_data_blocks += reservation;
 				spin_unlock(&ei->i_block_reservation_lock);
 			}
+			/*
+			 * We will claim quota for all newly allocated blocks.
+			 * We're updating the reserved space *after* the
+			 * correction above so we do not accidentally free
+			 * all the metadata reservation because we might
+			 * actually need it later on.
+			 */
+			ext4_da_update_reserve_space(inode, allocated_clusters,
+							1);
 		}
 	}
 
@@ -4241,10 +4364,9 @@ got_allocated_blocks:
 	 * Cache the extent and update transaction to commit on fdatasync only
 	 * when it is _not_ an uninitialized extent.
 	 */
-	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
-		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
+	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
 		ext4_update_inode_fsync_trans(handle, inode, 1);
-	} else
+	else
 		ext4_update_inode_fsync_trans(handle, inode, 0);
 out:
 	if (allocated > map->m_len)
@@ -4260,54 +4382,18 @@ out2:
 	}
 
 out3:
-	trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
+	trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated);
 
 	return err ? err : allocated;
 }
 
-void ext4_ext_truncate(struct inode *inode)
+void ext4_ext_truncate(handle_t *handle, struct inode *inode)
 {
-	struct address_space *mapping = inode->i_mapping;
 	struct super_block *sb = inode->i_sb;
 	ext4_lblk_t last_block;
-	handle_t *handle;
-	loff_t page_len;
 	int err = 0;
 
 	/*
-	 * finish any pending end_io work so we won't run the risk of
-	 * converting any truncated blocks to initialized later
-	 */
-	ext4_flush_unwritten_io(inode);
-
-	/*
-	 * probably first extent we're gonna free will be last in block
-	 */
-	err = ext4_writepage_trans_blocks(inode);
-	handle = ext4_journal_start(inode, err);
-	if (IS_ERR(handle))
-		return;
-
-	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
-		page_len = PAGE_CACHE_SIZE -
-			(inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-		err = ext4_discard_partial_page_buffers(handle,
-			mapping, inode->i_size, page_len, 0);
-
-		if (err)
-			goto out_stop;
-	}
-
-	if (ext4_orphan_add(handle, inode))
-		goto out_stop;
-
-	down_write(&EXT4_I(inode)->i_data_sem);
-	ext4_ext_invalidate_cache(inode);
-
-	ext4_discard_preallocations(inode);
-
-	/*
 	 * TODO: optimization is possible here.
 	 * Probably we need not scan at all,
 	 * because page truncation is enough.
@@ -4322,29 +4408,6 @@ void ext4_ext_truncate(struct inode *inode)
 	err = ext4_es_remove_extent(inode, last_block,
 				    EXT_MAX_BLOCKS - last_block);
 	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
-
-	/* In a multi-transaction truncate, we only make the final
-	 * transaction synchronous.
-	 */
-	if (IS_SYNC(inode))
-		ext4_handle_sync(handle);
-
-	up_write(&EXT4_I(inode)->i_data_sem);
-
-out_stop:
-	/*
-	 * If this was a simple ftruncate() and the file will remain alive,
-	 * then we need to clear up the orphan record which we created above.
-	 * However, if this was a real unlink then we were called by
-	 * ext4_delete_inode(), and we allow that function to clean up the
-	 * orphan info for us.
-	 */
-	if (inode->i_nlink)
-		ext4_orphan_del(handle, inode);
-
-	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
-	ext4_journal_stop(handle);
 }
 
 static void ext4_falloc_update_inode(struct inode *inode,
@@ -4386,7 +4449,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
  */
 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	handle_t *handle;
 	loff_t new_size;
 	unsigned int max_blocks;
@@ -4397,24 +4460,24 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	struct ext4_map_blocks map;
 	unsigned int credits, blkbits = inode->i_blkbits;
 
-	/*
-	 * currently supporting (pre)allocate mode for extent-based
-	 * files _only_
-	 */
-	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
-		return -EOPNOTSUPP;
-
 	/* Return error if mode is not supported */
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		return ext4_punch_hole(file, offset, len);
+		return ext4_punch_hole(inode, offset, len);
 
 	ret = ext4_convert_inline_data(inode);
 	if (ret)
 		return ret;
 
+	/*
+	 * currently supporting (pre)allocate mode for extent-based
+	 * files _only_
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+		return -EOPNOTSUPP;
+
 	trace_ext4_fallocate_enter(inode, offset, len, mode);
 	map.m_lblk = offset >> blkbits;
 	/*
@@ -4445,13 +4508,12 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (len <= EXT_UNINIT_MAX_LEN << blkbits)
 		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
 
-	/* Prevent race condition between unwritten */
-	ext4_flush_unwritten_io(inode);
 retry:
 	while (ret >= 0 && ret < max_blocks) {
 		map.m_lblk = map.m_lblk + ret;
 		map.m_len = max_blocks = max_blocks - ret;
-		handle = ext4_journal_start(inode, credits);
+		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+					    credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			break;
@@ -4459,11 +4521,11 @@ retry:
 		ret = ext4_map_blocks(handle, inode, &map, flags);
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
-			WARN_ON(ret <= 0);
-			printk(KERN_ERR "%s: ext4_ext_map_blocks "
-				    "returned error inode#%lu, block=%u, "
-				    "max_blocks=%u", __func__,
-				    inode->i_ino, map.m_lblk, max_blocks);
+			ext4_warning(inode->i_sb,
+				     "inode #%lu: block %u: len %u: "
+				     "ext4_ext_map_blocks returned %d",
+				     inode->i_ino, map.m_lblk,
+				     map.m_len, ret);
 #endif
 			ext4_mark_inode_dirty(handle, inode);
 			ret2 = ext4_journal_stop(handle);
@@ -4505,10 +4567,9 @@ retry:
  * function, to convert the fallocated extents after IO is completed.
  * Returns 0 on success.
  */
-int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-				    ssize_t len)
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+				   loff_t offset, ssize_t len)
 {
-	handle_t *handle;
 	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
@@ -4523,72 +4584,96 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
 		      map.m_lblk);
 	/*
-	 * credits to insert 1 extent into extent tree
+	 * This is somewhat ugly but the idea is clear: When transaction is
+	 * reserved, everything goes into it. Otherwise we rather start several
+	 * smaller transactions for conversion of each extent separately.
 	 */
-	credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	if (handle) {
+		handle = ext4_journal_start_reserved(handle,
+						     EXT4_HT_EXT_CONVERT);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		credits = 0;
+	} else {
+		/*
+		 * credits to insert 1 extent into extent tree
+		 */
+		credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	}
 	while (ret >= 0 && ret < max_blocks) {
 		map.m_lblk += ret;
 		map.m_len = (max_blocks -= ret);
-		handle = ext4_journal_start(inode, credits);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			break;
+		if (credits) {
+			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+						    credits);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				break;
+			}
 		}
 		ret = ext4_map_blocks(handle, inode, &map,
 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
-		if (ret <= 0) {
-			WARN_ON(ret <= 0);
-			ext4_msg(inode->i_sb, KERN_ERR,
-				 "%s:%d: inode #%lu: block %u: len %u: "
-				 "ext4_ext_map_blocks returned %d",
-				 __func__, __LINE__, inode->i_ino, map.m_lblk,
-				 map.m_len, ret);
-		}
+		if (ret <= 0)
+			ext4_warning(inode->i_sb,
+				     "inode #%lu: block %u: len %u: "
+				     "ext4_ext_map_blocks returned %d",
+				     inode->i_ino, map.m_lblk,
+				     map.m_len, ret);
 		ext4_mark_inode_dirty(handle, inode);
-		ret2 = ext4_journal_stop(handle);
-		if (ret <= 0 || ret2 )
+		if (credits)
+			ret2 = ext4_journal_stop(handle);
+		if (ret <= 0 || ret2)
 			break;
 	}
+	if (!credits)
+		ret2 = ext4_journal_stop(handle);
 	return ret > 0 ? ret2 : ret;
 }
 
 /*
- * If newex is not existing extent (newex->ec_start equals zero) find
- * delayed extent at start of newex and update newex accordingly and
+ * If newes is not existing extent (newes->ec_pblk equals zero) find
+ * delayed extent at start of newes and update newes accordingly and
  * return start of the next delayed extent.
  *
- * If newex is existing extent (newex->ec_start is not equal zero)
+ * If newes is existing extent (newes->ec_pblk is not equal zero)
  * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
- * extent found. Leave newex unmodified.
+ * extent found. Leave newes unmodified.
  */
 static int ext4_find_delayed_extent(struct inode *inode,
-				    struct ext4_ext_cache *newex)
+				    struct extent_status *newes)
 {
 	struct extent_status es;
-	ext4_lblk_t next_del;
+	ext4_lblk_t block, next_del;
 
-	es.start = newex->ec_block;
-	next_del = ext4_es_find_extent(inode, &es);
+	if (newes->es_pblk == 0) {
+		ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
+				newes->es_lblk + newes->es_len - 1, &es);
 
-	if (newex->ec_start == 0) {
 		/*
-		 * No extent in extent-tree contains block @newex->ec_start,
+		 * No extent in extent-tree contains block @newes->es_pblk,
 		 * then the block may stay in 1)a hole or 2)delayed-extent.
 		 */
-		if (es.len == 0)
+		if (es.es_len == 0)
 			/* A hole found. */
 			return 0;
 
-		if (es.start > newex->ec_block) {
+		if (es.es_lblk > newes->es_lblk) {
 			/* A hole found. */
-			newex->ec_len = min(es.start - newex->ec_block,
-					    newex->ec_len);
+			newes->es_len = min(es.es_lblk - newes->es_lblk,
+					    newes->es_len);
 			return 0;
 		}
 
-		newex->ec_len = es.start + es.len - newex->ec_block;
+		newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
 	}
 
+	block = newes->es_lblk + newes->es_len;
+	ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
+	if (es.es_len == 0)
+		next_del = EXT_MAX_BLOCKS;
+	else
+		next_del = es.es_lblk;
+
 	return next_del;
 }
 /* fiemap flags we can handle specified here */
@@ -4611,7 +4696,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 		error = ext4_get_inode_loc(inode, &iloc);
 		if (error)
 			return error;
-		physical = iloc.bh->b_blocknr << blockbits;
+		physical = (__u64)iloc.bh->b_blocknr << blockbits;
 		offset = EXT4_GOOD_OLD_INODE_SIZE +
 				EXT4_I(inode)->i_extra_isize;
 		physical += offset;
@@ -4619,7 +4704,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 		flags |= FIEMAP_EXTENT_DATA_INLINE;
 		brelse(iloc.bh);
 	} else { /* external block */
-		physical = EXT4_I(inode)->i_file_acl << blockbits;
+		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
 		length = inode->i_sb->s_blocksize;
 	}
 
@@ -4629,189 +4714,6 @@ static int ext4_xattr_fiemap(struct inode *inode,
 	return (error < 0 ? error : 0);
 }
 
-/*
- * ext4_ext_punch_hole
- *
- * Punches a hole of "length" bytes in a file starting
- * at byte "offset"
- *
- * @inode:  The inode of the file to punch a hole in
- * @offset: The starting byte offset of the hole
- * @length: The length of the hole
- *
- * Returns the number of blocks removed or negative on err
- */
-int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
-{
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct super_block *sb = inode->i_sb;
-	ext4_lblk_t first_block, stop_block;
-	struct address_space *mapping = inode->i_mapping;
-	handle_t *handle;
-	loff_t first_page, last_page, page_len;
-	loff_t first_page_offset, last_page_offset;
-	int credits, err = 0;
-
-	/*
-	 * Write out all dirty pages to avoid race conditions
-	 * Then release them.
-	 */
-	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-		err = filemap_write_and_wait_range(mapping,
-			offset, offset + length - 1);
-
-		if (err)
-			return err;
-	}
-
-	mutex_lock(&inode->i_mutex);
-	/* It's not possible punch hole on append only file */
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
-		err = -EPERM;
-		goto out_mutex;
-	}
-	if (IS_SWAPFILE(inode)) {
-		err = -ETXTBSY;
-		goto out_mutex;
-	}
-
-	/* No need to punch hole beyond i_size */
-	if (offset >= inode->i_size)
-		goto out_mutex;
-
-	/*
-	 * If the hole extends beyond i_size, set the hole
-	 * to end after the page that contains i_size
-	 */
-	if (offset + length > inode->i_size) {
-		length = inode->i_size +
-		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
-		   offset;
-	}
-
-	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
-
-	first_page_offset = first_page << PAGE_CACHE_SHIFT;
-	last_page_offset = last_page << PAGE_CACHE_SHIFT;
-
-	/* Now release the pages */
-	if (last_page_offset > first_page_offset) {
-		truncate_pagecache_range(inode, first_page_offset,
-					 last_page_offset - 1);
-	}
-
-	/* Wait all existing dio workers, newcomers will block on i_mutex */
-	ext4_inode_block_unlocked_dio(inode);
-	err = ext4_flush_unwritten_io(inode);
-	if (err)
-		goto out_dio;
-	inode_dio_wait(inode);
-
-	credits = ext4_writepage_trans_blocks(inode);
-	handle = ext4_journal_start(inode, credits);
-	if (IS_ERR(handle)) {
-		err = PTR_ERR(handle);
-		goto out_dio;
-	}
-
-
-	/*
-	 * Now we need to zero out the non-page-aligned data in the
-	 * pages at the start and tail of the hole, and unmap the buffer
-	 * heads for the block aligned regions of the page that were
-	 * completely zeroed.
-	 */
-	if (first_page > last_page) {
-		/*
-		 * If the file space being truncated is contained within a page
-		 * just zero out and unmap the middle of that page
-		 */
-		err = ext4_discard_partial_page_buffers(handle,
-			mapping, offset, length, 0);
-
-		if (err)
-			goto out;
-	} else {
-		/*
-		 * zero out and unmap the partial page that contains
-		 * the start of the hole
-		 */
-		page_len  = first_page_offset - offset;
-		if (page_len > 0) {
-			err = ext4_discard_partial_page_buffers(handle, mapping,
-						   offset, page_len, 0);
-			if (err)
-				goto out;
-		}
-
-		/*
-		 * zero out and unmap the partial page that contains
-		 * the end of the hole
-		 */
-		page_len = offset + length - last_page_offset;
-		if (page_len > 0) {
-			err = ext4_discard_partial_page_buffers(handle, mapping,
-					last_page_offset, page_len, 0);
-			if (err)
-				goto out;
-		}
-	}
-
-	/*
-	 * If i_size is contained in the last page, we need to
-	 * unmap and zero the partial page after i_size
-	 */
-	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
-	   inode->i_size % PAGE_CACHE_SIZE != 0) {
-
-		page_len = PAGE_CACHE_SIZE -
-			(inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-		if (page_len > 0) {
-			err = ext4_discard_partial_page_buffers(handle,
-			  mapping, inode->i_size, page_len, 0);
-
-			if (err)
-				goto out;
-		}
-	}
-
-	first_block = (offset + sb->s_blocksize - 1) >>
-		EXT4_BLOCK_SIZE_BITS(sb);
-	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
-
-	/* If there are no blocks to remove, return now */
-	if (first_block >= stop_block)
-		goto out;
-
-	down_write(&EXT4_I(inode)->i_data_sem);
-	ext4_ext_invalidate_cache(inode);
-	ext4_discard_preallocations(inode);
-
-	err = ext4_es_remove_extent(inode, first_block,
-				    stop_block - first_block);
-	err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
-
-	ext4_ext_invalidate_cache(inode);
-	ext4_discard_preallocations(inode);
-
-	if (IS_SYNC(inode))
-		ext4_handle_sync(handle);
-
-	up_write(&EXT4_I(inode)->i_data_sem);
-
-out:
-	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
-	ext4_journal_stop(handle);
-out_dio:
-	ext4_inode_resume_unlocked_dio(inode);
-out_mutex:
-	mutex_unlock(&inode->i_mutex);
-	return err;
-}
-
 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len)
 {
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 564d981a2fc..ee018d5f397 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -10,6 +10,7 @@
  * Ext4 extents status tree core functions.
  */
 #include <linux/rbtree.h>
+#include <linux/list_sort.h>
 #include "ext4.h"
 #include "extents_status.h"
 #include "ext4_extents.h"
@@ -23,40 +24,53 @@
  * (e.g. Reservation space warning), and provide extent-level locking.
  * Delay extent tree is the first step to achieve this goal.  It is
  * original built by Yongqiang Yang.  At that time it is called delay
- * extent tree, whose goal is only track delay extent in memory to
+ * extent tree, whose goal is only track delayed extents in memory to
  * simplify the implementation of fiemap and bigalloc, and introduce
  * lseek SEEK_DATA/SEEK_HOLE support.  That is why it is still called
- * delay extent tree at the following comment.  But for better
- * understand what it does, it has been rename to extent status tree.
+ * delay extent tree at the first commit.  But for better understand
+ * what it does, it has been rename to extent status tree.
  *
- * Currently the first step has been done.  All delay extents are
- * tracked in the tree.  It maintains the delay extent when a delay
- * allocation is issued, and the delay extent is written out or
+ * Step1:
+ * Currently the first step has been done.  All delayed extents are
+ * tracked in the tree.  It maintains the delayed extent when a delayed
+ * allocation is issued, and the delayed extent is written out or
  * invalidated.  Therefore the implementation of fiemap and bigalloc
  * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
  *
  * The following comment describes the implemenmtation of extent
  * status tree and future works.
+ *
+ * Step2:
+ * In this step all extent status are tracked by extent status tree.
+ * Thus, we can first try to lookup a block mapping in this tree before
+ * finding it in extent tree.  Hence, single extent cache can be removed
+ * because extent status tree can do a better job.  Extents in status
+ * tree are loaded on-demand.  Therefore, the extent status tree may not
+ * contain all of the extents in a file.  Meanwhile we define a shrinker
+ * to reclaim memory from extent status tree because fragmented extent
+ * tree will make status tree cost too much memory.  written/unwritten/-
+ * hole extents in the tree will be reclaimed by this shrinker when we
+ * are under high memory pressure.  Delayed extents will not be
+ * reclimed because fiemap, bigalloc, and seek_data/hole need it.
  */
 
 /*
- * extents status tree implementation for ext4.
+ * Extent status tree implementation for ext4.
  *
  *
  * ==========================================================================
- * Extents status encompass delayed extents and extent locks
+ * Extent status tree tracks all extent status.
  *
- * 1. Why delayed extent implementation ?
+ * 1. Why we need to implement extent status tree?
  *
- * Without delayed extent, ext4 identifies a delayed extent by looking
+ * Without extent status tree, ext4 identifies a delayed extent by looking
  * up page cache, this has several deficiencies - complicated, buggy,
  * and inefficient code.
  *
- * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
- * to know if a block or a range of blocks are belonged to a delayed
- * extent.
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
+ * block or a range of blocks are belonged to a delayed extent.
  *
- * Let us have a look at how they do without delayed extents implementation.
+ * Let us have a look at how they do without extent status tree.
  *   --	FIEMAP
  *	FIEMAP looks up page cache to identify delayed allocations from holes.
  *
@@ -68,47 +82,48 @@
  *	already under delayed allocation or not to determine whether
  *	quota reserving is needed for the cluster.
  *
- *   -- punch hole
- *	punch hole looks up page cache to identify a delayed extent.
- *
  *   --	writeout
  *	Writeout looks up whole page cache to see if a buffer is
  *	mapped, If there are not very many delayed buffers, then it is
  *	time comsuming.
  *
- * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA,
+ * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
  * bigalloc and writeout can figure out if a block or a range of
  * blocks is under delayed allocation(belonged to a delayed extent) or
- * not by searching the delayed extent tree.
+ * not by searching the extent tree.
  *
  *
  * ==========================================================================
- * 2. ext4 delayed extents impelmentation
+ * 2. Ext4 extent status tree impelmentation
  *
- *   --	delayed extent
- *	A delayed extent is a range of blocks which are contiguous
- *	logically and under delayed allocation.  Unlike extent in
- *	ext4, delayed extent in ext4 is a in-memory struct, there is
- *	no corresponding on-disk data.  There is no limit on length of
- *	delayed extent, so a delayed extent can contain as many blocks
- *	as they are contiguous logically.
+ *   --	extent
+ *	A extent is a range of blocks which are contiguous logically and
+ *	physically.  Unlike extent in extent tree, this extent in ext4 is
+ *	a in-memory struct, there is no corresponding on-disk data.  There
+ *	is no limit on length of extent, so an extent can contain as many
+ *	blocks as they are contiguous logically and physically.
  *
- *   --	delayed extent tree
- *	Every inode has a delayed extent tree and all under delayed
- *	allocation blocks are added to the tree as delayed extents.
- *	Delayed extents in the tree are ordered by logical block no.
+ *   --	extent status tree
+ *	Every inode has an extent status tree and all allocation blocks
+ *	are added to the tree with different status.  The extent in the
+ *	tree are ordered by logical block no.
  *
- *   --	operations on a delayed extent tree
- *	There are three operations on a delayed extent tree: find next
- *	delayed extent, adding a space(a range of blocks) and removing
- *	a space.
+ *   --	operations on a extent status tree
+ *	There are three important operations on a delayed extent tree: find
+ *	next extent, adding a extent(a range of blocks) and removing a extent.
  *
- *   --	race on a delayed extent tree
- *	Delayed extent tree is protected inode->i_es_lock.
+ *   --	race on a extent status tree
+ *	Extent status tree is protected by inode->i_es_lock.
+ *
+ *   --	memory consumption
+ *      Fragmented extent tree will make extent status tree cost too much
+ *      memory.  Hence, we will reclaim written/unwritten/hole extents from
+ *      the tree under a heavy memory pressure.
  *
  *
  * ==========================================================================
- * 3. performance analysis
+ * 3. Performance analysis
+ *
  *   --	overhead
  *	1. There is a cache extent for write access, so if writes are
  *	not very random, adding space operaions are in O(1) time.
@@ -120,18 +135,25 @@
  *
  * ==========================================================================
  * 4. TODO list
- *   -- Track all extent status
  *
- *   -- Improve get block process
+ *   -- Refactor delayed space reservation
  *
  *   -- Extent-level locking
  */
 
 static struct kmem_cache *ext4_es_cachep;
 
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+			      ext4_lblk_t end);
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+				       int nr_to_scan);
+
 int __init ext4_init_es(void)
 {
-	ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
+	ext4_es_cachep = kmem_cache_create("ext4_extent_status",
+					   sizeof(struct extent_status),
+					   0, (SLAB_RECLAIM_ACCOUNT), NULL);
 	if (ext4_es_cachep == NULL)
 		return -ENOMEM;
 	return 0;
@@ -161,7 +183,9 @@ static void ext4_es_print_tree(struct inode *inode)
 	while (node) {
 		struct extent_status *es;
 		es = rb_entry(node, struct extent_status, rb_node);
-		printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
+		printk(KERN_DEBUG " [%u/%u) %llu %llx",
+		       es->es_lblk, es->es_len,
+		       ext4_es_pblock(es), ext4_es_status(es));
 		node = rb_next(node);
 	}
 	printk(KERN_DEBUG "\n");
@@ -170,10 +194,10 @@ static void ext4_es_print_tree(struct inode *inode)
 #define ext4_es_print_tree(inode)
 #endif
 
-static inline ext4_lblk_t extent_status_end(struct extent_status *es)
+static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
 {
-	BUG_ON(es->start + es->len < es->start);
-	return es->start + es->len - 1;
+	BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
+	return es->es_lblk + es->es_len - 1;
 }
 
 /*
@@ -181,25 +205,25 @@ static inline ext4_lblk_t extent_status_end(struct extent_status *es)
  * it can't be found, try to find next extent.
  */
 static struct extent_status *__es_tree_search(struct rb_root *root,
-					      ext4_lblk_t offset)
+					      ext4_lblk_t lblk)
 {
 	struct rb_node *node = root->rb_node;
 	struct extent_status *es = NULL;
 
 	while (node) {
 		es = rb_entry(node, struct extent_status, rb_node);
-		if (offset < es->start)
+		if (lblk < es->es_lblk)
 			node = node->rb_left;
-		else if (offset > extent_status_end(es))
+		else if (lblk > ext4_es_end(es))
 			node = node->rb_right;
 		else
 			return es;
 	}
 
-	if (es && offset < es->start)
+	if (es && lblk < es->es_lblk)
 		return es;
 
-	if (es && offset > extent_status_end(es)) {
+	if (es && lblk > ext4_es_end(es)) {
 		node = rb_next(&es->rb_node);
 		return node ? rb_entry(node, struct extent_status, rb_node) :
 			      NULL;
@@ -209,79 +233,140 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
 }
 
 /*
- * ext4_es_find_extent: find the 1st delayed extent covering @es->start
- * if it exists, otherwise, the next extent after @es->start.
+ * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
+ * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
  *
  * @inode: the inode which owns delayed extents
+ * @lblk: the offset where we start to search
+ * @end: the offset where we stop to search
  * @es: delayed extent that we found
- *
- * Returns the first block of the next extent after es, otherwise
- * EXT_MAX_BLOCKS if no delay extent is found.
- * Delayed extent is returned via @es.
  */
-ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
+void ext4_es_find_delayed_extent_range(struct inode *inode,
+				 ext4_lblk_t lblk, ext4_lblk_t end,
+				 struct extent_status *es)
 {
 	struct ext4_es_tree *tree = NULL;
 	struct extent_status *es1 = NULL;
 	struct rb_node *node;
-	ext4_lblk_t ret = EXT_MAX_BLOCKS;
 
-	trace_ext4_es_find_extent_enter(inode, es->start);
+	BUG_ON(es == NULL);
+	BUG_ON(end < lblk);
+	trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
 
 	read_lock(&EXT4_I(inode)->i_es_lock);
 	tree = &EXT4_I(inode)->i_es_tree;
 
-	/* find delay extent in cache firstly */
+	/* find extent in cache firstly */
+	es->es_lblk = es->es_len = es->es_pblk = 0;
 	if (tree->cache_es) {
 		es1 = tree->cache_es;
-		if (in_range(es->start, es1->start, es1->len)) {
-			es_debug("%u cached by [%u/%u)\n",
-				 es->start, es1->start, es1->len);
+		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+			es_debug("%u cached by [%u/%u) %llu %llx\n",
+				 lblk, es1->es_lblk, es1->es_len,
+				 ext4_es_pblock(es1), ext4_es_status(es1));
 			goto out;
 		}
 	}
 
-	es->len = 0;
-	es1 = __es_tree_search(&tree->root, es->start);
+	es1 = __es_tree_search(&tree->root, lblk);
 
 out:
-	if (es1) {
-		tree->cache_es = es1;
-		es->start = es1->start;
-		es->len = es1->len;
-		node = rb_next(&es1->rb_node);
-		if (node) {
+	if (es1 && !ext4_es_is_delayed(es1)) {
+		while ((node = rb_next(&es1->rb_node)) != NULL) {
 			es1 = rb_entry(node, struct extent_status, rb_node);
-			ret = es1->start;
+			if (es1->es_lblk > end) {
+				es1 = NULL;
+				break;
+			}
+			if (ext4_es_is_delayed(es1))
+				break;
 		}
 	}
 
+	if (es1 && ext4_es_is_delayed(es1)) {
+		tree->cache_es = es1;
+		es->es_lblk = es1->es_lblk;
+		es->es_len = es1->es_len;
+		es->es_pblk = es1->es_pblk;
+	}
+
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	trace_ext4_es_find_extent_exit(inode, es, ret);
-	return ret;
+	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 }
 
 static struct extent_status *
-ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
+ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+		     ext4_fsblk_t pblk)
 {
 	struct extent_status *es;
 	es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
 	if (es == NULL)
 		return NULL;
-	es->start = start;
-	es->len = len;
+	es->es_lblk = lblk;
+	es->es_len = len;
+	es->es_pblk = pblk;
+
+	/*
+	 * We don't count delayed extent because we never try to reclaim them
+	 */
+	if (!ext4_es_is_delayed(es)) {
+		EXT4_I(inode)->i_es_lru_nr++;
+		percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+	}
+
 	return es;
 }
 
-static void ext4_es_free_extent(struct extent_status *es)
+static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
 {
+	/* Decrease the lru counter when this es is not delayed */
+	if (!ext4_es_is_delayed(es)) {
+		BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
+		EXT4_I(inode)->i_es_lru_nr--;
+		percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
+	}
+
 	kmem_cache_free(ext4_es_cachep, es);
 }
 
+/*
+ * Check whether or not two extents can be merged
+ * Condition:
+ *  - logical block number is contiguous
+ *  - physical block number is contiguous
+ *  - status is equal
+ */
+static int ext4_es_can_be_merged(struct extent_status *es1,
+				 struct extent_status *es2)
+{
+	if (ext4_es_status(es1) != ext4_es_status(es2))
+		return 0;
+
+	if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL)
+		return 0;
+
+	if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
+		return 0;
+
+	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
+	    (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
+		return 1;
+
+	if (ext4_es_is_hole(es1))
+		return 1;
+
+	/* we need to check delayed extent is without unwritten status */
+	if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
+		return 1;
+
+	return 0;
+}
+
 static struct extent_status *
-ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
+ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
 {
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 	struct extent_status *es1;
 	struct rb_node *node;
 
@@ -290,10 +375,10 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
 		return es;
 
 	es1 = rb_entry(node, struct extent_status, rb_node);
-	if (es->start == extent_status_end(es1) + 1) {
-		es1->len += es->len;
+	if (ext4_es_can_be_merged(es1, es)) {
+		es1->es_len += es->es_len;
 		rb_erase(&es->rb_node, &tree->root);
-		ext4_es_free_extent(es);
+		ext4_es_free_extent(inode, es);
 		es = es1;
 	}
 
@@ -301,8 +386,9 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
 }
 
 static struct extent_status *
-ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
+ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
 {
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 	struct extent_status *es1;
 	struct rb_node *node;
 
@@ -311,69 +397,230 @@ ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
 		return es;
 
 	es1 = rb_entry(node, struct extent_status, rb_node);
-	if (es1->start == extent_status_end(es) + 1) {
-		es->len += es1->len;
+	if (ext4_es_can_be_merged(es, es1)) {
+		es->es_len += es1->es_len;
 		rb_erase(node, &tree->root);
-		ext4_es_free_extent(es1);
+		ext4_es_free_extent(inode, es1);
 	}
 
 	return es;
 }
 
-static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
-			      ext4_lblk_t len)
+#ifdef ES_AGGRESSIVE_TEST
+static void ext4_es_insert_extent_ext_check(struct inode *inode,
+					    struct extent_status *es)
+{
+	struct ext4_ext_path *path = NULL;
+	struct ext4_extent *ex;
+	ext4_lblk_t ee_block;
+	ext4_fsblk_t ee_start;
+	unsigned short ee_len;
+	int depth, ee_status, es_status;
+
+	path = ext4_ext_find_extent(inode, es->es_lblk, NULL);
+	if (IS_ERR(path))
+		return;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+
+	if (ex) {
+
+		ee_block = le32_to_cpu(ex->ee_block);
+		ee_start = ext4_ext_pblock(ex);
+		ee_len = ext4_ext_get_actual_len(ex);
+
+		ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0;
+		es_status = ext4_es_is_unwritten(es) ? 1 : 0;
+
+		/*
+		 * Make sure ex and es are not overlap when we try to insert
+		 * a delayed/hole extent.
+		 */
+		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
+			if (in_range(es->es_lblk, ee_block, ee_len)) {
+				pr_warn("ES insert assertation failed for "
+					"inode: %lu we can find an extent "
+					"at block [%d/%d/%llu/%c], but we "
+					"want to add an delayed/hole extent "
+					"[%d/%d/%llu/%llx]\n",
+					inode->i_ino, ee_block, ee_len,
+					ee_start, ee_status ? 'u' : 'w',
+					es->es_lblk, es->es_len,
+					ext4_es_pblock(es), ext4_es_status(es));
+			}
+			goto out;
+		}
+
+		/*
+		 * We don't check ee_block == es->es_lblk, etc. because es
+		 * might be a part of whole extent, vice versa.
+		 */
+		if (es->es_lblk < ee_block ||
+		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
+			pr_warn("ES insert assertation failed for inode: %lu "
+				"ex_status [%d/%d/%llu/%c] != "
+				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
+				ee_block, ee_len, ee_start,
+				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
+				ext4_es_pblock(es), es_status ? 'u' : 'w');
+			goto out;
+		}
+
+		if (ee_status ^ es_status) {
+			pr_warn("ES insert assertation failed for inode: %lu "
+				"ex_status [%d/%d/%llu/%c] != "
+				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
+				ee_block, ee_len, ee_start,
+				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
+				ext4_es_pblock(es), es_status ? 'u' : 'w');
+		}
+	} else {
+		/*
+		 * We can't find an extent on disk.  So we need to make sure
+		 * that we don't want to add an written/unwritten extent.
+		 */
+		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
+			pr_warn("ES insert assertation failed for inode: %lu "
+				"can't find an extent at block %d but we want "
+				"to add an written/unwritten extent "
+				"[%d/%d/%llu/%llx]\n", inode->i_ino,
+				es->es_lblk, es->es_lblk, es->es_len,
+				ext4_es_pblock(es), ext4_es_status(es));
+		}
+	}
+out:
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+}
+
+static void ext4_es_insert_extent_ind_check(struct inode *inode,
+					    struct extent_status *es)
 {
+	struct ext4_map_blocks map;
+	int retval;
+
+	/*
+	 * Here we call ext4_ind_map_blocks to lookup a block mapping because
+	 * 'Indirect' structure is defined in indirect.c.  So we couldn't
+	 * access direct/indirect tree from outside.  It is too dirty to define
+	 * this function in indirect.c file.
+	 */
+
+	map.m_lblk = es->es_lblk;
+	map.m_len = es->es_len;
+
+	retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
+	if (retval > 0) {
+		if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
+			/*
+			 * We want to add a delayed/hole extent but this
+			 * block has been allocated.
+			 */
+			pr_warn("ES insert assertation failed for inode: %lu "
+				"We can find blocks but we want to add a "
+				"delayed/hole extent [%d/%d/%llu/%llx]\n",
+				inode->i_ino, es->es_lblk, es->es_len,
+				ext4_es_pblock(es), ext4_es_status(es));
+			return;
+		} else if (ext4_es_is_written(es)) {
+			if (retval != es->es_len) {
+				pr_warn("ES insert assertation failed for "
+					"inode: %lu retval %d != es_len %d\n",
+					inode->i_ino, retval, es->es_len);
+				return;
+			}
+			if (map.m_pblk != ext4_es_pblock(es)) {
+				pr_warn("ES insert assertation failed for "
+					"inode: %lu m_pblk %llu != "
+					"es_pblk %llu\n",
+					inode->i_ino, map.m_pblk,
+					ext4_es_pblock(es));
+				return;
+			}
+		} else {
+			/*
+			 * We don't need to check unwritten extent because
+			 * indirect-based file doesn't have it.
+			 */
+			BUG_ON(1);
+		}
+	} else if (retval == 0) {
+		if (ext4_es_is_written(es)) {
+			pr_warn("ES insert assertation failed for inode: %lu "
+				"We can't find the block but we want to add "
+				"an written extent [%d/%d/%llu/%llx]\n",
+				inode->i_ino, es->es_lblk, es->es_len,
+				ext4_es_pblock(es), ext4_es_status(es));
+			return;
+		}
+	}
+}
+
+static inline void ext4_es_insert_extent_check(struct inode *inode,
+					       struct extent_status *es)
+{
+	/*
+	 * We don't need to worry about the race condition because
+	 * caller takes i_data_sem locking.
+	 */
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		ext4_es_insert_extent_ext_check(inode, es);
+	else
+		ext4_es_insert_extent_ind_check(inode, es);
+}
+#else
+static inline void ext4_es_insert_extent_check(struct inode *inode,
+					       struct extent_status *es)
+{
+}
+#endif
+
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
+{
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
 	struct rb_node **p = &tree->root.rb_node;
 	struct rb_node *parent = NULL;
 	struct extent_status *es;
-	ext4_lblk_t end = offset + len - 1;
-
-	BUG_ON(end < offset);
-	es = tree->cache_es;
-	if (es && offset == (extent_status_end(es) + 1)) {
-		es_debug("cached by [%u/%u)\n", es->start, es->len);
-		es->len += len;
-		es = ext4_es_try_to_merge_right(tree, es);
-		goto out;
-	} else if (es && es->start == end + 1) {
-		es_debug("cached by [%u/%u)\n", es->start, es->len);
-		es->start = offset;
-		es->len += len;
-		es = ext4_es_try_to_merge_left(tree, es);
-		goto out;
-	} else if (es && es->start <= offset &&
-		   end <= extent_status_end(es)) {
-		es_debug("cached by [%u/%u)\n", es->start, es->len);
-		goto out;
-	}
 
 	while (*p) {
 		parent = *p;
 		es = rb_entry(parent, struct extent_status, rb_node);
 
-		if (offset < es->start) {
-			if (es->start == end + 1) {
-				es->start = offset;
-				es->len += len;
-				es = ext4_es_try_to_merge_left(tree, es);
+		if (newes->es_lblk < es->es_lblk) {
+			if (ext4_es_can_be_merged(newes, es)) {
+				/*
+				 * Here we can modify es_lblk directly
+				 * because it isn't overlapped.
+				 */
+				es->es_lblk = newes->es_lblk;
+				es->es_len += newes->es_len;
+				if (ext4_es_is_written(es) ||
+				    ext4_es_is_unwritten(es))
+					ext4_es_store_pblock(es,
+							     newes->es_pblk);
+				es = ext4_es_try_to_merge_left(inode, es);
 				goto out;
 			}
 			p = &(*p)->rb_left;
-		} else if (offset > extent_status_end(es)) {
-			if (offset == extent_status_end(es) + 1) {
-				es->len += len;
-				es = ext4_es_try_to_merge_right(tree, es);
+		} else if (newes->es_lblk > ext4_es_end(es)) {
+			if (ext4_es_can_be_merged(es, newes)) {
+				es->es_len += newes->es_len;
+				es = ext4_es_try_to_merge_right(inode, es);
 				goto out;
 			}
 			p = &(*p)->rb_right;
 		} else {
-			if (extent_status_end(es) <= end)
-				es->len = offset - es->start + len;
-			goto out;
+			BUG_ON(1);
+			return -EINVAL;
 		}
 	}
 
-	es = ext4_es_alloc_extent(offset, len);
+	es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
+				  newes->es_pblk);
 	if (!es)
 		return -ENOMEM;
 	rb_link_node(&es->rb_node, parent, p);
@@ -385,27 +632,44 @@ out:
 }
 
 /*
- * ext4_es_insert_extent() adds a space to a delayed extent tree.
- * Caller holds inode->i_es_lock.
+ * ext4_es_insert_extent() adds a space to a extent status tree.
  *
  * ext4_es_insert_extent is called by ext4_da_write_begin and
  * ext4_es_remove_extent.
  *
  * Return 0 on success, error code on failure.
  */
-int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
-			  ext4_lblk_t len)
+int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+			  ext4_lblk_t len, ext4_fsblk_t pblk,
+			  unsigned long long status)
 {
-	struct ext4_es_tree *tree;
+	struct extent_status newes;
+	ext4_lblk_t end = lblk + len - 1;
 	int err = 0;
 
-	trace_ext4_es_insert_extent(inode, offset, len);
-	es_debug("add [%u/%u) to extent status tree of inode %lu\n",
-		 offset, len, inode->i_ino);
+	es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
+		 lblk, len, pblk, status, inode->i_ino);
+
+	if (!len)
+		return 0;
+
+	BUG_ON(end < lblk);
+
+	newes.es_lblk = lblk;
+	newes.es_len = len;
+	ext4_es_store_pblock(&newes, pblk);
+	ext4_es_store_status(&newes, status);
+	trace_ext4_es_insert_extent(inode, &newes);
+
+	ext4_es_insert_extent_check(inode, &newes);
 
 	write_lock(&EXT4_I(inode)->i_es_lock);
-	tree = &EXT4_I(inode)->i_es_tree;
-	err = __es_insert_extent(tree, offset, len);
+	err = __es_remove_extent(inode, lblk, end);
+	if (err != 0)
+		goto error;
+	err = __es_insert_extent(inode, &newes);
+
+error:
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 
 	ext4_es_print_tree(inode);
@@ -414,56 +678,120 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
 }
 
 /*
- * ext4_es_remove_extent() removes a space from a delayed extent tree.
- * Caller holds inode->i_es_lock.
+ * ext4_es_lookup_extent() looks up an extent in extent status tree.
  *
- * Return 0 on success, error code on failure.
+ * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
+ *
+ * Return: 1 on found, 0 on not
  */
-int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
-			  ext4_lblk_t len)
+int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+			  struct extent_status *es)
 {
-	struct rb_node *node;
 	struct ext4_es_tree *tree;
+	struct extent_status *es1 = NULL;
+	struct rb_node *node;
+	int found = 0;
+
+	trace_ext4_es_lookup_extent_enter(inode, lblk);
+	es_debug("lookup extent in block %u\n", lblk);
+
+	tree = &EXT4_I(inode)->i_es_tree;
+	read_lock(&EXT4_I(inode)->i_es_lock);
+
+	/* find extent in cache firstly */
+	es->es_lblk = es->es_len = es->es_pblk = 0;
+	if (tree->cache_es) {
+		es1 = tree->cache_es;
+		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
+			es_debug("%u cached by [%u/%u)\n",
+				 lblk, es1->es_lblk, es1->es_len);
+			found = 1;
+			goto out;
+		}
+	}
+
+	node = tree->root.rb_node;
+	while (node) {
+		es1 = rb_entry(node, struct extent_status, rb_node);
+		if (lblk < es1->es_lblk)
+			node = node->rb_left;
+		else if (lblk > ext4_es_end(es1))
+			node = node->rb_right;
+		else {
+			found = 1;
+			break;
+		}
+	}
+
+out:
+	if (found) {
+		BUG_ON(!es1);
+		es->es_lblk = es1->es_lblk;
+		es->es_len = es1->es_len;
+		es->es_pblk = es1->es_pblk;
+	}
+
+	read_unlock(&EXT4_I(inode)->i_es_lock);
+
+	trace_ext4_es_lookup_extent_exit(inode, es, found);
+	return found;
+}
+
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+			      ext4_lblk_t end)
+{
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+	struct rb_node *node;
 	struct extent_status *es;
 	struct extent_status orig_es;
-	ext4_lblk_t len1, len2, end;
+	ext4_lblk_t len1, len2;
+	ext4_fsblk_t block;
 	int err = 0;
 
-	trace_ext4_es_remove_extent(inode, offset, len);
-	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
-		 offset, len, inode->i_ino);
-
-	end = offset + len - 1;
-	BUG_ON(end < offset);
-	write_lock(&EXT4_I(inode)->i_es_lock);
-	tree = &EXT4_I(inode)->i_es_tree;
-	es = __es_tree_search(&tree->root, offset);
+	es = __es_tree_search(&tree->root, lblk);
 	if (!es)
 		goto out;
-	if (es->start > end)
+	if (es->es_lblk > end)
 		goto out;
 
 	/* Simply invalidate cache_es. */
 	tree->cache_es = NULL;
 
-	orig_es.start = es->start;
-	orig_es.len = es->len;
-	len1 = offset > es->start ? offset - es->start : 0;
-	len2 = extent_status_end(es) > end ?
-	       extent_status_end(es) - end : 0;
+	orig_es.es_lblk = es->es_lblk;
+	orig_es.es_len = es->es_len;
+	orig_es.es_pblk = es->es_pblk;
+
+	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
+	len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
 	if (len1 > 0)
-		es->len = len1;
+		es->es_len = len1;
 	if (len2 > 0) {
 		if (len1 > 0) {
-			err = __es_insert_extent(tree, end + 1, len2);
+			struct extent_status newes;
+
+			newes.es_lblk = end + 1;
+			newes.es_len = len2;
+			if (ext4_es_is_written(&orig_es) ||
+			    ext4_es_is_unwritten(&orig_es)) {
+				block = ext4_es_pblock(&orig_es) +
+					orig_es.es_len - len2;
+				ext4_es_store_pblock(&newes, block);
+			}
+			ext4_es_store_status(&newes, ext4_es_status(&orig_es));
+			err = __es_insert_extent(inode, &newes);
 			if (err) {
-				es->start = orig_es.start;
-				es->len = orig_es.len;
+				es->es_lblk = orig_es.es_lblk;
+				es->es_len = orig_es.es_len;
 				goto out;
 			}
 		} else {
-			es->start = end + 1;
-			es->len = len2;
+			es->es_lblk = end + 1;
+			es->es_len = len2;
+			if (ext4_es_is_written(es) ||
+			    ext4_es_is_unwritten(es)) {
+				block = orig_es.es_pblk + orig_es.es_len - len2;
+				ext4_es_store_pblock(es, block);
+			}
 		}
 		goto out;
 	}
@@ -476,10 +804,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
 			es = NULL;
 	}
 
-	while (es && extent_status_end(es) <= end) {
+	while (es && ext4_es_end(es) <= end) {
 		node = rb_next(&es->rb_node);
 		rb_erase(&es->rb_node, &tree->root);
-		ext4_es_free_extent(es);
+		ext4_es_free_extent(inode, es);
 		if (!node) {
 			es = NULL;
 			break;
@@ -487,14 +815,220 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
 		es = rb_entry(node, struct extent_status, rb_node);
 	}
 
-	if (es && es->start < end + 1) {
-		len1 = extent_status_end(es) - end;
-		es->start = end + 1;
-		es->len = len1;
+	if (es && es->es_lblk < end + 1) {
+		ext4_lblk_t orig_len = es->es_len;
+
+		len1 = ext4_es_end(es) - end;
+		es->es_lblk = end + 1;
+		es->es_len = len1;
+		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
+			block = es->es_pblk + orig_len - len1;
+			ext4_es_store_pblock(es, block);
+		}
 	}
 
 out:
+	return err;
+}
+
+/*
+ * ext4_es_remove_extent() removes a space from a extent status tree.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+			  ext4_lblk_t len)
+{
+	ext4_lblk_t end;
+	int err = 0;
+
+	trace_ext4_es_remove_extent(inode, lblk, len);
+	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
+		 lblk, len, inode->i_ino);
+
+	if (!len)
+		return err;
+
+	end = lblk + len - 1;
+	BUG_ON(end < lblk);
+
+	write_lock(&EXT4_I(inode)->i_es_lock);
+	err = __es_remove_extent(inode, lblk, end);
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 	ext4_es_print_tree(inode);
 	return err;
 }
+
+int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
+{
+	ext4_lblk_t  ee_block;
+	ext4_fsblk_t ee_pblock;
+	unsigned int ee_len;
+
+	ee_block  = le32_to_cpu(ex->ee_block);
+	ee_len    = ext4_ext_get_actual_len(ex);
+	ee_pblock = ext4_ext_pblock(ex);
+
+	if (ee_len == 0)
+		return 0;
+
+	return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
+				     EXTENT_STATUS_WRITTEN);
+}
+
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+				     struct list_head *b)
+{
+	struct ext4_inode_info *eia, *eib;
+	eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+	eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+
+	if (eia->i_touch_when == eib->i_touch_when)
+		return 0;
+	if (time_after(eia->i_touch_when, eib->i_touch_when))
+		return 1;
+	else
+		return -1;
+}
+
+static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct ext4_sb_info *sbi = container_of(shrink,
+					struct ext4_sb_info, s_es_shrinker);
+	struct ext4_inode_info *ei;
+	struct list_head *cur, *tmp;
+	LIST_HEAD(skiped);
+	int nr_to_scan = sc->nr_to_scan;
+	int ret, nr_shrunk = 0;
+
+	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+	trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+
+	if (!nr_to_scan)
+		return ret;
+
+	spin_lock(&sbi->s_es_lru_lock);
+
+	/*
+	 * If the inode that is at the head of LRU list is newer than
+	 * last_sorted time, that means that we need to sort this list.
+	 */
+	ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
+	if (sbi->s_es_last_sorted < ei->i_touch_when) {
+		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+		sbi->s_es_last_sorted = jiffies;
+	}
+
+	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
+		/*
+		 * If we have already reclaimed all extents from extent
+		 * status tree, just stop the loop immediately.
+		 */
+		if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+			break;
+
+		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+
+		/* Skip the inode that is newer than the last_sorted time */
+		if (sbi->s_es_last_sorted < ei->i_touch_when) {
+			list_move_tail(cur, &skiped);
+			continue;
+		}
+
+		if (ei->i_es_lru_nr == 0)
+			continue;
+
+		write_lock(&ei->i_es_lock);
+		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+		if (ei->i_es_lru_nr == 0)
+			list_del_init(&ei->i_es_lru);
+		write_unlock(&ei->i_es_lock);
+
+		nr_shrunk += ret;
+		nr_to_scan -= ret;
+		if (nr_to_scan == 0)
+			break;
+	}
+
+	/* Move the newer inodes into the tail of the LRU list. */
+	list_splice_tail(&skiped, &sbi->s_es_lru);
+	spin_unlock(&sbi->s_es_lru_lock);
+
+	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+	trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
+	return ret;
+}
+
+void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
+{
+	INIT_LIST_HEAD(&sbi->s_es_lru);
+	spin_lock_init(&sbi->s_es_lru_lock);
+	sbi->s_es_last_sorted = 0;
+	sbi->s_es_shrinker.shrink = ext4_es_shrink;
+	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
+	register_shrinker(&sbi->s_es_shrinker);
+}
+
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
+{
+	unregister_shrinker(&sbi->s_es_shrinker);
+}
+
+void ext4_es_lru_add(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+	ei->i_touch_when = jiffies;
+
+	if (!list_empty(&ei->i_es_lru))
+		return;
+
+	spin_lock(&sbi->s_es_lru_lock);
+	if (list_empty(&ei->i_es_lru))
+		list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
+	spin_unlock(&sbi->s_es_lru_lock);
+}
+
+void ext4_es_lru_del(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+	spin_lock(&sbi->s_es_lru_lock);
+	if (!list_empty(&ei->i_es_lru))
+		list_del_init(&ei->i_es_lru);
+	spin_unlock(&sbi->s_es_lru_lock);
+}
+
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+				       int nr_to_scan)
+{
+	struct inode *inode = &ei->vfs_inode;
+	struct ext4_es_tree *tree = &ei->i_es_tree;
+	struct rb_node *node;
+	struct extent_status *es;
+	int nr_shrunk = 0;
+
+	if (ei->i_es_lru_nr == 0)
+		return 0;
+
+	node = rb_first(&tree->root);
+	while (node != NULL) {
+		es = rb_entry(node, struct extent_status, rb_node);
+		node = rb_next(&es->rb_node);
+		/*
+		 * We can't reclaim delayed extent from status tree because
+		 * fiemap, bigallic, and seek_data/hole need to use it.
+		 */
+		if (!ext4_es_is_delayed(es)) {
+			rb_erase(&es->rb_node, &tree->root);
+			ext4_es_free_extent(inode, es);
+			nr_shrunk++;
+			if (--nr_to_scan == 0)
+				break;
+		}
+	}
+	tree->cache_es = NULL;
+	return nr_shrunk;
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 077f82db092..e936730cc5b 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -20,10 +20,33 @@
 #define es_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
+/*
+ * With ES_AGGRESSIVE_TEST defined, the result of es caching will be
+ * checked with old map_block's result.
+ */
+#define ES_AGGRESSIVE_TEST__
+
+/*
+ * These flags live in the high bits of extent_status.es_pblk
+ */
+#define EXTENT_STATUS_WRITTEN	(1ULL << 63)
+#define EXTENT_STATUS_UNWRITTEN (1ULL << 62)
+#define EXTENT_STATUS_DELAYED	(1ULL << 61)
+#define EXTENT_STATUS_HOLE	(1ULL << 60)
+
+#define EXTENT_STATUS_FLAGS	(EXTENT_STATUS_WRITTEN | \
+				 EXTENT_STATUS_UNWRITTEN | \
+				 EXTENT_STATUS_DELAYED | \
+				 EXTENT_STATUS_HOLE)
+
+struct ext4_sb_info;
+struct ext4_extent;
+
 struct extent_status {
 	struct rb_node rb_node;
-	ext4_lblk_t start;	/* first block extent covers */
-	ext4_lblk_t len;	/* length of extent in block */
+	ext4_lblk_t es_lblk;	/* first logical block extent covers */
+	ext4_lblk_t es_len;	/* length of extent in block */
+	ext4_fsblk_t es_pblk;	/* first physical block */
 };
 
 struct ext4_es_tree {
@@ -35,11 +58,71 @@ extern int __init ext4_init_es(void);
 extern void ext4_exit_es(void);
 extern void ext4_es_init_tree(struct ext4_es_tree *tree);
 
-extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start,
+extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+				 ext4_lblk_t len, ext4_fsblk_t pblk,
+				 unsigned long long status);
+extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 				 ext4_lblk_t len);
-extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start,
-				 ext4_lblk_t len);
-extern ext4_lblk_t ext4_es_find_extent(struct inode *inode,
-				struct extent_status *es);
+extern void ext4_es_find_delayed_extent_range(struct inode *inode,
+					ext4_lblk_t lblk, ext4_lblk_t end,
+					struct extent_status *es);
+extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+				 struct extent_status *es);
+extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex);
+
+static inline int ext4_es_is_written(struct extent_status *es)
+{
+	return (es->es_pblk & EXTENT_STATUS_WRITTEN) != 0;
+}
+
+static inline int ext4_es_is_unwritten(struct extent_status *es)
+{
+	return (es->es_pblk & EXTENT_STATUS_UNWRITTEN) != 0;
+}
+
+static inline int ext4_es_is_delayed(struct extent_status *es)
+{
+	return (es->es_pblk & EXTENT_STATUS_DELAYED) != 0;
+}
+
+static inline int ext4_es_is_hole(struct extent_status *es)
+{
+	return (es->es_pblk & EXTENT_STATUS_HOLE) != 0;
+}
+
+static inline ext4_fsblk_t ext4_es_status(struct extent_status *es)
+{
+	return (es->es_pblk & EXTENT_STATUS_FLAGS);
+}
+
+static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
+{
+	return (es->es_pblk & ~EXTENT_STATUS_FLAGS);
+}
+
+static inline void ext4_es_store_pblock(struct extent_status *es,
+					ext4_fsblk_t pb)
+{
+	ext4_fsblk_t block;
+
+	block = (pb & ~EXTENT_STATUS_FLAGS) |
+		(es->es_pblk & EXTENT_STATUS_FLAGS);
+	es->es_pblk = block;
+}
+
+static inline void ext4_es_store_status(struct extent_status *es,
+					unsigned long long status)
+{
+	ext4_fsblk_t block;
+
+	block = (status & EXTENT_STATUS_FLAGS) |
+		(es->es_pblk & ~EXTENT_STATUS_FLAGS);
+	es->es_pblk = block;
+}
+
+extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_lru_add(struct inode *inode);
+extern void ext4_es_lru_del(struct inode *inode);
 
 #endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 405565a6227..6f4cc567c38 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,6 +23,7 @@
 #include <linux/jbd2.h>
 #include <linux/mount.h>
 #include <linux/path.h>
+#include <linux/aio.h>
 #include <linux/quotaops.h>
 #include <linux/pagevec.h>
 #include "ext4.h"
@@ -167,7 +168,7 @@ static ssize_t
 ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
-	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
 
 	/*
@@ -240,7 +241,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 			handle_t *handle;
 			int err;
 
-			handle = ext4_journal_start_sb(sb, 1);
+			handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
 			if (IS_ERR(handle))
 				return PTR_ERR(handle);
 			err = ext4_journal_get_write_access(handle, sbi->s_sbh);
@@ -311,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 	blkbits = inode->i_sb->s_blocksize_bits;
 	startoff = *offset;
 	lastoff = startoff;
-	endoff = (map->m_lblk + map->m_len) << blkbits;
+	endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
 
 	index = startoff >> PAGE_CACHE_SHIFT;
 	end = endoff >> PAGE_CACHE_SHIFT;
@@ -456,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
 		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
 			if (last != start)
-				dataoff = last << blkbits;
+				dataoff = (loff_t)last << blkbits;
 			break;
 		}
 
@@ -464,12 +465,10 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		 * If there is a delay extent at this offset,
 		 * it will be as a data.
 		 */
-		es.start = last;
-		(void)ext4_es_find_extent(inode, &es);
-		if (last >= es.start &&
-		    last < es.start + es.len) {
+		ext4_es_find_delayed_extent_range(inode, last, last, &es);
+		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
 			if (last != start)
-				dataoff = last << blkbits;
+				dataoff = (loff_t)last << blkbits;
 			break;
 		}
 
@@ -487,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		}
 
 		last++;
-		dataoff = last << blkbits;
+		dataoff = (loff_t)last << blkbits;
 	} while (last <= end);
 
 	mutex_unlock(&inode->i_mutex);
@@ -495,17 +494,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 	if (dataoff > isize)
 		return -ENXIO;
 
-	if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-		return -EINVAL;
-	if (dataoff > maxsize)
-		return -EINVAL;
-
-	if (dataoff != file->f_pos) {
-		file->f_pos = dataoff;
-		file->f_version = 0;
-	}
-
-	return dataoff;
+	return vfs_setpos(file, dataoff, maxsize);
 }
 
 /*
@@ -541,7 +530,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
 		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
 			last += ret;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 
@@ -549,12 +538,10 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 		 * If there is a delay extent at this offset,
 		 * we will skip this extent.
 		 */
-		es.start = last;
-		(void)ext4_es_find_extent(inode, &es);
-		if (last >= es.start &&
-		    last < es.start + es.len) {
-			last = es.start + es.len;
-			holeoff = last << blkbits;
+		ext4_es_find_delayed_extent_range(inode, last, last, &es);
+		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
+			last = es.es_lblk + es.es_len;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 
@@ -569,7 +556,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 							      &map, &holeoff);
 			if (!unwritten) {
 				last += ret;
-				holeoff = last << blkbits;
+				holeoff = (loff_t)last << blkbits;
 				continue;
 			}
 		}
@@ -583,17 +570,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 	if (holeoff > isize)
 		holeoff = isize;
 
-	if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-		return -EINVAL;
-	if (holeoff > maxsize)
-		return -EINVAL;
-
-	if (holeoff != file->f_pos) {
-		file->f_pos = holeoff;
-		file->f_version = 0;
-	}
-
-	return holeoff;
+	return vfs_setpos(file, holeoff, maxsize);
 }
 
 /*
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 3278e64e57b..a8bc47f75fa 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode)
 	return ret;
 }
 
-/**
- * __sync_file - generic_file_fsync without the locking and filemap_write
- * @inode:	inode to sync
- * @datasync:	only sync essential metadata if true
- *
- * This is just generic_file_fsync without the locking.  This is needed for
- * nojournal mode to make sure this inodes data/metadata makes it to disk
- * properly.  The i_mutex should be held already.
- */
-static int __sync_inode(struct inode *inode, int datasync)
-{
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = sync_inode_metadata(inode, 1);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
-
 /*
  * akpm: A new design for ext4_sync_file().
  *
@@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-	int ret, err;
+	int ret = 0, err;
 	tid_t commit_tid;
 	bool needs_barrier = false;
 
@@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	trace_ext4_sync_file_enter(file, datasync);
 
-	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (ret)
-		return ret;
-	mutex_lock(&inode->i_mutex);
-
-	if (inode->i_sb->s_flags & MS_RDONLY)
-		goto out;
-
-	ret = ext4_flush_unwritten_io(inode);
-	if (ret < 0)
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read updated s_mount_flags value */
+		smp_rmb();
+		if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			ret = -EROFS;
 		goto out;
+	}
 
 	if (!journal) {
-		ret = __sync_inode(inode, datasync);
+		ret = generic_file_fsync(file, start, end, datasync);
 		if (!ret && !hlist_empty(&inode->i_dentry))
 			ret = ext4_sync_parent(inode);
 		goto out;
 	}
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
 	/*
 	 * data=writeback,ordered:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
@@ -166,15 +139,13 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	if (journal->j_flags & JBD2_BARRIER &&
 	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
 		needs_barrier = true;
-	jbd2_log_start_commit(journal, commit_tid);
-	ret = jbd2_log_wait_commit(journal, commit_tid);
+	ret = jbd2_complete_transaction(journal, commit_tid);
 	if (needs_barrier) {
 		err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 		if (!ret)
 			ret = err;
 	}
- out:
-	mutex_unlock(&inode->i_mutex);
+out:
 	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index fa8e4911d35..3d586f02883 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -155,11 +155,11 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 	/* Check to see if the seed is all zero's */
 	if (hinfo->seed) {
 		for (i = 0; i < 4; i++) {
-			if (hinfo->seed[i])
+			if (hinfo->seed[i]) {
+				memcpy(buf, hinfo->seed, sizeof(buf));
 				break;
+			}
 		}
-		if (i < 4)
-			memcpy(buf, hinfo->seed, sizeof(buf));
 	}
 
 	switch (hinfo->hash_version) {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3f32c801244..f03598c6ffd 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	trace_ext4_load_inode_bitmap(sb, block_group);
 	bh->b_end_io = ext4_end_bitmap_read;
 	get_bh(bh);
-	submit_bh(READ, bh);
+	submit_bh(READ | REQ_META | REQ_PRIO, bh);
 	wait_on_buffer(bh);
 	if (!buffer_uptodate(bh)) {
 		put_bh(bh);
@@ -324,8 +324,8 @@ error_return:
 }
 
 struct orlov_stats {
+	__u64 free_clusters;
 	__u32 free_inodes;
-	__u32 free_clusters;
 	__u32 used_dirs;
 };
 
@@ -342,7 +342,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
 
 	if (flex_size > 1) {
 		stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
-		stats->free_clusters = atomic_read(&flex_group[g].free_clusters);
+		stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
 		stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
 		return;
 	}
@@ -634,8 +634,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
  * For other inodes, search forward from the parent directory's block
  * group to find a free inode.
  */
-struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
-			     const struct qstr *qstr, __u32 goal, uid_t *owner)
+struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
+			       umode_t mode, const struct qstr *qstr,
+			       __u32 goal, uid_t *owner, int handle_type,
+			       unsigned int line_no, int nblocks)
 {
 	struct super_block *sb;
 	struct buffer_head *inode_bitmap_bh = NULL;
@@ -664,6 +666,23 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
 	ei = EXT4_I(inode);
 	sbi = EXT4_SB(sb);
 
+	/*
+	 * Initalize owners and quota early so that we don't have to account
+	 * for quota initialization worst case in standard inode creating
+	 * transaction
+	 */
+	if (owner) {
+		inode->i_mode = mode;
+		i_uid_write(inode, owner[0]);
+		i_gid_write(inode, owner[1]);
+	} else if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = dir->i_gid;
+	} else
+		inode_init_owner(inode, dir, mode);
+	dquot_initialize(inode);
+
 	if (!goal)
 		goal = sbi->s_inode_goal;
 
@@ -695,7 +714,7 @@ got_group:
 
 		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
 		if (!gdp)
-			goto fail;
+			goto out;
 
 		/*
 		 * Check free inodes count before loading bitmap.
@@ -709,7 +728,7 @@ got_group:
 		brelse(inode_bitmap_bh);
 		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
 		if (!inode_bitmap_bh)
-			goto fail;
+			goto out;
 
 repeat_in_this_group:
 		ino = ext4_find_next_zero_bit((unsigned long *)
@@ -725,10 +744,23 @@ repeat_in_this_group:
 				   "inode=%lu", ino + 1);
 			continue;
 		}
+		if (!handle) {
+			BUG_ON(nblocks <= 0);
+			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
+							 handle_type, nblocks,
+							 0);
+			if (IS_ERR(handle)) {
+				err = PTR_ERR(handle);
+				ext4_std_error(sb, err);
+				goto out;
+			}
+		}
 		BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
-		if (err)
-			goto fail;
+		if (err) {
+			ext4_std_error(sb, err);
+			goto out;
+		}
 		ext4_lock_group(sb, group);
 		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
 		ext4_unlock_group(sb, group);
@@ -744,8 +776,10 @@ repeat_in_this_group:
 got:
 	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
-	if (err)
-		goto fail;
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
 
 	/* We may have to initialize the block bitmap if it isn't already */
 	if (ext4_has_group_desc_csum(sb) &&
@@ -757,7 +791,8 @@ got:
 		err = ext4_journal_get_write_access(handle, block_bitmap_bh);
 		if (err) {
 			brelse(block_bitmap_bh);
-			goto fail;
+			ext4_std_error(sb, err);
+			goto out;
 		}
 
 		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
@@ -776,14 +811,18 @@ got:
 		ext4_unlock_group(sb, group);
 		brelse(block_bitmap_bh);
 
-		if (err)
-			goto fail;
+		if (err) {
+			ext4_std_error(sb, err);
+			goto out;
+		}
 	}
 
 	BUFFER_TRACE(group_desc_bh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, group_desc_bh);
-	if (err)
-		goto fail;
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
 
 	/* Update the relevant bg descriptor fields */
 	if (ext4_has_group_desc_csum(sb)) {
@@ -829,8 +868,10 @@ got:
 
 	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
-	if (err)
-		goto fail;
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
 
 	percpu_counter_dec(&sbi->s_freeinodes_counter);
 	if (S_ISDIR(mode))
@@ -840,16 +881,6 @@ got:
 		flex_group = ext4_flex_group(sbi, group);
 		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
 	}
-	if (owner) {
-		inode->i_mode = mode;
-		i_uid_write(inode, owner[0]);
-		i_gid_write(inode, owner[1]);
-	} else if (test_opt(sb, GRPID)) {
-		inode->i_mode = mode;
-		inode->i_uid = current_fsuid();
-		inode->i_gid = dir->i_gid;
-	} else
-		inode_init_owner(inode, dir, mode);
 
 	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
 	/* This is the optimal IO size (for stat), not the fs block size */
@@ -878,7 +909,9 @@ got:
 		 * twice.
 		 */
 		err = -EIO;
-		goto fail;
+		ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
+			   inode->i_ino);
+		goto out;
 	}
 	spin_lock(&sbi->s_next_gen_lock);
 	inode->i_generation = sbi->s_next_generation++;
@@ -888,7 +921,6 @@ got:
 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
 		__u32 csum;
-		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 		__le32 inum = cpu_to_le32(inode->i_ino);
 		__le32 gen = cpu_to_le32(inode->i_generation);
 		csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
@@ -907,7 +939,6 @@ got:
 		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
 
 	ret = inode;
-	dquot_initialize(inode);
 	err = dquot_alloc_inode(inode);
 	if (err)
 		goto fail_drop;
@@ -941,24 +972,17 @@ got:
 
 	ext4_debug("allocating inode %lu\n", inode->i_ino);
 	trace_ext4_allocate_inode(inode, dir, mode);
-	goto really_out;
-fail:
-	ext4_std_error(sb, err);
-out:
-	iput(inode);
-	ret = ERR_PTR(err);
-really_out:
 	brelse(inode_bitmap_bh);
 	return ret;
 
 fail_free_drop:
 	dquot_free_inode(inode);
-
 fail_drop:
-	dquot_drop(inode);
-	inode->i_flags |= S_NOQUOTA;
 	clear_nlink(inode);
 	unlock_new_inode(inode);
+out:
+	dquot_drop(inode);
+	inode->i_flags |= S_NOQUOTA;
 	iput(inode);
 	brelse(inode_bitmap_bh);
 	return ERR_PTR(err);
@@ -1017,17 +1041,17 @@ iget_failed:
 	inode = NULL;
 bad_orphan:
 	ext4_warning(sb, "bad orphan inode %lu!  e2fsck was run?", ino);
-	printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+	printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n",
 	       bit, (unsigned long long)bitmap_bh->b_blocknr,
 	       ext4_test_bit(bit, bitmap_bh->b_data));
-	printk(KERN_NOTICE "inode=%p\n", inode);
+	printk(KERN_WARNING "inode=%p\n", inode);
 	if (inode) {
-		printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+		printk(KERN_WARNING "is_bad_inode(inode)=%d\n",
 		       is_bad_inode(inode));
-		printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
+		printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n",
 		       NEXT_ORPHAN(inode));
-		printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
-		printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
+		printk(KERN_WARNING "max_ino=%lu\n", max_ino);
+		printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink);
 		/* Avoid freeing blocks if we got a bad deleted inode */
 		if (inode->i_nlink == 0)
 			inode->i_blocks = 0;
@@ -1137,7 +1161,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
 	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
 		goto out;
 
-	handle = ext4_journal_start_sb(sb, 1);
+	handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		goto out;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 20862f96e8a..87b30cd357e 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -20,6 +20,7 @@
  *	(sct@redhat.com), 1993, 1998
  */
 
+#include <linux/aio.h>
 #include "ext4_jbd2.h"
 #include "truncate.h"
 #include "ext4_extents.h"	/* Needed for EXT_MAX_BLOCKS */
@@ -146,6 +147,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 	struct super_block *sb = inode->i_sb;
 	Indirect *p = chain;
 	struct buffer_head *bh;
+	int ret = -EIO;
 
 	*err = 0;
 	/* i_data is not going away, no lock needed */
@@ -154,8 +156,10 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 		goto no_block;
 	while (--depth) {
 		bh = sb_getblk(sb, le32_to_cpu(p->key));
-		if (unlikely(!bh))
+		if (unlikely(!bh)) {
+			ret = -ENOMEM;
 			goto failure;
+		}
 
 		if (!bh_uptodate_or_lock(bh)) {
 			if (bh_submit_read(bh) < 0) {
@@ -177,7 +181,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 	return NULL;
 
 failure:
-	*err = -EIO;
+	*err = ret;
 no_block:
 	return p;
 }
@@ -289,132 +293,6 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
 }
 
 /**
- *	ext4_alloc_blocks: multiple allocate blocks needed for a branch
- *	@handle: handle for this transaction
- *	@inode: inode which needs allocated blocks
- *	@iblock: the logical block to start allocated at
- *	@goal: preferred physical block of allocation
- *	@indirect_blks: the number of blocks need to allocate for indirect
- *			blocks
- *	@blks: number of desired blocks
- *	@new_blocks: on return it will store the new block numbers for
- *	the indirect blocks(if needed) and the first direct block,
- *	@err: on return it will store the error code
- *
- *	This function will return the number of blocks allocated as
- *	requested by the passed-in parameters.
- */
-static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
-			     ext4_lblk_t iblock, ext4_fsblk_t goal,
-			     int indirect_blks, int blks,
-			     ext4_fsblk_t new_blocks[4], int *err)
-{
-	struct ext4_allocation_request ar;
-	int target, i;
-	unsigned long count = 0, blk_allocated = 0;
-	int index = 0;
-	ext4_fsblk_t current_block = 0;
-	int ret = 0;
-
-	/*
-	 * Here we try to allocate the requested multiple blocks at once,
-	 * on a best-effort basis.
-	 * To build a branch, we should allocate blocks for
-	 * the indirect blocks(if not allocated yet), and at least
-	 * the first direct block of this branch.  That's the
-	 * minimum number of blocks need to allocate(required)
-	 */
-	/* first we try to allocate the indirect blocks */
-	target = indirect_blks;
-	while (target > 0) {
-		count = target;
-		/* allocating blocks for indirect blocks and direct blocks */
-		current_block = ext4_new_meta_blocks(handle, inode, goal,
-						     0, &count, err);
-		if (*err)
-			goto failed_out;
-
-		if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
-			EXT4_ERROR_INODE(inode,
-					 "current_block %llu + count %lu > %d!",
-					 current_block, count,
-					 EXT4_MAX_BLOCK_FILE_PHYS);
-			*err = -EIO;
-			goto failed_out;
-		}
-
-		target -= count;
-		/* allocate blocks for indirect blocks */
-		while (index < indirect_blks && count) {
-			new_blocks[index++] = current_block++;
-			count--;
-		}
-		if (count > 0) {
-			/*
-			 * save the new block number
-			 * for the first direct block
-			 */
-			new_blocks[index] = current_block;
-			printk(KERN_INFO "%s returned more blocks than "
-						"requested\n", __func__);
-			WARN_ON(1);
-			break;
-		}
-	}
-
-	target = blks - count ;
-	blk_allocated = count;
-	if (!target)
-		goto allocated;
-	/* Now allocate data blocks */
-	memset(&ar, 0, sizeof(ar));
-	ar.inode = inode;
-	ar.goal = goal;
-	ar.len = target;
-	ar.logical = iblock;
-	if (S_ISREG(inode->i_mode))
-		/* enable in-core preallocation only for regular files */
-		ar.flags = EXT4_MB_HINT_DATA;
-
-	current_block = ext4_mb_new_blocks(handle, &ar, err);
-	if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
-		EXT4_ERROR_INODE(inode,
-				 "current_block %llu + ar.len %d > %d!",
-				 current_block, ar.len,
-				 EXT4_MAX_BLOCK_FILE_PHYS);
-		*err = -EIO;
-		goto failed_out;
-	}
-
-	if (*err && (target == blks)) {
-		/*
-		 * if the allocation failed and we didn't allocate
-		 * any blocks before
-		 */
-		goto failed_out;
-	}
-	if (!*err) {
-		if (target == blks) {
-			/*
-			 * save the new block number
-			 * for the first direct block
-			 */
-			new_blocks[index] = current_block;
-		}
-		blk_allocated += ar.len;
-	}
-allocated:
-	/* total number of blocks allocated for direct blocks */
-	ret = blk_allocated;
-	*err = 0;
-	return ret;
-failed_out:
-	for (i = 0; i < index; i++)
-		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
-	return ret;
-}
-
-/**
  *	ext4_alloc_branch - allocate and set up a chain of blocks.
  *	@handle: handle for this transaction
  *	@inode: owner
@@ -446,60 +324,59 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 			     int *blks, ext4_fsblk_t goal,
 			     ext4_lblk_t *offsets, Indirect *branch)
 {
-	int blocksize = inode->i_sb->s_blocksize;
-	int i, n = 0;
-	int err = 0;
-	struct buffer_head *bh;
-	int num;
-	ext4_fsblk_t new_blocks[4];
-	ext4_fsblk_t current_block;
-
-	num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
-				*blks, new_blocks, &err);
-	if (err)
-		return err;
+	struct ext4_allocation_request	ar;
+	struct buffer_head *		bh;
+	ext4_fsblk_t			b, new_blocks[4];
+	__le32				*p;
+	int				i, j, err, len = 1;
 
-	branch[0].key = cpu_to_le32(new_blocks[0]);
 	/*
-	 * metadata blocks and data blocks are allocated.
+	 * Set up for the direct block allocation
 	 */
-	for (n = 1; n <= indirect_blks;  n++) {
-		/*
-		 * Get buffer_head for parent block, zero it out
-		 * and set the pointer to new one, then send
-		 * parent to disk.
-		 */
-		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
-		if (unlikely(!bh)) {
-			err = -EIO;
+	memset(&ar, 0, sizeof(ar));
+	ar.inode = inode;
+	ar.len = *blks;
+	ar.logical = iblock;
+	if (S_ISREG(inode->i_mode))
+		ar.flags = EXT4_MB_HINT_DATA;
+
+	for (i = 0; i <= indirect_blks; i++) {
+		if (i == indirect_blks) {
+			ar.goal = goal;
+			new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err);
+		} else
+			goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode,
+							goal, 0, NULL, &err);
+		if (err) {
+			i--;
 			goto failed;
 		}
+		branch[i].key = cpu_to_le32(new_blocks[i]);
+		if (i == 0)
+			continue;
 
-		branch[n].bh = bh;
+		bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]);
+		if (unlikely(!bh)) {
+			err = -ENOMEM;
+			goto failed;
+		}
 		lock_buffer(bh);
 		BUFFER_TRACE(bh, "call get_create_access");
 		err = ext4_journal_get_create_access(handle, bh);
 		if (err) {
-			/* Don't brelse(bh) here; it's done in
-			 * ext4_journal_forget() below */
 			unlock_buffer(bh);
 			goto failed;
 		}
 
-		memset(bh->b_data, 0, blocksize);
-		branch[n].p = (__le32 *) bh->b_data + offsets[n];
-		branch[n].key = cpu_to_le32(new_blocks[n]);
-		*branch[n].p = branch[n].key;
-		if (n == indirect_blks) {
-			current_block = new_blocks[n];
-			/*
-			 * End of chain, update the last new metablock of
-			 * the chain to point to the new allocated
-			 * data blocks numbers
-			 */
-			for (i = 1; i < num; i++)
-				*(branch[n].p + i) = cpu_to_le32(++current_block);
-		}
+		memset(bh->b_data, 0, bh->b_size);
+		p = branch[i].p = (__le32 *) bh->b_data + offsets[i];
+		b = new_blocks[i];
+
+		if (i == indirect_blks)
+			len = ar.len;
+		for (j = 0; j < len; j++)
+			*p++ = cpu_to_le32(b++);
+
 		BUFFER_TRACE(bh, "marking uptodate");
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
@@ -509,25 +386,16 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 		if (err)
 			goto failed;
 	}
-	*blks = num;
-	return err;
+	*blks = ar.len;
+	return 0;
 failed:
-	/* Allocation failed, free what we already allocated */
-	ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
-	for (i = 1; i <= n ; i++) {
-		/*
-		 * branch[i].bh is newly allocated, so there is no
-		 * need to revoke the block, which is why we don't
-		 * need to set EXT4_FREE_BLOCKS_METADATA.
-		 */
-		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
-				 EXT4_FREE_BLOCKS_FORGET);
+	for (; i >= 0; i--) {
+		if (i != indirect_blks && branch[i].bh)
+			ext4_forget(handle, 1, inode, branch[i].bh,
+				    branch[i].bh->b_blocknr);
+		ext4_free_blocks(handle, inode, NULL, new_blocks[i],
+				 (i == indirect_blks) ? ar.len : 1, 0);
 	}
-	for (i = n+1; i < indirect_blks; i++)
-		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
-
-	ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
-
 	return err;
 }
 
@@ -756,7 +624,7 @@ cleanup:
 		partial--;
 	}
 out:
-	trace_ext4_ind_map_blocks_exit(inode, map, err);
+	trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
 	return err;
 }
 
@@ -789,7 +657,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 
 		if (final_size > inode->i_size) {
 			/* Credits for sb + inode write */
-			handle = ext4_journal_start(inode, 2);
+			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
 			if (IS_ERR(handle)) {
 				ret = PTR_ERR(handle);
 				goto out;
@@ -807,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 
 retry:
 	if (rw == READ && ext4_should_dioread_nolock(inode)) {
-		if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
-			mutex_lock(&inode->i_mutex);
-			ext4_flush_unwritten_io(inode);
-			mutex_unlock(&inode->i_mutex);
-		}
 		/*
 		 * Nolock dioread optimization may be dynamically disabled
 		 * via ext4_inode_block_unlocked_dio(). Check inode's state
@@ -849,7 +712,7 @@ locked:
 		int err;
 
 		/* Credits for sb + inode write */
-		handle = ext4_journal_start(inode, 2);
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
 		if (IS_ERR(handle)) {
 			/* This is really bad luck. We've written the data
 			 * but cannot extend i_size. Bail out and pretend
@@ -911,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
 	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 
-int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+/*
+ * Calculate number of indirect blocks touched by mapping @nrblocks logically
+ * contiguous blocks
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
 {
-	int indirects;
-
-	/* if nrblocks are contiguous */
-	if (chunk) {
-		/*
-		 * With N contiguous data blocks, we need at most
-		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
-		 * 2 dindirect blocks, and 1 tindirect block
-		 */
-		return DIV_ROUND_UP(nrblocks,
-				    EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
-	}
 	/*
-	 * if nrblocks are not contiguous, worse case, each block touch
-	 * a indirect block, and each indirect block touch a double indirect
-	 * block, plus a triple indirect block
+	 * With N contiguous data blocks, we need at most
+	 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+	 * 2 dindirect blocks, and 1 tindirect block
 	 */
-	indirects = nrblocks * 2 + 1;
-	return indirects;
+	return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
 }
 
 /*
@@ -939,25 +793,9 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  * be able to restart the transaction at a conventient checkpoint to make
  * sure we don't overflow the journal.
  *
- * start_transaction gets us a new handle for a truncate transaction,
- * and extend_transaction tries to extend the existing one a bit.  If
+ * Try to extend this transaction for the purposes of truncation.  If
  * extend fails, we need to propagate the failure up and restart the
  * transaction in the top-level truncate loop. --sct
- */
-static handle_t *start_transaction(struct inode *inode)
-{
-	handle_t *result;
-
-	result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode));
-	if (!IS_ERR(result))
-		return result;
-
-	ext4_std_error(inode->i_sb, PTR_ERR(result));
-	return result;
-}
-
-/*
- * Try to extend this transaction for the purposes of truncation.
  *
  * Returns 0 if we managed to create more room.  If we can't create more
  * room, and the transaction must be restarted we return 1.
@@ -1088,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 			     __le32 *last)
 {
 	__le32 *p;
-	int	flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+	int	flags = EXT4_FREE_BLOCKS_VALIDATED;
 	int	err;
 
 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA;
+		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
+	else if (ext4_should_journal_data(inode))
+		flags |= EXT4_FREE_BLOCKS_FORGET;
 
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
 				   count)) {
@@ -1350,68 +1190,30 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 	}
 }
 
-void ext4_ind_truncate(struct inode *inode)
+void ext4_ind_truncate(handle_t *handle, struct inode *inode)
 {
-	handle_t *handle;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	__le32 *i_data = ei->i_data;
 	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
-	struct address_space *mapping = inode->i_mapping;
 	ext4_lblk_t offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
 	__le32 nr = 0;
 	int n = 0;
 	ext4_lblk_t last_block, max_block;
-	loff_t page_len;
 	unsigned blocksize = inode->i_sb->s_blocksize;
-	int err;
-
-	handle = start_transaction(inode);
-	if (IS_ERR(handle))
-		return;		/* AKPM: return what? */
 
 	last_block = (inode->i_size + blocksize-1)
 					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
 	max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
 					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
 
-	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
-		page_len = PAGE_CACHE_SIZE -
-			(inode->i_size & (PAGE_CACHE_SIZE - 1));
-
-		err = ext4_discard_partial_page_buffers(handle,
-			mapping, inode->i_size, page_len, 0);
-
-		if (err)
-			goto out_stop;
-	}
-
 	if (last_block != max_block) {
 		n = ext4_block_to_path(inode, last_block, offsets, NULL);
 		if (n == 0)
-			goto out_stop;	/* error */
+			return;
 	}
 
-	/*
-	 * OK.  This truncate is going to happen.  We add the inode to the
-	 * orphan list, so that if this truncate spans multiple transactions,
-	 * and we crash, we will resume the truncate when the filesystem
-	 * recovers.  It also marks the inode dirty, to catch the new size.
-	 *
-	 * Implication: the file must always be in a sane, consistent
-	 * truncatable state while each transaction commits.
-	 */
-	if (ext4_orphan_add(handle, inode))
-		goto out_stop;
-
-	/*
-	 * From here we block out all ext4_get_block() callers who want to
-	 * modify the block allocation tree.
-	 */
-	down_write(&ei->i_data_sem);
-
-	ext4_discard_preallocations(inode);
 	ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
 
 	/*
@@ -1428,7 +1230,7 @@ void ext4_ind_truncate(struct inode *inode)
 		 * It is unnecessary to free any data blocks if last_block is
 		 * equal to the indirect block limit.
 		 */
-		goto out_unlock;
+		return;
 	} else if (n == 1) {		/* direct blocks */
 		ext4_free_data(handle, inode, NULL, i_data+offsets[0],
 			       i_data + EXT4_NDIR_BLOCKS);
@@ -1488,30 +1290,91 @@ do_indirects:
 	case EXT4_TIND_BLOCK:
 		;
 	}
+}
 
-out_unlock:
-	up_write(&ei->i_data_sem);
-	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
+static int free_hole_blocks(handle_t *handle, struct inode *inode,
+			    struct buffer_head *parent_bh, __le32 *i_data,
+			    int level, ext4_lblk_t first,
+			    ext4_lblk_t count, int max)
+{
+	struct buffer_head *bh = NULL;
+	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+	int ret = 0;
+	int i, inc;
+	ext4_lblk_t offset;
+	__le32 blk;
 
-	/*
-	 * In a multi-transaction truncate, we only make the final transaction
-	 * synchronous
-	 */
-	if (IS_SYNC(inode))
-		ext4_handle_sync(handle);
-out_stop:
-	/*
-	 * If this was a simple ftruncate(), and the file will remain alive
-	 * then we need to clear up the orphan record which we created above.
-	 * However, if this was a real unlink then we were called by
-	 * ext4_delete_inode(), and we allow that function to clean up the
-	 * orphan info for us.
-	 */
-	if (inode->i_nlink)
-		ext4_orphan_del(handle, inode);
+	inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level);
+	for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) {
+		if (offset >= count + first)
+			break;
+		if (*i_data == 0 || (offset + inc) <= first)
+			continue;
+		blk = *i_data;
+		if (level > 0) {
+			ext4_lblk_t first2;
+			bh = sb_bread(inode->i_sb, le32_to_cpu(blk));
+			if (!bh) {
+				EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk),
+						       "Read failure");
+				return -EIO;
+			}
+			first2 = (first > offset) ? first - offset : 0;
+			ret = free_hole_blocks(handle, inode, bh,
+					       (__le32 *)bh->b_data, level - 1,
+					       first2, count - offset,
+					       inode->i_sb->s_blocksize >> 2);
+			if (ret) {
+				brelse(bh);
+				goto err;
+			}
+		}
+		if (level == 0 ||
+		    (bh && all_zeroes((__le32 *)bh->b_data,
+				      (__le32 *)bh->b_data + addr_per_block))) {
+			ext4_free_data(handle, inode, parent_bh, &blk, &blk+1);
+			*i_data = 0;
+		}
+		brelse(bh);
+		bh = NULL;
+	}
+
+err:
+	return ret;
+}
+
+int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+			  ext4_lblk_t first, ext4_lblk_t stop)
+{
+	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+	int level, ret = 0;
+	int num = EXT4_NDIR_BLOCKS;
+	ext4_lblk_t count, max = EXT4_NDIR_BLOCKS;
+	__le32 *i_data = EXT4_I(inode)->i_data;
+
+	count = stop - first;
+	for (level = 0; level < 4; level++, max *= addr_per_block) {
+		if (first < max) {
+			ret = free_hole_blocks(handle, inode, NULL, i_data,
+					       level, first, count, num);
+			if (ret)
+				goto err;
+			if (count > max - first)
+				count -= max - first;
+			else
+				break;
+			first = 0;
+		} else {
+			first -= max;
+		}
+		i_data += num;
+		if (level == 0) {
+			num = 1;
+			max = 1;
+		}
+	}
 
-	ext4_journal_stop(handle);
-	trace_ext4_truncate_exit(inode);
+err:
+	return ret;
 }
 
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 387c47c6cda..d9ecbf1113a 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -19,7 +19,8 @@
 
 #define EXT4_XATTR_SYSTEM_DATA	"data"
 #define EXT4_MIN_INLINE_DATA_SIZE	((sizeof(__le32) * EXT4_N_BLOCKS))
-#define EXT4_INLINE_DOTDOT_SIZE	4
+#define EXT4_INLINE_DOTDOT_OFFSET	2
+#define EXT4_INLINE_DOTDOT_SIZE		4
 
 int ext4_get_inline_size(struct inode *inode)
 {
@@ -71,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
 		entry = (struct ext4_xattr_entry *)
 			((void *)raw_inode + EXT4_I(inode)->i_inline_off);
 
-		free += le32_to_cpu(entry->e_value_size);
+		free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
 		goto out;
 	}
 
@@ -545,7 +546,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
 		return ret;
 
 retry:
-	handle = ext4_journal_start(inode, needed_blocks);
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		handle = NULL;
@@ -657,7 +658,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
 	 * The possible write could happen in the inode,
 	 * so try to reserve the space in inode first.
 	 */
-	handle = ext4_journal_start(inode, 1);
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		handle = NULL;
@@ -853,7 +854,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
 	if (ret)
 		return ret;
 
-	handle = ext4_journal_start(inode, 1);
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		handle = NULL;
@@ -1188,7 +1189,7 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
 
 	data_bh = sb_getblk(inode->i_sb, map.m_pblk);
 	if (!data_bh) {
-		error = -EIO;
+		error = -ENOMEM;
 		goto out_restore;
 	}
 
@@ -1289,19 +1290,133 @@ out:
 	return ret;
 }
 
-int ext4_read_inline_dir(struct file *filp,
-			 void *dirent, filldir_t filldir,
+/*
+ * This function fills a red-black tree with information from an
+ * inlined dir.  It returns the number directory entries loaded
+ * into the tree.  If there is an error it is returned in err.
+ */
+int htree_inlinedir_to_tree(struct file *dir_file,
+			    struct inode *dir, ext4_lblk_t block,
+			    struct dx_hash_info *hinfo,
+			    __u32 start_hash, __u32 start_minor_hash,
+			    int *has_inline_data)
+{
+	int err = 0, count = 0;
+	unsigned int parent_ino;
+	int pos;
+	struct ext4_dir_entry_2 *de;
+	struct inode *inode = file_inode(dir_file);
+	int ret, inline_size = 0;
+	struct ext4_iloc iloc;
+	void *dir_buf = NULL;
+	struct ext4_dir_entry_2 fake;
+
+	ret = ext4_get_inode_loc(inode, &iloc);
+	if (ret)
+		return ret;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	if (!ext4_has_inline_data(inode)) {
+		up_read(&EXT4_I(inode)->xattr_sem);
+		*has_inline_data = 0;
+		goto out;
+	}
+
+	inline_size = ext4_get_inline_size(inode);
+	dir_buf = kmalloc(inline_size, GFP_NOFS);
+	if (!dir_buf) {
+		ret = -ENOMEM;
+		up_read(&EXT4_I(inode)->xattr_sem);
+		goto out;
+	}
+
+	ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+	up_read(&EXT4_I(inode)->xattr_sem);
+	if (ret < 0)
+		goto out;
+
+	pos = 0;
+	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+	while (pos < inline_size) {
+		/*
+		 * As inlined dir doesn't store any information about '.' and
+		 * only the inode number of '..' is stored, we have to handle
+		 * them differently.
+		 */
+		if (pos == 0) {
+			fake.inode = cpu_to_le32(inode->i_ino);
+			fake.name_len = 1;
+			strcpy(fake.name, ".");
+			fake.rec_len = ext4_rec_len_to_disk(
+						EXT4_DIR_REC_LEN(fake.name_len),
+						inline_size);
+			ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+			de = &fake;
+			pos = EXT4_INLINE_DOTDOT_OFFSET;
+		} else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
+			fake.inode = cpu_to_le32(parent_ino);
+			fake.name_len = 2;
+			strcpy(fake.name, "..");
+			fake.rec_len = ext4_rec_len_to_disk(
+						EXT4_DIR_REC_LEN(fake.name_len),
+						inline_size);
+			ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+			de = &fake;
+			pos = EXT4_INLINE_DOTDOT_SIZE;
+		} else {
+			de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
+			pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
+			if (ext4_check_dir_entry(inode, dir_file, de,
+					 iloc.bh, dir_buf,
+					 inline_size, pos)) {
+				ret = count;
+				goto out;
+			}
+		}
+
+		ext4fs_dirhash(de->name, de->name_len, hinfo);
+		if ((hinfo->hash < start_hash) ||
+		    ((hinfo->hash == start_hash) &&
+		     (hinfo->minor_hash < start_minor_hash)))
+			continue;
+		if (de->inode == 0)
+			continue;
+		err = ext4_htree_store_dirent(dir_file,
+				   hinfo->hash, hinfo->minor_hash, de);
+		if (err) {
+			count = err;
+			goto out;
+		}
+		count++;
+	}
+	ret = count;
+out:
+	kfree(dir_buf);
+	brelse(iloc.bh);
+	return ret;
+}
+
+/*
+ * So this function is called when the volume is mkfsed with
+ * dir_index disabled. In order to keep f_pos persistent
+ * after we convert from an inlined dir to a blocked based,
+ * we just pretend that we are a normal dir and return the
+ * offset as if '.' and '..' really take place.
+ *
+ */
+int ext4_read_inline_dir(struct file *file,
+			 struct dir_context *ctx,
 			 int *has_inline_data)
 {
-	int error = 0;
 	unsigned int offset, parent_ino;
-	int i, stored;
+	int i;
 	struct ext4_dir_entry_2 *de;
 	struct super_block *sb;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	int ret, inline_size = 0;
 	struct ext4_iloc iloc;
 	void *dir_buf = NULL;
+	int dotdot_offset, dotdot_size, extra_offset, extra_size;
 
 	ret = ext4_get_inode_loc(inode, &iloc);
 	if (ret)
@@ -1328,96 +1443,89 @@ int ext4_read_inline_dir(struct file *filp,
 		goto out;
 
 	sb = inode->i_sb;
-	stored = 0;
 	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+	offset = ctx->pos;
 
-	while (!error && !stored && filp->f_pos < inode->i_size) {
-revalidate:
-		/*
-		 * If the version has changed since the last call to
-		 * readdir(2), then we might be pointing to an invalid
-		 * dirent right now.  Scan from the start of the inline
-		 * dir to make sure.
-		 */
-		if (filp->f_version != inode->i_version) {
-			for (i = 0;
-			     i < inode->i_size && i < offset;) {
-				if (!i) {
-					/* skip "." and ".." if needed. */
-					i += EXT4_INLINE_DOTDOT_SIZE;
-					continue;
-				}
-				de = (struct ext4_dir_entry_2 *)
-					(dir_buf + i);
-				/* It's too expensive to do a full
-				 * dirent test each time round this
-				 * loop, but we do have to test at
-				 * least that it is non-zero.  A
-				 * failure will be detected in the
-				 * dirent test below. */
-				if (ext4_rec_len_from_disk(de->rec_len,
-					inline_size) < EXT4_DIR_REC_LEN(1))
-					break;
-				i += ext4_rec_len_from_disk(de->rec_len,
-							    inline_size);
-			}
-			offset = i;
-			filp->f_pos = offset;
-			filp->f_version = inode->i_version;
-		}
+	/*
+	 * dotdot_offset and dotdot_size is the real offset and
+	 * size for ".." and "." if the dir is block based while
+	 * the real size for them are only EXT4_INLINE_DOTDOT_SIZE.
+	 * So we will use extra_offset and extra_size to indicate them
+	 * during the inline dir iteration.
+	 */
+	dotdot_offset = EXT4_DIR_REC_LEN(1);
+	dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
+	extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
+	extra_size = extra_offset + inline_size;
 
-		while (!error && filp->f_pos < inode->i_size) {
-			if (filp->f_pos == 0) {
-				error = filldir(dirent, ".", 1, 0, inode->i_ino,
-						DT_DIR);
-				if (error)
-					break;
-				stored++;
-
-				error = filldir(dirent, "..", 2, 0, parent_ino,
-						DT_DIR);
-				if (error)
-					break;
-				stored++;
-
-				filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE;
+	/*
+	 * If the version has changed since the last call to
+	 * readdir(2), then we might be pointing to an invalid
+	 * dirent right now.  Scan from the start of the inline
+	 * dir to make sure.
+	 */
+	if (file->f_version != inode->i_version) {
+		for (i = 0; i < extra_size && i < offset;) {
+			/*
+			 * "." is with offset 0 and
+			 * ".." is dotdot_offset.
+			 */
+			if (!i) {
+				i = dotdot_offset;
+				continue;
+			} else if (i == dotdot_offset) {
+				i = dotdot_size;
 				continue;
 			}
+			/* for other entry, the real offset in
+			 * the buf has to be tuned accordingly.
+			 */
+			de = (struct ext4_dir_entry_2 *)
+				(dir_buf + i - extra_offset);
+			/* It's too expensive to do a full
+			 * dirent test each time round this
+			 * loop, but we do have to test at
+			 * least that it is non-zero.  A
+			 * failure will be detected in the
+			 * dirent test below. */
+			if (ext4_rec_len_from_disk(de->rec_len, extra_size)
+				< EXT4_DIR_REC_LEN(1))
+				break;
+			i += ext4_rec_len_from_disk(de->rec_len,
+						    extra_size);
+		}
+		offset = i;
+		ctx->pos = offset;
+		file->f_version = inode->i_version;
+	}
 
-			de = (struct ext4_dir_entry_2 *)(dir_buf + offset);
-			if (ext4_check_dir_entry(inode, filp, de,
-						 iloc.bh, dir_buf,
-						 inline_size, offset)) {
-				ret = stored;
+	while (ctx->pos < extra_size) {
+		if (ctx->pos == 0) {
+			if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
+				goto out;
+			ctx->pos = dotdot_offset;
+			continue;
+		}
+
+		if (ctx->pos == dotdot_offset) {
+			if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR))
+				goto out;
+			ctx->pos = dotdot_size;
+			continue;
+		}
+
+		de = (struct ext4_dir_entry_2 *)
+			(dir_buf + ctx->pos - extra_offset);
+		if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
+					 extra_size, ctx->pos))
+			goto out;
+		if (le32_to_cpu(de->inode)) {
+			if (!dir_emit(ctx, de->name, de->name_len,
+				      le32_to_cpu(de->inode),
+				      get_dtype(sb, de->file_type)))
 				goto out;
-			}
-			offset += ext4_rec_len_from_disk(de->rec_len,
-							 inline_size);
-			if (le32_to_cpu(de->inode)) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation.
-				 */
-				u64 version = filp->f_version;
-
-				error = filldir(dirent, de->name,
-						de->name_len,
-						filp->f_pos,
-						le32_to_cpu(de->inode),
-						get_dtype(sb, de->file_type));
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored++;
-			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
-							      inline_size);
 		}
-		offset = 0;
+		ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);
 	}
 out:
 	kfree(dir_buf);
@@ -1702,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode,
 	if (error)
 		goto out;
 
-	physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+	physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
 	physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
 	physical += offsetof(struct ext4_inode, i_block);
 	length = i_size_read(inode);
@@ -1770,7 +1878,7 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
 
 
 	needed_blocks = ext4_writepage_trans_blocks(inode);
-	handle = ext4_journal_start(inode, needed_blocks);
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
 	if (IS_ERR(handle))
 		return;
 
@@ -1862,7 +1970,7 @@ int ext4_convert_inline_data(struct inode *inode)
 	if (error)
 		return error;
 
-	handle = ext4_journal_start(inode, needed_blocks);
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
 	if (IS_ERR(handle)) {
 		error = PTR_ERR(handle);
 		goto out_free;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cbfe13bf5b2..0188e65e1f5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
 #include <linux/printk.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
+#include <linux/aio.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -55,21 +56,21 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
 	__u16 csum_hi = 0;
 	__u32 csum;
 
-	csum_lo = raw->i_checksum_lo;
+	csum_lo = le16_to_cpu(raw->i_checksum_lo);
 	raw->i_checksum_lo = 0;
 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
 	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
-		csum_hi = raw->i_checksum_hi;
+		csum_hi = le16_to_cpu(raw->i_checksum_hi);
 		raw->i_checksum_hi = 0;
 	}
 
 	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
 			   EXT4_INODE_SIZE(inode->i_sb));
 
-	raw->i_checksum_lo = csum_lo;
+	raw->i_checksum_lo = cpu_to_le16(csum_lo);
 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
 	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
-		raw->i_checksum_hi = csum_hi;
+		raw->i_checksum_hi = cpu_to_le16(csum_hi);
 
 	return csum;
 }
@@ -131,16 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 						   new_size);
 }
 
-static void ext4_invalidatepage(struct page *page, unsigned long offset);
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
-				   struct buffer_head *bh_result, int create);
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-		struct inode *inode, struct page *page, loff_t from,
-		loff_t length, int flags);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+				  int pextents);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -189,8 +186,6 @@ void ext4_evict_inode(struct inode *inode)
 
 	trace_ext4_evict_inode(inode);
 
-	ext4_ioend_wait(inode);
-
 	if (inode->i_nlink) {
 		/*
 		 * When journalling data dirty buffers are tracked only in the
@@ -211,15 +206,17 @@ void ext4_evict_inode(struct inode *inode)
 		 * don't use page cache.
 		 */
 		if (ext4_should_journal_data(inode) &&
-		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
+		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+		    inode->i_ino != EXT4_JOURNAL_INO) {
 			journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
 			tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
 
-			jbd2_log_start_commit(journal, commit_tid);
-			jbd2_log_wait_commit(journal, commit_tid);
+			jbd2_complete_transaction(journal, commit_tid);
 			filemap_write_and_wait(&inode->i_data);
 		}
 		truncate_inode_pages(&inode->i_data, 0);
+
+		WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 		goto no_delete;
 	}
 
@@ -230,6 +227,7 @@ void ext4_evict_inode(struct inode *inode)
 		ext4_begin_ordered_truncate(inode, 0);
 	truncate_inode_pages(&inode->i_data, 0);
 
+	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 	if (is_bad_inode(inode))
 		goto no_delete;
 
@@ -238,7 +236,8 @@ void ext4_evict_inode(struct inode *inode)
 	 * protection against it
 	 */
 	sb_start_intwrite(inode->i_sb);
-	handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+				    ext4_blocks_for_truncate(inode)+3);
 	if (IS_ERR(handle)) {
 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
 		/*
@@ -346,7 +345,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
 	spin_lock(&ei->i_block_reservation_lock);
 	trace_ext4_da_update_reserve_space(inode, used, quota_claim);
 	if (unlikely(used > ei->i_reserved_data_blocks)) {
-		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+		ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
 			 "with only %d reserved data blocks",
 			 __func__, inode->i_ino, used,
 			 ei->i_reserved_data_blocks);
@@ -355,10 +354,12 @@ void ext4_da_update_reserve_space(struct inode *inode,
 	}
 
 	if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
-		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d "
-			 "with only %d reserved metadata blocks\n", __func__,
-			 inode->i_ino, ei->i_allocated_meta_blocks,
-			 ei->i_reserved_meta_blocks);
+		ext4_warning(inode->i_sb, "ino %lu, allocated %d "
+			"with only %d reserved metadata blocks "
+			"(releasing %d blocks with reserved %d data blocks)",
+			inode->i_ino, ei->i_allocated_meta_blocks,
+			     ei->i_reserved_meta_blocks, used,
+			     ei->i_reserved_data_blocks);
 		WARN_ON(1);
 		ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
 	}
@@ -423,65 +424,57 @@ static int __check_block_validity(struct inode *inode, const char *func,
 #define check_block_validity(inode, map)	\
 	__check_block_validity((inode), __func__, __LINE__, (map))
 
-/*
- * Return the number of contiguous dirty pages in a given inode
- * starting at page frame idx.
- */
-static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
-				    unsigned int max_pages)
+#ifdef ES_AGGRESSIVE_TEST
+static void ext4_map_blocks_es_recheck(handle_t *handle,
+				       struct inode *inode,
+				       struct ext4_map_blocks *es_map,
+				       struct ext4_map_blocks *map,
+				       int flags)
 {
-	struct address_space *mapping = inode->i_mapping;
-	pgoff_t	index;
-	struct pagevec pvec;
-	pgoff_t num = 0;
-	int i, nr_pages, done = 0;
+	int retval;
 
-	if (max_pages == 0)
-		return 0;
-	pagevec_init(&pvec, 0);
-	while (!done) {
-		index = idx;
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      (pgoff_t)PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-			struct buffer_head *bh, *head;
+	map->m_flags = 0;
+	/*
+	 * There is a race window that the result is not the same.
+	 * e.g. xfstests #223 when dioread_nolock enables.  The reason
+	 * is that we lookup a block mapping in extent status tree with
+	 * out taking i_data_sem.  So at the time the unwritten extent
+	 * could be converted.
+	 */
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		down_read((&EXT4_I(inode)->i_data_sem));
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		retval = ext4_ext_map_blocks(handle, inode, map, flags &
+					     EXT4_GET_BLOCKS_KEEP_SIZE);
+	} else {
+		retval = ext4_ind_map_blocks(handle, inode, map, flags &
+					     EXT4_GET_BLOCKS_KEEP_SIZE);
+	}
+	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+		up_read((&EXT4_I(inode)->i_data_sem));
+	/*
+	 * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag
+	 * because it shouldn't be marked in es_map->m_flags.
+	 */
+	map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY);
 
-			lock_page(page);
-			if (unlikely(page->mapping != mapping) ||
-			    !PageDirty(page) ||
-			    PageWriteback(page) ||
-			    page->index != idx) {
-				done = 1;
-				unlock_page(page);
-				break;
-			}
-			if (page_has_buffers(page)) {
-				bh = head = page_buffers(page);
-				do {
-					if (!buffer_delay(bh) &&
-					    !buffer_unwritten(bh))
-						done = 1;
-					bh = bh->b_this_page;
-				} while (!done && (bh != head));
-			}
-			unlock_page(page);
-			if (done)
-				break;
-			idx++;
-			num++;
-			if (num >= max_pages) {
-				done = 1;
-				break;
-			}
-		}
-		pagevec_release(&pvec);
+	/*
+	 * We don't check m_len because extent will be collpased in status
+	 * tree.  So the m_len might not equal.
+	 */
+	if (es_map->m_lblk != map->m_lblk ||
+	    es_map->m_flags != map->m_flags ||
+	    es_map->m_pblk != map->m_pblk) {
+		printk("ES cache assertation failed for inode: %lu "
+		       "es_cached ex [%d/%d/%llu/%x] != "
+		       "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
+		       inode->i_ino, es_map->m_lblk, es_map->m_len,
+		       es_map->m_pblk, es_map->m_flags, map->m_lblk,
+		       map->m_len, map->m_pblk, map->m_flags,
+		       retval, flags);
 	}
-	return num;
 }
+#endif /* ES_AGGRESSIVE_TEST */
 
 /*
  * The ext4_map_blocks() function tries to look up the requested blocks,
@@ -508,12 +501,44 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		    struct ext4_map_blocks *map, int flags)
 {
+	struct extent_status es;
 	int retval;
+#ifdef ES_AGGRESSIVE_TEST
+	struct ext4_map_blocks orig_map;
+
+	memcpy(&orig_map, map, sizeof(*map));
+#endif
 
 	map->m_flags = 0;
 	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
 		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
 		  (unsigned long) map->m_lblk);
+
+	ext4_es_lru_add(inode);
+
+	/* Lookup extent status tree firstly */
+	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+			map->m_pblk = ext4_es_pblock(&es) +
+					map->m_lblk - es.es_lblk;
+			map->m_flags |= ext4_es_is_written(&es) ?
+					EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+			retval = es.es_len - (map->m_lblk - es.es_lblk);
+			if (retval > map->m_len)
+				retval = map->m_len;
+			map->m_len = retval;
+		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+			retval = 0;
+		} else {
+			BUG_ON(1);
+		}
+#ifdef ES_AGGRESSIVE_TEST
+		ext4_map_blocks_es_recheck(handle, inode, map,
+					   &orig_map, flags);
+#endif
+		goto found;
+	}
+
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
@@ -527,20 +552,36 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		retval = ext4_ind_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
 	}
+	if (retval > 0) {
+		int ret;
+		unsigned long long status;
+
+#ifdef ES_AGGRESSIVE_TEST
+		if (retval != map->m_len) {
+			printk("ES len assertation failed for inode: %lu "
+			       "retval %d != map->m_len %d "
+			       "in %s (lookup)\n", inode->i_ino, retval,
+			       map->m_len, __func__);
+		}
+#endif
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+		    ext4_find_delalloc_range(inode, map->m_lblk,
+					     map->m_lblk + map->m_len - 1))
+			status |= EXTENT_STATUS_DELAYED;
+		ret = ext4_es_insert_extent(inode, map->m_lblk,
+					    map->m_len, map->m_pblk, status);
+		if (ret < 0)
+			retval = ret;
+	}
 	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
 		up_read((&EXT4_I(inode)->i_data_sem));
 
+found:
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret;
-		if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
-			/* delayed alloc may be allocated by fallocate and
-			 * coverted to initialized by directIO.
-			 * we need to handle delayed extent here.
-			 */
-			down_write((&EXT4_I(inode)->i_data_sem));
-			goto delayed_mapped;
-		}
-		ret = check_block_validity(inode, map);
+		int ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -560,16 +601,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		return retval;
 
 	/*
-	 * When we call get_blocks without the create flag, the
-	 * BH_Unwritten flag could have gotten set if the blocks
-	 * requested were part of a uninitialized extent.  We need to
-	 * clear this flag now that we are committed to convert all or
-	 * part of the uninitialized extent to be an initialized
-	 * extent.  This is because we need to avoid the combination
-	 * of BH_Unwritten and BH_Mapped flags being simultaneously
-	 * set on the buffer_head.
+	 * Here we clear m_flags because after allocating an new extent,
+	 * it will be set again.
 	 */
-	map->m_flags &= ~EXT4_MAP_UNWRITTEN;
+	map->m_flags &= ~EXT4_MAP_FLAGS;
 
 	/*
 	 * New blocks allocate and/or writing to uninitialized extent
@@ -615,20 +650,44 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
 			ext4_da_update_reserve_space(inode, retval, 1);
 	}
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
-		if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-			int ret;
-delayed_mapped:
-			/* delayed allocation blocks has been allocated */
-			ret = ext4_es_remove_extent(inode, map->m_lblk,
-						    map->m_len);
-			if (ret < 0)
-				retval = ret;
+	if (retval > 0) {
+		int ret;
+		unsigned long long status;
+
+#ifdef ES_AGGRESSIVE_TEST
+		if (retval != map->m_len) {
+			printk("ES len assertation failed for inode: %lu "
+			       "retval %d != map->m_len %d "
+			       "in %s (allocation)\n", inode->i_ino, retval,
+			       map->m_len, __func__);
 		}
+#endif
+
+		/*
+		 * If the extent has been zeroed out, we don't need to update
+		 * extent status tree.
+		 */
+		if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
+		    ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+			if (ext4_es_is_written(&es))
+				goto has_zeroout;
+		}
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+		    ext4_find_delalloc_range(inode, map->m_lblk,
+					     map->m_lblk + map->m_len - 1))
+			status |= EXTENT_STATUS_DELAYED;
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    map->m_pblk, status);
+		if (ret < 0)
+			retval = ret;
 	}
 
+has_zeroout:
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 		int ret = check_block_validity(inode, map);
@@ -660,7 +719,8 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 		if (map.m_len > DIO_MAX_BLOCKS)
 			map.m_len = DIO_MAX_BLOCKS;
 		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
-		handle = ext4_journal_start(inode, dio_credits);
+		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+					    dio_credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			return ret;
@@ -707,14 +767,16 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 	/* ensure we send some value back into *errp */
 	*errp = 0;
 
+	if (create && err == 0)
+		err = -ENOSPC;	/* should never happen */
 	if (err < 0)
 		*errp = err;
 	if (err <= 0)
 		return NULL;
 
 	bh = sb_getblk(inode->i_sb, map.m_pblk);
-	if (!bh) {
-		*errp = -EIO;
+	if (unlikely(!bh)) {
+		*errp = -ENOMEM;
 		return NULL;
 	}
 	if (map.m_flags & EXT4_MAP_NEW) {
@@ -808,11 +870,10 @@ int ext4_walk_page_buffers(handle_t *handle,
  * and the commit_write().  So doing the jbd2_journal_start at the start of
  * prepare_write() is the right place.
  *
- * Also, this function can nest inside ext4_writepage() ->
- * block_write_full_page(). In that case, we *know* that ext4_writepage()
- * has generated enough buffer credits to do the whole page.  So we won't
- * block on the journal in that case, which is good, because the caller may
- * be PF_MEMALLOC.
+ * Also, this function can nest inside ext4_writepage().  In that case, we
+ * *know* that ext4_writepage() has generated enough buffer credits to do the
+ * whole page.  So we won't block on the journal in that case, which is good,
+ * because the caller may be PF_MEMALLOC.
  *
  * By accident, ext4 can be reentered when a transaction is open via
  * quota file writes.  If we were to commit the transaction while thus
@@ -878,32 +939,40 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 		ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
 						    flags, pagep);
 		if (ret < 0)
-			goto out;
-		if (ret == 1) {
-			ret = 0;
-			goto out;
-		}
+			return ret;
+		if (ret == 1)
+			return 0;
 	}
 
-retry:
-	handle = ext4_journal_start(inode, needed_blocks);
+	/*
+	 * grab_cache_page_write_begin() can take a long time if the
+	 * system is thrashing due to memory pressure, or if the page
+	 * is being written back.  So grab it first before we start
+	 * the transaction handle.  This also allows us to allocate
+	 * the page (if needed) without using GFP_NOFS.
+	 */
+retry_grab:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	unlock_page(page);
+
+retry_journal:
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
 	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
+		page_cache_release(page);
+		return PTR_ERR(handle);
 	}
 
-	/* We cannot recurse into the filesystem as the transaction is already
-	 * started */
-	flags |= AOP_FLAG_NOFS;
-
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	if (!page) {
+	lock_page(page);
+	if (page->mapping != mapping) {
+		/* The page got truncated from under us */
+		unlock_page(page);
+		page_cache_release(page);
 		ext4_journal_stop(handle);
-		ret = -ENOMEM;
-		goto out;
+		goto retry_grab;
 	}
-
-	*pagep = page;
+	wait_on_page_writeback(page);
 
 	if (ext4_should_dioread_nolock(inode))
 		ret = __block_write_begin(page, pos, len, ext4_get_block_write);
@@ -918,7 +987,6 @@ retry:
 
 	if (ret) {
 		unlock_page(page);
-		page_cache_release(page);
 		/*
 		 * __block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
@@ -942,42 +1010,70 @@ retry:
 			if (inode->i_nlink)
 				ext4_orphan_del(NULL, inode);
 		}
-	}
 
-	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-out:
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_journal;
+		page_cache_release(page);
+		return ret;
+	}
+	*pagep = page;
 	return ret;
 }
 
 /* For write_end() in data=journal mode */
 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
+	int ret;
 	if (!buffer_mapped(bh) || buffer_freed(bh))
 		return 0;
 	set_buffer_uptodate(bh);
-	return ext4_handle_dirty_metadata(handle, NULL, bh);
+	ret = ext4_handle_dirty_metadata(handle, NULL, bh);
+	clear_buffer_meta(bh);
+	clear_buffer_prio(bh);
+	return ret;
 }
 
-static int ext4_generic_write_end(struct file *file,
-				  struct address_space *mapping,
-				  loff_t pos, unsigned len, unsigned copied,
-				  struct page *page, void *fsdata)
+/*
+ * We need to pick up the new inode size which generic_commit_write gave us
+ * `file' can be NULL - eg, when called from page_symlink().
+ *
+ * ext4 never places buffers on inode->i_mapping->private_list.  metadata
+ * buffers are managed internally.
+ */
+static int ext4_write_end(struct file *file,
+			  struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct page *page, void *fsdata)
 {
-	int i_size_changed = 0;
-	struct inode *inode = mapping->host;
 	handle_t *handle = ext4_journal_current_handle();
+	struct inode *inode = mapping->host;
+	int ret = 0, ret2;
+	int i_size_changed = 0;
 
-	if (ext4_has_inline_data(inode))
-		copied = ext4_write_inline_data_end(inode, pos, len,
-						    copied, page);
-	else
+	trace_ext4_write_end(inode, pos, len, copied);
+	if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) {
+		ret = ext4_jbd2_file_inode(handle, inode);
+		if (ret) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto errout;
+		}
+	}
+
+	if (ext4_has_inline_data(inode)) {
+		ret = ext4_write_inline_data_end(inode, pos, len,
+						 copied, page);
+		if (ret < 0)
+			goto errout;
+		copied = ret;
+	} else
 		copied = block_write_end(file, mapping, pos,
 					 len, copied, page, fsdata);
 
 	/*
 	 * No need to use i_size_read() here, the i_size
-	 * cannot change under us because we hold i_mutex.
+	 * cannot change under us because we hole i_mutex.
 	 *
 	 * But it's important to update i_size while still holding page lock:
 	 * page writeout could otherwise come in and zero beyond i_size.
@@ -987,10 +1083,10 @@ static int ext4_generic_write_end(struct file *file,
 		i_size_changed = 1;
 	}
 
-	if (pos + copied >  EXT4_I(inode)->i_disksize) {
+	if (pos + copied > EXT4_I(inode)->i_disksize) {
 		/* We need to mark inode dirty even if
 		 * new_i_size is less that inode->i_size
-		 * bu greater than i_disksize.(hint delalloc)
+		 * but greater than i_disksize. (hint delalloc)
 		 */
 		ext4_update_i_disksize(inode, (pos + copied));
 		i_size_changed = 1;
@@ -1007,87 +1103,13 @@ static int ext4_generic_write_end(struct file *file,
 	if (i_size_changed)
 		ext4_mark_inode_dirty(handle, inode);
 
-	return copied;
-}
-
-/*
- * We need to pick up the new inode size which generic_commit_write gave us
- * `file' can be NULL - eg, when called from page_symlink().
- *
- * ext4 never places buffers on inode->i_mapping->private_list.  metadata
- * buffers are managed internally.
- */
-static int ext4_ordered_write_end(struct file *file,
-				  struct address_space *mapping,
-				  loff_t pos, unsigned len, unsigned copied,
-				  struct page *page, void *fsdata)
-{
-	handle_t *handle = ext4_journal_current_handle();
-	struct inode *inode = mapping->host;
-	int ret = 0, ret2;
-
-	trace_ext4_ordered_write_end(inode, pos, len, copied);
-	ret = ext4_jbd2_file_inode(handle, inode);
-
-	if (ret == 0) {
-		ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-		copied = ret2;
-		if (pos + len > inode->i_size && ext4_can_truncate(inode))
-			/* if we have allocated more blocks and copied
-			 * less. We will have blocks allocated outside
-			 * inode->i_size. So truncate them
-			 */
-			ext4_orphan_add(handle, inode);
-		if (ret2 < 0)
-			ret = ret2;
-	} else {
-		unlock_page(page);
-		page_cache_release(page);
-	}
-
-	ret2 = ext4_journal_stop(handle);
-	if (!ret)
-		ret = ret2;
-
-	if (pos + len > inode->i_size) {
-		ext4_truncate_failed_write(inode);
-		/*
-		 * If truncate failed early the inode might still be
-		 * on the orphan list; we need to make sure the inode
-		 * is removed from the orphan list in that case.
-		 */
-		if (inode->i_nlink)
-			ext4_orphan_del(NULL, inode);
-	}
-
-
-	return ret ? ret : copied;
-}
-
-static int ext4_writeback_write_end(struct file *file,
-				    struct address_space *mapping,
-				    loff_t pos, unsigned len, unsigned copied,
-				    struct page *page, void *fsdata)
-{
-	handle_t *handle = ext4_journal_current_handle();
-	struct inode *inode = mapping->host;
-	int ret = 0, ret2;
-
-	trace_ext4_writeback_write_end(inode, pos, len, copied);
-	ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-	copied = ret2;
 	if (pos + len > inode->i_size && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
 		 * inode->i_size. So truncate them
 		 */
 		ext4_orphan_add(handle, inode);
-
-	if (ret2 < 0)
-		ret = ret2;
-
+errout:
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
@@ -1178,6 +1200,55 @@ static int ext4_journalled_write_end(struct file *file,
 }
 
 /*
+ * Reserve a metadata for a single block located at lblock
+ */
+static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
+{
+	int retries = 0;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned int md_needed;
+	ext4_lblk_t save_last_lblock;
+	int save_len;
+
+	/*
+	 * recalculate the amount of metadata blocks to reserve
+	 * in order to allocate nrblocks
+	 * worse case is one extent per block
+	 */
+repeat:
+	spin_lock(&ei->i_block_reservation_lock);
+	/*
+	 * ext4_calc_metadata_amount() has side effects, which we have
+	 * to be prepared undo if we fail to claim space.
+	 */
+	save_len = ei->i_da_metadata_calc_len;
+	save_last_lblock = ei->i_da_metadata_calc_last_lblock;
+	md_needed = EXT4_NUM_B2C(sbi,
+				 ext4_calc_metadata_amount(inode, lblock));
+	trace_ext4_da_reserve_space(inode, md_needed);
+
+	/*
+	 * We do still charge estimated metadata to the sb though;
+	 * we cannot afford to run out of free blocks.
+	 */
+	if (ext4_claim_free_clusters(sbi, md_needed, 0)) {
+		ei->i_da_metadata_calc_len = save_len;
+		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
+		spin_unlock(&ei->i_block_reservation_lock);
+		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+			cond_resched();
+			goto repeat;
+		}
+		return -ENOSPC;
+	}
+	ei->i_reserved_meta_blocks += md_needed;
+	spin_unlock(&ei->i_block_reservation_lock);
+
+	return 0;       /* success */
+}
+
+/*
  * Reserve a single cluster located at lblock
  */
 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
@@ -1225,7 +1296,7 @@ repeat:
 		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
 		spin_unlock(&ei->i_block_reservation_lock);
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
-			yield();
+			cond_resched();
 			goto repeat;
 		}
 		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
@@ -1256,7 +1327,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 		 * function is called from invalidate page, it's
 		 * harmless to return without any action.
 		 */
-		ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+		ext4_warning(inode->i_sb, "ext4_da_release_space: "
 			 "ino %lu, to_free %d with only %d reserved "
 			 "data blocks", inode->i_ino, to_free,
 			 ei->i_reserved_data_blocks);
@@ -1288,21 +1359,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
-					     unsigned long offset)
+					     unsigned int offset,
+					     unsigned int length)
 {
 	int to_release = 0;
 	struct buffer_head *head, *bh;
 	unsigned int curr_off = 0;
 	struct inode *inode = page->mapping->host;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned int stop = offset + length;
 	int num_clusters;
 	ext4_fsblk_t lblk;
 
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	head = page_buffers(page);
 	bh = head;
 	do {
 		unsigned int next_off = curr_off + bh->b_size;
 
+		if (next_off > stop)
+			break;
+
 		if ((offset <= curr_off) && (buffer_delay(bh))) {
 			to_release++;
 			clear_buffer_delay(bh);
@@ -1333,178 +1411,43 @@ static void ext4_da_page_release_reservation(struct page *page,
  * Delayed allocation stuff
  */
 
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
-			      struct ext4_map_blocks *map)
-{
-	struct pagevec pvec;
-	unsigned long index, end;
-	int ret = 0, err, nr_pages, i;
-	struct inode *inode = mpd->inode;
-	struct address_space *mapping = inode->i_mapping;
-	loff_t size = i_size_read(inode);
-	unsigned int len, block_start;
-	struct buffer_head *bh, *page_bufs = NULL;
-	int journal_data = ext4_should_journal_data(inode);
-	sector_t pblock = 0, cur_logical = 0;
-	struct ext4_io_submit io_submit;
+struct mpage_da_data {
+	struct inode *inode;
+	struct writeback_control *wbc;
 
-	BUG_ON(mpd->next_page <= mpd->first_page);
-	memset(&io_submit, 0, sizeof(io_submit));
+	pgoff_t first_page;	/* The first page to write */
+	pgoff_t next_page;	/* Current page to examine */
+	pgoff_t last_page;	/* Last page to examine */
 	/*
-	 * We need to start from the first_page to the next_page - 1
-	 * to make sure we also write the mapped dirty buffer_heads.
-	 * If we look at mpd->b_blocknr we would only be looking
-	 * at the currently mapped buffer_heads.
+	 * Extent to map - this can be after first_page because that can be
+	 * fully mapped. We somewhat abuse m_flags to store whether the extent
+	 * is delalloc or unwritten.
 	 */
-	index = mpd->first_page;
-	end = mpd->next_page - 1;
-
-	pagevec_init(&pvec, 0);
-	while (index <= end) {
-		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			int commit_write = 0, skip_page = 0;
-			struct page *page = pvec.pages[i];
-
-			index = page->index;
-			if (index > end)
-				break;
-
-			if (index == size >> PAGE_CACHE_SHIFT)
-				len = size & ~PAGE_CACHE_MASK;
-			else
-				len = PAGE_CACHE_SIZE;
-			if (map) {
-				cur_logical = index << (PAGE_CACHE_SHIFT -
-							inode->i_blkbits);
-				pblock = map->m_pblk + (cur_logical -
-							map->m_lblk);
-			}
-			index++;
-
-			BUG_ON(!PageLocked(page));
-			BUG_ON(PageWriteback(page));
-
-			/*
-			 * If the page does not have buffers (for
-			 * whatever reason), try to create them using
-			 * __block_write_begin.  If this fails,
-			 * skip the page and move on.
-			 */
-			if (!page_has_buffers(page)) {
-				if (__block_write_begin(page, 0, len,
-						noalloc_get_block_write)) {
-				skip_page:
-					unlock_page(page);
-					continue;
-				}
-				commit_write = 1;
-			}
-
-			bh = page_bufs = page_buffers(page);
-			block_start = 0;
-			do {
-				if (!bh)
-					goto skip_page;
-				if (map && (cur_logical >= map->m_lblk) &&
-				    (cur_logical <= (map->m_lblk +
-						     (map->m_len - 1)))) {
-					if (buffer_delay(bh)) {
-						clear_buffer_delay(bh);
-						bh->b_blocknr = pblock;
-					}
-					if (buffer_unwritten(bh) ||
-					    buffer_mapped(bh))
-						BUG_ON(bh->b_blocknr != pblock);
-					if (map->m_flags & EXT4_MAP_UNINIT)
-						set_buffer_uninit(bh);
-					clear_buffer_unwritten(bh);
-				}
-
-				/*
-				 * skip page if block allocation undone and
-				 * block is dirty
-				 */
-				if (ext4_bh_delay_or_unwritten(NULL, bh))
-					skip_page = 1;
-				bh = bh->b_this_page;
-				block_start += bh->b_size;
-				cur_logical++;
-				pblock++;
-			} while (bh != page_bufs);
-
-			if (skip_page)
-				goto skip_page;
-
-			if (commit_write)
-				/* mark the buffer_heads as dirty & uptodate */
-				block_commit_write(page, 0, len);
-
-			clear_page_dirty_for_io(page);
-			/*
-			 * Delalloc doesn't support data journalling,
-			 * but eventually maybe we'll lift this
-			 * restriction.
-			 */
-			if (unlikely(journal_data && PageChecked(page)))
-				err = __ext4_journalled_writepage(page, len);
-			else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
-				err = ext4_bio_write_page(&io_submit, page,
-							  len, mpd->wbc);
-			else if (buffer_uninit(page_bufs)) {
-				ext4_set_bh_endio(page_bufs, inode);
-				err = block_write_full_page_endio(page,
-					noalloc_get_block_write,
-					mpd->wbc, ext4_end_io_buffer_write);
-			} else
-				err = block_write_full_page(page,
-					noalloc_get_block_write, mpd->wbc);
-
-			if (!err)
-				mpd->pages_written++;
-			/*
-			 * In error case, we have to continue because
-			 * remaining pages are still locked
-			 */
-			if (ret == 0)
-				ret = err;
-		}
-		pagevec_release(&pvec);
-	}
-	ext4_io_submit(&io_submit);
-	return ret;
-}
+	struct ext4_map_blocks map;
+	struct ext4_io_submit io_submit;	/* IO submission data */
+};
 
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+				       bool invalidate)
 {
 	int nr_pages, i;
 	pgoff_t index, end;
 	struct pagevec pvec;
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
-	ext4_lblk_t start, last;
+
+	/* This is necessary when next_page == 0. */
+	if (mpd->first_page >= mpd->next_page)
+		return;
 
 	index = mpd->first_page;
 	end   = mpd->next_page - 1;
-
-	start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	ext4_es_remove_extent(inode, start, last - start + 1);
+	if (invalidate) {
+		ext4_lblk_t start, last;
+		start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		ext4_es_remove_extent(inode, start, last - start + 1);
+	}
 
 	pagevec_init(&pvec, 0);
 	while (index <= end) {
@@ -1517,246 +1460,40 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
 				break;
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
-			block_invalidatepage(page, 0);
-			ClearPageUptodate(page);
+			if (invalidate) {
+				block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+				ClearPageUptodate(page);
+			}
 			unlock_page(page);
 		}
 		index = pvec.pages[nr_pages - 1]->index + 1;
 		pagevec_release(&pvec);
 	}
-	return;
 }
 
 static void ext4_print_free_blocks(struct inode *inode)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct super_block *sb = inode->i_sb;
+	struct ext4_inode_info *ei = EXT4_I(inode);
 
 	ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
 	       EXT4_C2B(EXT4_SB(inode->i_sb),
-			ext4_count_free_clusters(inode->i_sb)));
+			ext4_count_free_clusters(sb)));
 	ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
 	ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
-	       (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
+	       (long long) EXT4_C2B(EXT4_SB(sb),
 		percpu_counter_sum(&sbi->s_freeclusters_counter)));
 	ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
-	       (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
+	       (long long) EXT4_C2B(EXT4_SB(sb),
 		percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
 	ext4_msg(sb, KERN_CRIT, "Block reservation details");
 	ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
-		 EXT4_I(inode)->i_reserved_data_blocks);
+		 ei->i_reserved_data_blocks);
 	ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
-	       EXT4_I(inode)->i_reserved_meta_blocks);
-	return;
-}
-
-/*
- * mpage_da_map_and_submit - go through given space, map them
- *       if necessary, and then submit them for I/O
- *
- * @mpd - bh describing space
- *
- * The function skips space we know is already mapped to disk blocks.
- *
- */
-static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
-{
-	int err, blks, get_blocks_flags;
-	struct ext4_map_blocks map, *mapp = NULL;
-	sector_t next = mpd->b_blocknr;
-	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
-	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
-	handle_t *handle = NULL;
-
-	/*
-	 * If the blocks are mapped already, or we couldn't accumulate
-	 * any blocks, then proceed immediately to the submission stage.
-	 */
-	if ((mpd->b_size == 0) ||
-	    ((mpd->b_state  & (1 << BH_Mapped)) &&
-	     !(mpd->b_state & (1 << BH_Delay)) &&
-	     !(mpd->b_state & (1 << BH_Unwritten))))
-		goto submit_io;
-
-	handle = ext4_journal_current_handle();
-	BUG_ON(!handle);
-
-	/*
-	 * Call ext4_map_blocks() to allocate any delayed allocation
-	 * blocks, or to convert an uninitialized extent to be
-	 * initialized (in the case where we have written into
-	 * one or more preallocated blocks).
-	 *
-	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
-	 * indicate that we are on the delayed allocation path.  This
-	 * affects functions in many different parts of the allocation
-	 * call path.  This flag exists primarily because we don't
-	 * want to change *many* call functions, so ext4_map_blocks()
-	 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
-	 * inode's allocation semaphore is taken.
-	 *
-	 * If the blocks in questions were delalloc blocks, set
-	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
-	 * variables are updated after the blocks have been allocated.
-	 */
-	map.m_lblk = next;
-	map.m_len = max_blocks;
-	get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
-	if (ext4_should_dioread_nolock(mpd->inode))
-		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
-	if (mpd->b_state & (1 << BH_Delay))
-		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
-
-	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
-	if (blks < 0) {
-		struct super_block *sb = mpd->inode->i_sb;
-
-		err = blks;
-		/*
-		 * If get block returns EAGAIN or ENOSPC and there
-		 * appears to be free blocks we will just let
-		 * mpage_da_submit_io() unlock all of the pages.
-		 */
-		if (err == -EAGAIN)
-			goto submit_io;
-
-		if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
-			mpd->retval = err;
-			goto submit_io;
-		}
-
-		/*
-		 * get block failure will cause us to loop in
-		 * writepages, because a_ops->writepage won't be able
-		 * to make progress. The page will be redirtied by
-		 * writepage and writepages will again try to write
-		 * the same.
-		 */
-		if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
-			ext4_msg(sb, KERN_CRIT,
-				 "delayed block allocation failed for inode %lu "
-				 "at logical offset %llu with max blocks %zd "
-				 "with error %d", mpd->inode->i_ino,
-				 (unsigned long long) next,
-				 mpd->b_size >> mpd->inode->i_blkbits, err);
-			ext4_msg(sb, KERN_CRIT,
-				"This should not happen!! Data will be lost\n");
-			if (err == -ENOSPC)
-				ext4_print_free_blocks(mpd->inode);
-		}
-		/* invalidate all the pages */
-		ext4_da_block_invalidatepages(mpd);
-
-		/* Mark this page range as having been completed */
-		mpd->io_done = 1;
-		return;
-	}
-	BUG_ON(blks == 0);
-
-	mapp = &map;
-	if (map.m_flags & EXT4_MAP_NEW) {
-		struct block_device *bdev = mpd->inode->i_sb->s_bdev;
-		int i;
-
-		for (i = 0; i < map.m_len; i++)
-			unmap_underlying_metadata(bdev, map.m_pblk + i);
-	}
-
-	/*
-	 * Update on-disk size along with block allocation.
-	 */
-	disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
-	if (disksize > i_size_read(mpd->inode))
-		disksize = i_size_read(mpd->inode);
-	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
-		ext4_update_i_disksize(mpd->inode, disksize);
-		err = ext4_mark_inode_dirty(handle, mpd->inode);
-		if (err)
-			ext4_error(mpd->inode->i_sb,
-				   "Failed to mark inode %lu dirty",
-				   mpd->inode->i_ino);
-	}
-
-submit_io:
-	mpage_da_submit_io(mpd, mapp);
-	mpd->io_done = 1;
-}
-
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
-		(1 << BH_Delay) | (1 << BH_Unwritten))
-
-/*
- * mpage_add_bh_to_extent - try to add one more block to extent of blocks
- *
- * @mpd->lbh - extent of blocks
- * @logical - logical number of the block in the file
- * @bh - bh of the block (used to access block's state)
- *
- * the function is used to collect contig. blocks in same state
- */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
-				   sector_t logical, size_t b_size,
-				   unsigned long b_state)
-{
-	sector_t next;
-	int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
-
-	/*
-	 * XXX Don't go larger than mballoc is willing to allocate
-	 * This is a stopgap solution.  We eventually need to fold
-	 * mpage_da_submit_io() into this function and then call
-	 * ext4_map_blocks() multiple times in a loop
-	 */
-	if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
-		goto flush_it;
-
-	/* check if thereserved journal credits might overflow */
-	if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
-		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
-			/*
-			 * With non-extent format we are limited by the journal
-			 * credit available.  Total credit needed to insert
-			 * nrblocks contiguous blocks is dependent on the
-			 * nrblocks.  So limit nrblocks.
-			 */
-			goto flush_it;
-		} else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
-				EXT4_MAX_TRANS_DATA) {
-			/*
-			 * Adding the new buffer_head would make it cross the
-			 * allowed limit for which we have journal credit
-			 * reserved. So limit the new bh->b_size
-			 */
-			b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
-						mpd->inode->i_blkbits;
-			/* we will do mpage_da_submit_io in the next loop */
-		}
-	}
-	/*
-	 * First block in the extent
-	 */
-	if (mpd->b_size == 0) {
-		mpd->b_blocknr = logical;
-		mpd->b_size = b_size;
-		mpd->b_state = b_state & BH_FLAGS;
-		return;
-	}
-
-	next = mpd->b_blocknr + nrblocks;
-	/*
-	 * Can we merge the block to our big extent?
-	 */
-	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
-		mpd->b_size += b_size;
-		return;
-	}
-
-flush_it:
-	/*
-	 * We couldn't merge the block to our extent, so we
-	 * need to flush current  extent and start new one
-	 */
-	mpage_da_map_and_submit(mpd);
+	       ei->i_reserved_meta_blocks);
+	ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u",
+	       ei->i_allocated_meta_blocks);
 	return;
 }
 
@@ -1775,8 +1512,14 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 			      struct ext4_map_blocks *map,
 			      struct buffer_head *bh)
 {
+	struct extent_status es;
 	int retval;
 	sector_t invalid_block = ~((sector_t) 0xffff);
+#ifdef ES_AGGRESSIVE_TEST
+	struct ext4_map_blocks orig_map;
+
+	memcpy(&orig_map, map, sizeof(*map));
+#endif
 
 	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
 		invalid_block = ~0;
@@ -1785,6 +1528,47 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 	ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
 		  "logical block %lu\n", inode->i_ino, map->m_len,
 		  (unsigned long) map->m_lblk);
+
+	ext4_es_lru_add(inode);
+
+	/* Lookup extent status tree firstly */
+	if (ext4_es_lookup_extent(inode, iblock, &es)) {
+
+		if (ext4_es_is_hole(&es)) {
+			retval = 0;
+			down_read((&EXT4_I(inode)->i_data_sem));
+			goto add_delayed;
+		}
+
+		/*
+		 * Delayed extent could be allocated by fallocate.
+		 * So we need to check it.
+		 */
+		if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
+			map_bh(bh, inode->i_sb, invalid_block);
+			set_buffer_new(bh);
+			set_buffer_delay(bh);
+			return 0;
+		}
+
+		map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
+		retval = es.es_len - (iblock - es.es_lblk);
+		if (retval > map->m_len)
+			retval = map->m_len;
+		map->m_len = retval;
+		if (ext4_es_is_written(&es))
+			map->m_flags |= EXT4_MAP_MAPPED;
+		else if (ext4_es_is_unwritten(&es))
+			map->m_flags |= EXT4_MAP_UNWRITTEN;
+		else
+			BUG_ON(1);
+
+#ifdef ES_AGGRESSIVE_TEST
+		ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
+#endif
+		return retval;
+	}
+
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
@@ -1803,27 +1587,46 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 			map->m_flags |= EXT4_MAP_FROM_CLUSTER;
 		retval = 0;
 	} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
+		retval = ext4_ext_map_blocks(NULL, inode, map,
+					     EXT4_GET_BLOCKS_NO_PUT_HOLE);
 	else
-		retval = ext4_ind_map_blocks(NULL, inode, map, 0);
+		retval = ext4_ind_map_blocks(NULL, inode, map,
+					     EXT4_GET_BLOCKS_NO_PUT_HOLE);
 
+add_delayed:
 	if (retval == 0) {
+		int ret;
 		/*
 		 * XXX: __block_prepare_write() unmaps passed block,
 		 * is it OK?
 		 */
-		/* If the block was allocated from previously allocated cluster,
-		 * then we dont need to reserve it again. */
+		/*
+		 * If the block was allocated from previously allocated cluster,
+		 * then we don't need to reserve it again. However we still need
+		 * to reserve metadata for every block we're going to write.
+		 */
 		if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
-			retval = ext4_da_reserve_space(inode, iblock);
-			if (retval)
+			ret = ext4_da_reserve_space(inode, iblock);
+			if (ret) {
 				/* not enough space to reserve */
+				retval = ret;
 				goto out_unlock;
+			}
+		} else {
+			ret = ext4_da_reserve_metadata(inode, iblock);
+			if (ret) {
+				/* not enough space to reserve */
+				retval = ret;
+				goto out_unlock;
+			}
 		}
 
-		retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
-		if (retval)
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    ~0, EXTENT_STATUS_DELAYED);
+		if (ret) {
+			retval = ret;
 			goto out_unlock;
+		}
 
 		/* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
 		 * and it should not appear on the bh->b_state.
@@ -1833,6 +1636,25 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 		map_bh(bh, inode->i_sb, invalid_block);
 		set_buffer_new(bh);
 		set_buffer_delay(bh);
+	} else if (retval > 0) {
+		int ret;
+		unsigned long long status;
+
+#ifdef ES_AGGRESSIVE_TEST
+		if (retval != map->m_len) {
+			printk("ES len assertation failed for inode: %lu "
+			       "retval %d != map->m_len %d "
+			       "in %s (lookup)\n", inode->i_ino, retval,
+			       map->m_len, __func__);
+		}
+#endif
+
+		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+					    map->m_pblk, status);
+		if (ret != 0)
+			retval = ret;
 	}
 
 out_unlock:
@@ -1890,27 +1712,6 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 	return 0;
 }
 
-/*
- * This function is used as a standard get_block_t calback function
- * when there is no desire to allocate any blocks.  It is used as a
- * callback function for block_write_begin() and block_write_full_page().
- * These functions should only try to map a single block at a time.
- *
- * Since this function doesn't do block allocations even if the caller
- * requests it by passing in create=1, it is critically important that
- * any caller checks to make sure that any buffer heads are returned
- * by this function are either all already mapped or marked for
- * delayed allocation before calling  block_write_full_page().  Otherwise,
- * b_blocknr could be left unitialized, and the page write functions will
- * be taken by surprise.
- */
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
-				   struct buffer_head *bh_result, int create)
-{
-	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
-	return _ext4_get_block(inode, iblock, bh_result, 0);
-}
-
 static int bget_one(handle_t *handle, struct buffer_head *bh)
 {
 	get_bh(bh);
@@ -1955,7 +1756,8 @@ static int __ext4_journalled_writepage(struct page *page,
 	 * references to buffers so we are safe */
 	unlock_page(page);
 
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+				    ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		goto out;
@@ -2002,7 +1804,7 @@ out:
  * lock so we have to do some magic.
  *
  * This function can get called via...
- *   - ext4_da_writepages after taking page lock (have journal handle)
+ *   - ext4_writepages after taking page lock (have journal handle)
  *   - journal_submit_inode_data_buffers (no journal handle)
  *   - shrink_page_list via the kswapd/direct reclaim (no journal handle)
  *   - grab_page_cache when doing write_begin (have journal handle)
@@ -2035,11 +1837,12 @@ out:
 static int ext4_writepage(struct page *page,
 			  struct writeback_control *wbc)
 {
-	int ret = 0, commit_write = 0;
+	int ret = 0;
 	loff_t size;
 	unsigned int len;
 	struct buffer_head *page_bufs = NULL;
 	struct inode *inode = page->mapping->host;
+	struct ext4_io_submit io_submit;
 
 	trace_ext4_writepage(page);
 	size = i_size_read(inode);
@@ -2048,39 +1851,29 @@ static int ext4_writepage(struct page *page,
 	else
 		len = PAGE_CACHE_SIZE;
 
+	page_bufs = page_buffers(page);
 	/*
-	 * If the page does not have buffers (for whatever reason),
-	 * try to create them using __block_write_begin.  If this
-	 * fails, redirty the page and move on.
+	 * We cannot do block allocation or other extent handling in this
+	 * function. If there are buffers needing that, we have to redirty
+	 * the page. But we may reach here when we do a journal commit via
+	 * journal_submit_inode_data_buffers() and in that case we must write
+	 * allocated buffers to achieve data=ordered mode guarantees.
 	 */
-	if (!page_has_buffers(page)) {
-		if (__block_write_begin(page, 0, len,
-					noalloc_get_block_write)) {
-		redirty_page:
-			redirty_page_for_writepage(wbc, page);
+	if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+				   ext4_bh_delay_or_unwritten)) {
+		redirty_page_for_writepage(wbc, page);
+		if (current->flags & PF_MEMALLOC) {
+			/*
+			 * For memory cleaning there's no point in writing only
+			 * some buffers. So just bail out. Warn if we came here
+			 * from direct reclaim.
+			 */
+			WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
+							== PF_MEMALLOC);
 			unlock_page(page);
 			return 0;
 		}
-		commit_write = 1;
 	}
-	page_bufs = page_buffers(page);
-	if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-				   ext4_bh_delay_or_unwritten)) {
-		/*
-		 * We don't want to do block allocation, so redirty
-		 * the page and return.  We may reach here when we do
-		 * a journal commit via journal_submit_inode_data_buffers.
-		 * We can also reach here via shrink_page_list but it
-		 * should never be for direct reclaim so warn if that
-		 * happens
-		 */
-		WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
-								PF_MEMALLOC);
-		goto redirty_page;
-	}
-	if (commit_write)
-		/* now mark the buffer_heads as dirty and uptodate */
-		block_commit_write(page, 0, len);
 
 	if (PageChecked(page) && ext4_should_journal_data(inode))
 		/*
@@ -2089,81 +1882,405 @@ static int ext4_writepage(struct page *page,
 		 */
 		return __ext4_journalled_writepage(page, len);
 
-	if (buffer_uninit(page_bufs)) {
-		ext4_set_bh_endio(page_bufs, inode);
-		ret = block_write_full_page_endio(page, noalloc_get_block_write,
-					    wbc, ext4_end_io_buffer_write);
-	} else
-		ret = block_write_full_page(page, noalloc_get_block_write,
-					    wbc);
-
+	ext4_io_submit_init(&io_submit, wbc);
+	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
+	if (!io_submit.io_end) {
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return -ENOMEM;
+	}
+	ret = ext4_bio_write_page(&io_submit, page, len, wbc);
+	ext4_io_submit(&io_submit);
+	/* Drop io_end reference we got from init */
+	ext4_put_io_end_defer(io_submit.io_end);
 	return ret;
 }
 
+#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+
 /*
- * This is called via ext4_da_writepages() to
- * calculate the total number of credits to reserve to fit
- * a single extent allocation into a single transaction,
- * ext4_da_writpeages() will loop calling this before
- * the block allocation.
+ * mballoc gives us at most this number of blocks...
+ * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
+ * The rest of mballoc seems to handle chunks upto full group size.
  */
+#define MAX_WRITEPAGES_EXTENT_LEN 2048
 
-static int ext4_da_writepages_trans_blocks(struct inode *inode)
+/*
+ * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
+ *
+ * @mpd - extent of blocks
+ * @lblk - logical number of the block in the file
+ * @b_state - b_state of the buffer head added
+ *
+ * the function is used to collect contig. blocks in same state
+ */
+static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
+				  unsigned long b_state)
 {
-	int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+	struct ext4_map_blocks *map = &mpd->map;
 
+	/* Don't go larger than mballoc is willing to allocate */
+	if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
+		return 0;
+
+	/* First block in the extent? */
+	if (map->m_len == 0) {
+		map->m_lblk = lblk;
+		map->m_len = 1;
+		map->m_flags = b_state & BH_FLAGS;
+		return 1;
+	}
+
+	/* Can we merge the block to our big extent? */
+	if (lblk == map->m_lblk + map->m_len &&
+	    (b_state & BH_FLAGS) == map->m_flags) {
+		map->m_len++;
+		return 1;
+	}
+	return 0;
+}
+
+static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
+				    struct buffer_head *head,
+				    struct buffer_head *bh,
+				    ext4_lblk_t lblk)
+{
+	struct inode *inode = mpd->inode;
+	ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+							>> inode->i_blkbits;
+
+	do {
+		BUG_ON(buffer_locked(bh));
+
+		if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
+		    (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
+		    lblk >= blocks) {
+			/* Found extent to map? */
+			if (mpd->map.m_len)
+				return false;
+			if (lblk >= blocks)
+				return true;
+			continue;
+		}
+		if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
+			return false;
+	} while (lblk++, (bh = bh->b_this_page) != head);
+	return true;
+}
+
+static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
+{
+	int len;
+	loff_t size = i_size_read(mpd->inode);
+	int err;
+
+	BUG_ON(page->index != mpd->first_page);
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+	clear_page_dirty_for_io(page);
+	err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
+	if (!err)
+		mpd->wbc->nr_to_write--;
+	mpd->first_page++;
+
+	return err;
+}
+
+/*
+ * mpage_map_buffers - update buffers corresponding to changed extent and
+ *		       submit fully mapped pages for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ *
+ * Scan buffers corresponding to changed extent (we expect corresponding pages
+ * to be already locked) and update buffer state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits,
+ * and mark buffers as uninit when we perform writes to uninitialized extents
+ * and do extent conversion after IO is finished. If the last page is not fully
+ * mapped, we update @map to the next extent in the last page that needs
+ * mapping. Otherwise we submit the page for IO.
+ */
+static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
+{
+	struct pagevec pvec;
+	int nr_pages, i;
+	struct inode *inode = mpd->inode;
+	struct buffer_head *head, *bh;
+	int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+	ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
+							>> inode->i_blkbits;
+	pgoff_t start, end;
+	ext4_lblk_t lblk;
+	sector_t pblock;
+	int err;
+
+	start = mpd->map.m_lblk >> bpp_bits;
+	end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
+	lblk = start << bpp_bits;
+	pblock = mpd->map.m_pblk;
+
+	pagevec_init(&pvec, 0);
+	while (start <= end) {
+		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
+					  PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (page->index > end)
+				break;
+			/* Upto 'end' pages must be contiguous */
+			BUG_ON(page->index != start);
+			bh = head = page_buffers(page);
+			do {
+				if (lblk < mpd->map.m_lblk)
+					continue;
+				if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+					/*
+					 * Buffer after end of mapped extent.
+					 * Find next buffer in the page to map.
+					 */
+					mpd->map.m_len = 0;
+					mpd->map.m_flags = 0;
+					add_page_bufs_to_extent(mpd, head, bh,
+								lblk);
+					pagevec_release(&pvec);
+					return 0;
+				}
+				if (buffer_delay(bh)) {
+					clear_buffer_delay(bh);
+					bh->b_blocknr = pblock++;
+				}
+				clear_buffer_unwritten(bh);
+			} while (++lblk < blocks &&
+				 (bh = bh->b_this_page) != head);
+
+			/*
+			 * FIXME: This is going to break if dioread_nolock
+			 * supports blocksize < pagesize as we will try to
+			 * convert potentially unmapped parts of inode.
+			 */
+			mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+			/* Page fully mapped - let IO run! */
+			err = mpage_submit_page(mpd, page);
+			if (err < 0) {
+				pagevec_release(&pvec);
+				return err;
+			}
+			start++;
+		}
+		pagevec_release(&pvec);
+	}
+	/* Extent fully mapped and matches with page boundary. We are done. */
+	mpd->map.m_len = 0;
+	mpd->map.m_flags = 0;
+	return 0;
+}
+
+static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
+{
+	struct inode *inode = mpd->inode;
+	struct ext4_map_blocks *map = &mpd->map;
+	int get_blocks_flags;
+	int err;
+
+	trace_ext4_da_write_pages_extent(inode, map);
 	/*
-	 * With non-extent format the journal credit needed to
-	 * insert nrblocks contiguous block is dependent on
-	 * number of contiguous block. So we will limit
-	 * number of contiguous block to a sane value
+	 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
+	 * to convert an uninitialized extent to be initialized (in the case
+	 * where we have written into one or more preallocated blocks).  It is
+	 * possible that we're going to need more metadata blocks than
+	 * previously reserved. However we must not fail because we're in
+	 * writeback and there is nothing we can do about it so it might result
+	 * in data loss.  So use reserved blocks to allocate metadata if
+	 * possible.
+	 *
+	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
+	 * in question are delalloc blocks.  This affects functions in many
+	 * different parts of the allocation call path.  This flag exists
+	 * primarily because we don't want to change *many* call functions, so
+	 * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
+	 * once the inode's allocation semaphore is taken.
 	 */
-	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
-	    (max_blocks > EXT4_MAX_TRANS_DATA))
-		max_blocks = EXT4_MAX_TRANS_DATA;
+	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
+			   EXT4_GET_BLOCKS_METADATA_NOFAIL;
+	if (ext4_should_dioread_nolock(inode))
+		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
+	if (map->m_flags & (1 << BH_Delay))
+		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
 
-	return ext4_chunk_trans_blocks(inode, max_blocks);
+	err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
+	if (err < 0)
+		return err;
+	if (map->m_flags & EXT4_MAP_UNINIT) {
+		if (!mpd->io_submit.io_end->handle &&
+		    ext4_handle_valid(handle)) {
+			mpd->io_submit.io_end->handle = handle->h_rsv_handle;
+			handle->h_rsv_handle = NULL;
+		}
+		ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+	}
+
+	BUG_ON(map->m_len == 0);
+	if (map->m_flags & EXT4_MAP_NEW) {
+		struct block_device *bdev = inode->i_sb->s_bdev;
+		int i;
+
+		for (i = 0; i < map->m_len; i++)
+			unmap_underlying_metadata(bdev, map->m_pblk + i);
+	}
+	return 0;
 }
 
 /*
- * write_cache_pages_da - walk the list of dirty pages of the given
- * address space and accumulate pages that need writing, and call
- * mpage_da_map_and_submit to map a single contiguous memory region
- * and then write them.
+ * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
+ *				 mpd->len and submit pages underlying it for IO
+ *
+ * @handle - handle for journal operations
+ * @mpd - extent to map
+ *
+ * The function maps extent starting at mpd->lblk of length mpd->len. If it is
+ * delayed, blocks are allocated, if it is unwritten, we may need to convert
+ * them to initialized or split the described range from larger unwritten
+ * extent. Note that we need not map all the described range since allocation
+ * can return less blocks or the range is covered by more unwritten extents. We
+ * cannot map more because we are limited by reserved transaction credits. On
+ * the other hand we always make sure that the last touched page is fully
+ * mapped so that it can be written out (and thus forward progress is
+ * guaranteed). After mapping we submit all mapped pages for IO.
  */
-static int write_cache_pages_da(handle_t *handle,
-				struct address_space *mapping,
-				struct writeback_control *wbc,
-				struct mpage_da_data *mpd,
-				pgoff_t *done_index)
+static int mpage_map_and_submit_extent(handle_t *handle,
+				       struct mpage_da_data *mpd,
+				       bool *give_up_on_write)
 {
-	struct buffer_head	*bh, *head;
-	struct inode		*inode = mapping->host;
-	struct pagevec		pvec;
-	unsigned int		nr_pages;
-	sector_t		logical;
-	pgoff_t			index, end;
-	long			nr_to_write = wbc->nr_to_write;
-	int			i, tag, ret = 0;
-
-	memset(mpd, 0, sizeof(struct mpage_da_data));
-	mpd->wbc = wbc;
-	mpd->inode = inode;
-	pagevec_init(&pvec, 0);
-	index = wbc->range_start >> PAGE_CACHE_SHIFT;
-	end = wbc->range_end >> PAGE_CACHE_SHIFT;
+	struct inode *inode = mpd->inode;
+	struct ext4_map_blocks *map = &mpd->map;
+	int err;
+	loff_t disksize;
 
-	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+	mpd->io_submit.io_end->offset =
+				((loff_t)map->m_lblk) << inode->i_blkbits;
+	while (map->m_len) {
+		err = mpage_map_one_extent(handle, mpd);
+		if (err < 0) {
+			struct super_block *sb = inode->i_sb;
+
+			if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+				goto invalidate_dirty_pages;
+			/*
+			 * Let the uper layers retry transient errors.
+			 * In the case of ENOSPC, if ext4_count_free_blocks()
+			 * is non-zero, a commit should free up blocks.
+			 */
+			if ((err == -ENOMEM) ||
+			    (err == -ENOSPC && ext4_count_free_clusters(sb)))
+				return err;
+			ext4_msg(sb, KERN_CRIT,
+				 "Delayed block allocation failed for "
+				 "inode %lu at logical offset %llu with"
+				 " max blocks %u with error %d",
+				 inode->i_ino,
+				 (unsigned long long)map->m_lblk,
+				 (unsigned)map->m_len, -err);
+			ext4_msg(sb, KERN_CRIT,
+				 "This should not happen!! Data will "
+				 "be lost\n");
+			if (err == -ENOSPC)
+				ext4_print_free_blocks(inode);
+		invalidate_dirty_pages:
+			*give_up_on_write = true;
+			return err;
+		}
+		/*
+		 * Update buffer state, submit mapped pages, and get us new
+		 * extent to map
+		 */
+		err = mpage_map_and_submit_buffers(mpd);
+		if (err < 0)
+			return err;
+	}
+
+	/* Update on-disk size after IO is submitted */
+	disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+	if (disksize > i_size_read(inode))
+		disksize = i_size_read(inode);
+	if (disksize > EXT4_I(inode)->i_disksize) {
+		int err2;
+
+		ext4_update_i_disksize(inode, disksize);
+		err2 = ext4_mark_inode_dirty(handle, inode);
+		if (err2)
+			ext4_error(inode->i_sb,
+				   "Failed to mark inode %lu dirty",
+				   inode->i_ino);
+		if (!err)
+			err = err2;
+	}
+	return err;
+}
+
+/*
+ * Calculate the total number of credits to reserve for one writepages
+ * iteration. This is called from ext4_writepages(). We map an extent of
+ * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
+ * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
+ * bpp - 1 blocks in bpp different extents.
+ */
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+	int bpp = ext4_journal_blocks_per_page(inode);
+
+	return ext4_meta_trans_blocks(inode,
+				MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
+}
+
+/*
+ * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
+ * 				 and underlying extent to map
+ *
+ * @mpd - where to look for pages
+ *
+ * Walk dirty pages in the mapping. If they are fully mapped, submit them for
+ * IO immediately. When we find a page which isn't mapped we start accumulating
+ * extent of buffers underlying these pages that needs mapping (formed by
+ * either delayed or unwritten buffers). We also lock the pages containing
+ * these buffers. The extent found is returned in @mpd structure (starting at
+ * mpd->lblk with length mpd->len blocks).
+ *
+ * Note that this function can attach bios to one io_end structure which are
+ * neither logically nor physically contiguous. Although it may seem as an
+ * unnecessary complication, it is actually inevitable in blocksize < pagesize
+ * case as we need to track IO to all buffers underlying a page in one io_end.
+ */
+static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
+{
+	struct address_space *mapping = mpd->inode->i_mapping;
+	struct pagevec pvec;
+	unsigned int nr_pages;
+	pgoff_t index = mpd->first_page;
+	pgoff_t end = mpd->last_page;
+	int tag;
+	int i, err = 0;
+	int blkbits = mpd->inode->i_blkbits;
+	ext4_lblk_t lblk;
+	struct buffer_head *head;
+
+	if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
 		tag = PAGECACHE_TAG_TOWRITE;
 	else
 		tag = PAGECACHE_TAG_DIRTY;
 
-	*done_index = index;
+	pagevec_init(&pvec, 0);
+	mpd->map.m_len = 0;
+	mpd->next_page = index;
 	while (index <= end) {
 		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
 			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
-			return 0;
+			goto out;
 
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
@@ -2178,31 +2295,21 @@ static int write_cache_pages_da(handle_t *handle,
 			if (page->index > end)
 				goto out;
 
-			*done_index = page->index + 1;
-
-			/*
-			 * If we can't merge this page, and we have
-			 * accumulated an contiguous region, write it
-			 */
-			if ((mpd->next_page != page->index) &&
-			    (mpd->next_page != mpd->first_page)) {
-				mpage_da_map_and_submit(mpd);
-				goto ret_extent_tail;
-			}
+			/* If we can't merge this page, we are done. */
+			if (mpd->map.m_len > 0 && mpd->next_page != page->index)
+				goto out;
 
 			lock_page(page);
-
 			/*
-			 * If the page is no longer dirty, or its
-			 * mapping no longer corresponds to inode we
-			 * are writing (which means it has been
-			 * truncated or invalidated), or the page is
-			 * already under writeback and we are not
-			 * doing a data integrity writeback, skip the page
+			 * If the page is no longer dirty, or its mapping no
+			 * longer corresponds to inode we are writing (which
+			 * means it has been truncated or invalidated), or the
+			 * page is already under writeback and we are not doing
+			 * a data integrity writeback, skip the page
 			 */
 			if (!PageDirty(page) ||
 			    (PageWriteback(page) &&
-			     (wbc->sync_mode == WB_SYNC_NONE)) ||
+			     (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
 			    unlikely(page->mapping != mapping)) {
 				unlock_page(page);
 				continue;
@@ -2211,119 +2318,70 @@ static int write_cache_pages_da(handle_t *handle,
 			wait_on_page_writeback(page);
 			BUG_ON(PageWriteback(page));
 
-			/*
-			 * If we have inline data and arrive here, it means that
-			 * we will soon create the block for the 1st page, so
-			 * we'd better clear the inline data here.
-			 */
-			if (ext4_has_inline_data(inode)) {
-				BUG_ON(ext4_test_inode_state(inode,
-						EXT4_STATE_MAY_INLINE_DATA));
-				ext4_destroy_inline_data(handle, inode);
-			}
-
-			if (mpd->next_page != page->index)
+			if (mpd->map.m_len == 0)
 				mpd->first_page = page->index;
 			mpd->next_page = page->index + 1;
-			logical = (sector_t) page->index <<
-				(PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-			if (!page_has_buffers(page)) {
-				mpage_add_bh_to_extent(mpd, logical,
-						       PAGE_CACHE_SIZE,
-						       (1 << BH_Dirty) | (1 << BH_Uptodate));
-				if (mpd->io_done)
-					goto ret_extent_tail;
-			} else {
-				/*
-				 * Page with regular buffer heads,
-				 * just add all dirty ones
-				 */
-				head = page_buffers(page);
-				bh = head;
-				do {
-					BUG_ON(buffer_locked(bh));
-					/*
-					 * We need to try to allocate
-					 * unmapped blocks in the same page.
-					 * Otherwise we won't make progress
-					 * with the page in ext4_writepage
-					 */
-					if (ext4_bh_delay_or_unwritten(NULL, bh)) {
-						mpage_add_bh_to_extent(mpd, logical,
-								       bh->b_size,
-								       bh->b_state);
-						if (mpd->io_done)
-							goto ret_extent_tail;
-					} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
-						/*
-						 * mapped dirty buffer. We need
-						 * to update the b_state
-						 * because we look at b_state
-						 * in mpage_da_map_blocks.  We
-						 * don't update b_size because
-						 * if we find an unmapped
-						 * buffer_head later we need to
-						 * use the b_state flag of that
-						 * buffer_head.
-						 */
-						if (mpd->b_size == 0)
-							mpd->b_state = bh->b_state & BH_FLAGS;
-					}
-					logical++;
-				} while ((bh = bh->b_this_page) != head);
-			}
-
-			if (nr_to_write > 0) {
-				nr_to_write--;
-				if (nr_to_write == 0 &&
-				    wbc->sync_mode == WB_SYNC_NONE)
-					/*
-					 * We stop writing back only if we are
-					 * not doing integrity sync. In case of
-					 * integrity sync we have to keep going
-					 * because someone may be concurrently
-					 * dirtying pages, and we might have
-					 * synced a lot of newly appeared dirty
-					 * pages, but have not synced all of the
-					 * old dirty pages.
-					 */
+			/* Add all dirty buffers to mpd */
+			lblk = ((ext4_lblk_t)page->index) <<
+				(PAGE_CACHE_SHIFT - blkbits);
+			head = page_buffers(page);
+			if (!add_page_bufs_to_extent(mpd, head, head, lblk))
+				goto out;
+			/* So far everything mapped? Submit the page for IO. */
+			if (mpd->map.m_len == 0) {
+				err = mpage_submit_page(mpd, page);
+				if (err < 0)
 					goto out;
 			}
+
+			/*
+			 * Accumulated enough dirty pages? This doesn't apply
+			 * to WB_SYNC_ALL mode. For integrity sync we have to
+			 * keep going because someone may be concurrently
+			 * dirtying pages, and we might have synced a lot of
+			 * newly appeared dirty pages, but have not synced all
+			 * of the old dirty pages.
+			 */
+			if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
+			    mpd->next_page - mpd->first_page >=
+							mpd->wbc->nr_to_write)
+				goto out;
 		}
 		pagevec_release(&pvec);
 		cond_resched();
 	}
 	return 0;
-ret_extent_tail:
-	ret = MPAGE_DA_EXTENT_TAIL;
 out:
 	pagevec_release(&pvec);
-	cond_resched();
-	return ret;
+	return err;
 }
 
+static int __writepage(struct page *page, struct writeback_control *wbc,
+		       void *data)
+{
+	struct address_space *mapping = data;
+	int ret = ext4_writepage(page, wbc);
+	mapping_set_error(mapping, ret);
+	return ret;
+}
 
-static int ext4_da_writepages(struct address_space *mapping,
-			      struct writeback_control *wbc)
+static int ext4_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
 {
-	pgoff_t	index;
+	pgoff_t	writeback_index = 0;
+	long nr_to_write = wbc->nr_to_write;
 	int range_whole = 0;
+	int cycled = 1;
 	handle_t *handle = NULL;
 	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
-	int pages_written = 0;
-	unsigned int max_pages;
-	int range_cyclic, cycled = 1, io_done = 0;
-	int needed_blocks, ret = 0;
-	long desired_nr_to_write, nr_to_writebump = 0;
-	loff_t range_start = wbc->range_start;
+	int needed_blocks, rsv_blocks = 0, ret = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
-	pgoff_t done_index = 0;
-	pgoff_t end;
+	bool done;
 	struct blk_plug plug;
+	bool give_up_on_write = false;
 
-	trace_ext4_da_writepages(inode, wbc);
+	trace_ext4_writepages(inode, wbc);
 
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
@@ -2333,169 +2391,171 @@ static int ext4_da_writepages(struct address_space *mapping,
 	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
 
+	if (ext4_should_journal_data(inode)) {
+		struct blk_plug plug;
+		int ret;
+
+		blk_start_plug(&plug);
+		ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+		blk_finish_plug(&plug);
+		return ret;
+	}
+
 	/*
 	 * If the filesystem has aborted, it is read-only, so return
 	 * right away instead of dumping stack traces later on that
 	 * will obscure the real source of the problem.  We test
 	 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
 	 * the latter could be true if the filesystem is mounted
-	 * read-only, and in that case, ext4_da_writepages should
+	 * read-only, and in that case, ext4_writepages should
 	 * *never* be called, so if that ever happens, we would want
 	 * the stack trace.
 	 */
 	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
 		return -EROFS;
 
+	if (ext4_should_dioread_nolock(inode)) {
+		/*
+		 * We may need to convert upto one extent per block in
+		 * the page and we may dirty the inode.
+		 */
+		rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
+	}
+
+	/*
+	 * If we have inline data and arrive here, it means that
+	 * we will soon create the block for the 1st page, so
+	 * we'd better clear the inline data here.
+	 */
+	if (ext4_has_inline_data(inode)) {
+		/* Just inode will be modified... */
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out_writepages;
+		}
+		BUG_ON(ext4_test_inode_state(inode,
+				EXT4_STATE_MAY_INLINE_DATA));
+		ext4_destroy_inline_data(handle, inode);
+		ext4_journal_stop(handle);
+	}
+
 	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 		range_whole = 1;
 
-	range_cyclic = wbc->range_cyclic;
 	if (wbc->range_cyclic) {
-		index = mapping->writeback_index;
-		if (index)
+		writeback_index = mapping->writeback_index;
+		if (writeback_index)
 			cycled = 0;
-		wbc->range_start = index << PAGE_CACHE_SHIFT;
-		wbc->range_end  = LLONG_MAX;
-		wbc->range_cyclic = 0;
-		end = -1;
+		mpd.first_page = writeback_index;
+		mpd.last_page = -1;
 	} else {
-		index = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
-	}
-
-	/*
-	 * This works around two forms of stupidity.  The first is in
-	 * the writeback code, which caps the maximum number of pages
-	 * written to be 1024 pages.  This is wrong on multiple
-	 * levels; different architectues have a different page size,
-	 * which changes the maximum amount of data which gets
-	 * written.  Secondly, 4 megabytes is way too small.  XFS
-	 * forces this value to be 16 megabytes by multiplying
-	 * nr_to_write parameter by four, and then relies on its
-	 * allocator to allocate larger extents to make them
-	 * contiguous.  Unfortunately this brings us to the second
-	 * stupidity, which is that ext4's mballoc code only allocates
-	 * at most 2048 blocks.  So we force contiguous writes up to
-	 * the number of dirty blocks in the inode, or
-	 * sbi->max_writeback_mb_bump whichever is smaller.
-	 */
-	max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
-	if (!range_cyclic && range_whole) {
-		if (wbc->nr_to_write == LONG_MAX)
-			desired_nr_to_write = wbc->nr_to_write;
-		else
-			desired_nr_to_write = wbc->nr_to_write * 8;
-	} else
-		desired_nr_to_write = ext4_num_dirty_pages(inode, index,
-							   max_pages);
-	if (desired_nr_to_write > max_pages)
-		desired_nr_to_write = max_pages;
-
-	if (wbc->nr_to_write < desired_nr_to_write) {
-		nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
-		wbc->nr_to_write = desired_nr_to_write;
+		mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
+		mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
 	}
 
+	mpd.inode = inode;
+	mpd.wbc = wbc;
+	ext4_io_submit_init(&mpd.io_submit, wbc);
 retry:
 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
-		tag_pages_for_writeback(mapping, index, end);
-
+		tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+	done = false;
 	blk_start_plug(&plug);
-	while (!ret && wbc->nr_to_write > 0) {
+	while (!done && mpd.first_page <= mpd.last_page) {
+		/* For each extent of pages we use new io_end */
+		mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+		if (!mpd.io_submit.io_end) {
+			ret = -ENOMEM;
+			break;
+		}
 
 		/*
-		 * we  insert one extent at a time. So we need
-		 * credit needed for single extent allocation.
-		 * journalled mode is currently not supported
-		 * by delalloc
+		 * We have two constraints: We find one extent to map and we
+		 * must always write out whole page (makes a difference when
+		 * blocksize < pagesize) so that we don't block on IO when we
+		 * try to write out the rest of the page. Journalled mode is
+		 * not supported by delalloc.
 		 */
 		BUG_ON(ext4_should_journal_data(inode));
 		needed_blocks = ext4_da_writepages_trans_blocks(inode);
 
-		/* start a new transaction*/
-		handle = ext4_journal_start(inode, needed_blocks);
+		/* start a new transaction */
+		handle = ext4_journal_start_with_reserve(inode,
+				EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
 			       "%ld pages, ino %lu; err %d", __func__,
 				wbc->nr_to_write, inode->i_ino, ret);
-			blk_finish_plug(&plug);
-			goto out_writepages;
+			/* Release allocated io_end */
+			ext4_put_io_end(mpd.io_submit.io_end);
+			break;
 		}
 
-		/*
-		 * Now call write_cache_pages_da() to find the next
-		 * contiguous region of logical blocks that need
-		 * blocks to be allocated by ext4 and submit them.
-		 */
-		ret = write_cache_pages_da(handle, mapping,
-					   wbc, &mpd, &done_index);
-		/*
-		 * If we have a contiguous extent of pages and we
-		 * haven't done the I/O yet, map the blocks and submit
-		 * them for I/O.
-		 */
-		if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-			mpage_da_map_and_submit(&mpd);
-			ret = MPAGE_DA_EXTENT_TAIL;
+		trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
+		ret = mpage_prepare_extent_to_map(&mpd);
+		if (!ret) {
+			if (mpd.map.m_len)
+				ret = mpage_map_and_submit_extent(handle, &mpd,
+					&give_up_on_write);
+			else {
+				/*
+				 * We scanned the whole range (or exhausted
+				 * nr_to_write), submitted what was mapped and
+				 * didn't find anything needing mapping. We are
+				 * done.
+				 */
+				done = true;
+			}
 		}
-		trace_ext4_da_write_pages(inode, &mpd);
-		wbc->nr_to_write -= mpd.pages_written;
-
 		ext4_journal_stop(handle);
-
-		if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
-			/* commit the transaction which would
+		/* Submit prepared bio */
+		ext4_io_submit(&mpd.io_submit);
+		/* Unlock pages we didn't use */
+		mpage_release_unused_pages(&mpd, give_up_on_write);
+		/* Drop our io_end reference we got from init */
+		ext4_put_io_end(mpd.io_submit.io_end);
+
+		if (ret == -ENOSPC && sbi->s_journal) {
+			/*
+			 * Commit the transaction which would
 			 * free blocks released in the transaction
 			 * and try again
 			 */
 			jbd2_journal_force_commit_nested(sbi->s_journal);
 			ret = 0;
-		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
-			/*
-			 * Got one extent now try with rest of the pages.
-			 * If mpd.retval is set -EIO, journal is aborted.
-			 * So we don't need to write any more.
-			 */
-			pages_written += mpd.pages_written;
-			ret = mpd.retval;
-			io_done = 1;
-		} else if (wbc->nr_to_write)
-			/*
-			 * There is no more writeout needed
-			 * or we requested for a noblocking writeout
-			 * and we found the device congested
-			 */
+			continue;
+		}
+		/* Fatal error - ENOMEM, EIO... */
+		if (ret)
 			break;
 	}
 	blk_finish_plug(&plug);
-	if (!io_done && !cycled) {
+	if (!ret && !cycled) {
 		cycled = 1;
-		index = 0;
-		wbc->range_start = index << PAGE_CACHE_SHIFT;
-		wbc->range_end  = mapping->writeback_index - 1;
+		mpd.last_page = writeback_index - 1;
+		mpd.first_page = 0;
 		goto retry;
 	}
 
 	/* Update index */
-	wbc->range_cyclic = range_cyclic;
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		/*
-		 * set the writeback_index so that range_cyclic
+		 * Set the writeback_index so that range_cyclic
 		 * mode will write it back later
 		 */
-		mapping->writeback_index = done_index;
+		mapping->writeback_index = mpd.first_page;
 
 out_writepages:
-	wbc->nr_to_write -= nr_to_writebump;
-	wbc->range_start = range_start;
-	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+	trace_ext4_writepages_result(inode, wbc, ret,
+				     nr_to_write - wbc->nr_to_write);
 	return ret;
 }
 
 static int ext4_nonda_switch(struct super_block *sb)
 {
-	s64 free_blocks, dirty_blocks;
+	s64 free_clusters, dirty_clusters;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	/*
@@ -2506,21 +2566,18 @@ static int ext4_nonda_switch(struct super_block *sb)
 	 * Delalloc need an accurate free block accounting. So switch
 	 * to non delalloc when we are near to error range.
 	 */
-	free_blocks  = EXT4_C2B(sbi,
-		percpu_counter_read_positive(&sbi->s_freeclusters_counter));
-	dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+	free_clusters =
+		percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+	dirty_clusters =
+		percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
 	/*
 	 * Start pushing delalloc when 1/2 of free blocks are dirty.
 	 */
-	if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
-	    !writeback_in_progress(sb->s_bdi) &&
-	    down_read_trylock(&sb->s_umount)) {
-		writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
-		up_read(&sb->s_umount);
-	}
+	if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
+		try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
 
-	if (2 * free_blocks < 3 * dirty_blocks ||
-		free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
+	if (2 * free_clusters < 3 * dirty_clusters ||
+	    free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
 		/*
 		 * free block count is less than 150% of dirty blocks
 		 * or free blocks is less than watermark
@@ -2555,42 +2612,52 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 						      pos, len, flags,
 						      pagep, fsdata);
 		if (ret < 0)
-			goto out;
-		if (ret == 1) {
-			ret = 0;
-			goto out;
-		}
+			return ret;
+		if (ret == 1)
+			return 0;
 	}
 
-retry:
+	/*
+	 * grab_cache_page_write_begin() can take a long time if the
+	 * system is thrashing due to memory pressure, or if the page
+	 * is being written back.  So grab it first before we start
+	 * the transaction handle.  This also allows us to allocate
+	 * the page (if needed) without using GFP_NOFS.
+	 */
+retry_grab:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+	unlock_page(page);
+
 	/*
 	 * With delayed allocation, we don't log the i_disksize update
 	 * if there is delayed block allocation. But we still need
 	 * to journalling the i_disksize update if writes to the end
 	 * of file which has an already mapped buffer.
 	 */
-	handle = ext4_journal_start(inode, 1);
+retry_journal:
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1);
 	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
+		page_cache_release(page);
+		return PTR_ERR(handle);
 	}
-	/* We cannot recurse into the filesystem as the transaction is already
-	 * started */
-	flags |= AOP_FLAG_NOFS;
 
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	if (!page) {
+	lock_page(page);
+	if (page->mapping != mapping) {
+		/* The page got truncated from under us */
+		unlock_page(page);
+		page_cache_release(page);
 		ext4_journal_stop(handle);
-		ret = -ENOMEM;
-		goto out;
+		goto retry_grab;
 	}
-	*pagep = page;
+	/* In case writeback began while the page was unlocked */
+	wait_on_page_writeback(page);
 
 	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
 	if (ret < 0) {
 		unlock_page(page);
 		ext4_journal_stop(handle);
-		page_cache_release(page);
 		/*
 		 * block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
@@ -2598,11 +2665,16 @@ retry:
 		 */
 		if (pos + len > inode->i_size)
 			ext4_truncate_failed_write(inode);
+
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_journal;
+
+		page_cache_release(page);
+		return ret;
 	}
 
-	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-out:
+	*pagep = page;
 	return ret;
 }
 
@@ -2641,18 +2713,9 @@ static int ext4_da_write_end(struct file *file,
 	unsigned long start, end;
 	int write_mode = (int)(unsigned long)fsdata;
 
-	if (write_mode == FALL_BACK_TO_NONDELALLOC) {
-		switch (ext4_inode_journal_mode(inode)) {
-		case EXT4_INODE_ORDERED_DATA_MODE:
-			return ext4_ordered_write_end(file, mapping, pos,
-					len, copied, page, fsdata);
-		case EXT4_INODE_WRITEBACK_DATA_MODE:
-			return ext4_writeback_write_end(file, mapping, pos,
-					len, copied, page, fsdata);
-		default:
-			BUG();
-		}
-	}
+	if (write_mode == FALL_BACK_TO_NONDELALLOC)
+		return ext4_write_end(file, mapping, pos,
+				      len, copied, page, fsdata);
 
 	trace_ext4_da_write_end(inode, pos, len, copied);
 	start = pos & (PAGE_CACHE_SIZE - 1);
@@ -2698,7 +2761,8 @@ static int ext4_da_write_end(struct file *file,
 	return ret ? ret : copied;
 }
 
-static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
+				   unsigned int length)
 {
 	/*
 	 * Drop reserved blocks
@@ -2707,10 +2771,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
 	if (!page_has_buffers(page))
 		goto out;
 
-	ext4_da_page_release_reservation(page, offset);
+	ext4_da_page_release_reservation(page, offset, length);
 
 out:
-	ext4_invalidatepage(page, offset);
+	ext4_invalidatepage(page, offset, length);
 
 	return;
 }
@@ -2733,7 +2797,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
 	 * laptop_mode, not even desirable).  However, to do otherwise
 	 * would require replicating code paths in:
 	 *
-	 * ext4_da_writepages() ->
+	 * ext4_writepages() ->
 	 *    write_cache_pages() ---> (via passed in callback function)
 	 *        __mpage_da_writepage() -->
 	 *           mpage_add_bh_to_extent()
@@ -2858,63 +2922,40 @@ ext4_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
-static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
-	struct buffer_head *head, *bh;
-	unsigned int curr_off = 0;
-
-	if (!page_has_buffers(page))
-		return;
-	head = bh = page_buffers(page);
-	do {
-		if (offset <= curr_off && test_clear_buffer_uninit(bh)
-					&& bh->b_private) {
-			ext4_free_io_end(bh->b_private);
-			bh->b_private = NULL;
-			bh->b_end_io = NULL;
-		}
-		curr_off = curr_off + bh->b_size;
-		bh = bh->b_this_page;
-	} while (bh != head);
-}
-
-static void ext4_invalidatepage(struct page *page, unsigned long offset)
-{
-	trace_ext4_invalidatepage(page, offset);
-
-	/*
-	 * free any io_end structure allocated for buffers to be discarded
-	 */
-	if (ext4_should_dioread_nolock(page->mapping->host))
-		ext4_invalidatepage_free_endio(page, offset);
+	trace_ext4_invalidatepage(page, offset, length);
 
 	/* No journalling happens on data buffers when this function is used */
 	WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
 
-	block_invalidatepage(page, offset);
+	block_invalidatepage(page, offset, length);
 }
 
 static int __ext4_journalled_invalidatepage(struct page *page,
-					    unsigned long offset)
+					    unsigned int offset,
+					    unsigned int length)
 {
 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 
-	trace_ext4_journalled_invalidatepage(page, offset);
+	trace_ext4_journalled_invalidatepage(page, offset, length);
 
 	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		ClearPageChecked(page);
 
-	return jbd2_journal_invalidatepage(journal, page, offset);
+	return jbd2_journal_invalidatepage(journal, page, offset, length);
 }
 
 /* Wrapper for aops... */
 static void ext4_journalled_invalidatepage(struct page *page,
-					   unsigned long offset)
+					   unsigned int offset,
+					   unsigned int length)
 {
-	WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);
+	WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
 }
 
 static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -2923,8 +2964,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 
 	trace_ext4_releasepage(page);
 
-	WARN_ON(PageChecked(page));
-	if (!page_has_buffers(page))
+	/* Page has dirty journalled data -> cannot release */
+	if (PageChecked(page))
 		return 0;
 	if (journal)
 		return jbd2_journal_try_to_free_buffers(journal, page, wait);
@@ -2959,12 +3000,16 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 			    ssize_t size, void *private, int ret,
 			    bool is_async)
 {
-	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(iocb->ki_filp);
         ext4_io_end_t *io_end = iocb->private;
 
-	/* if not async direct IO or dio with 0 bytes write, just return */
-	if (!io_end || !size)
-		goto out;
+	/* if not async direct IO just return */
+	if (!io_end) {
+		inode_dio_done(inode);
+		if (is_async)
+			aio_complete(iocb, ret, 0);
+		return;
+	}
 
 	ext_debug("ext4_end_io_dio(): io_end 0x%p "
 		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -2972,84 +3017,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 		  size);
 
 	iocb->private = NULL;
-
-	/* if not aio dio with unwritten extents, just free io and return */
-	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
-		ext4_free_io_end(io_end);
-out:
-		if (is_async)
-			aio_complete(iocb, ret, 0);
-		inode_dio_done(inode);
-		return;
-	}
-
 	io_end->offset = offset;
 	io_end->size = size;
 	if (is_async) {
 		io_end->iocb = iocb;
 		io_end->result = ret;
 	}
-
-	ext4_add_complete_io(io_end);
-}
-
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
-{
-	ext4_io_end_t *io_end = bh->b_private;
-	struct inode *inode;
-
-	if (!test_clear_buffer_uninit(bh) || !io_end)
-		goto out;
-
-	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
-		ext4_msg(io_end->inode->i_sb, KERN_INFO,
-			 "sb umounted, discard end_io request for inode %lu",
-			 io_end->inode->i_ino);
-		ext4_free_io_end(io_end);
-		goto out;
-	}
-
-	/*
-	 * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
-	 * but being more careful is always safe for the future change.
-	 */
-	inode = io_end->inode;
-	ext4_set_io_unwritten_flag(inode, io_end);
-	ext4_add_complete_io(io_end);
-out:
-	bh->b_private = NULL;
-	bh->b_end_io = NULL;
-	clear_buffer_uninit(bh);
-	end_buffer_async_write(bh, uptodate);
-}
-
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
-{
-	ext4_io_end_t *io_end;
-	struct page *page = bh->b_page;
-	loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
-	size_t size = bh->b_size;
-
-retry:
-	io_end = ext4_init_io_end(inode, GFP_ATOMIC);
-	if (!io_end) {
-		pr_warn_ratelimited("%s: allocation fail\n", __func__);
-		schedule();
-		goto retry;
-	}
-	io_end->offset = offset;
-	io_end->size = size;
-	/*
-	 * We need to hold a reference to the page to make sure it
-	 * doesn't get evicted before ext4_end_io_work() has a chance
-	 * to convert the extent from written to unwritten.
-	 */
-	io_end->page = page;
-	get_page(io_end->page);
-
-	bh->b_private = io_end;
-	bh->b_end_io = ext4_end_io_buffer_write;
-	return 0;
+	ext4_put_io_end_defer(io_end);
 }
 
 /*
@@ -3083,6 +3057,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 	get_block_t *get_block_func = NULL;
 	int dio_flags = 0;
 	loff_t final_size = offset + count;
+	ext4_io_end_t *io_end = NULL;
 
 	/* Use the old path for reads and writes beyond i_size. */
 	if (rw != WRITE || final_size > inode->i_size)
@@ -3090,11 +3065,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 
 	BUG_ON(iocb->private == NULL);
 
+	/*
+	 * Make all waiters for direct IO properly wait also for extent
+	 * conversion. This also disallows race between truncate() and
+	 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+	 */
+	if (rw == WRITE)
+		atomic_inc(&inode->i_dio_count);
+
 	/* If we do a overwrite dio, i_mutex locking can be released */
 	overwrite = *((int *)iocb->private);
 
 	if (overwrite) {
-		atomic_inc(&inode->i_dio_count);
 		down_read(&EXT4_I(inode)->i_data_sem);
 		mutex_unlock(&inode->i_mutex);
 	}
@@ -3121,13 +3103,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 	iocb->private = NULL;
 	ext4_inode_aio_set(inode, NULL);
 	if (!is_sync_kiocb(iocb)) {
-		ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
+		io_end = ext4_init_io_end(inode, GFP_NOFS);
 		if (!io_end) {
 			ret = -ENOMEM;
 			goto retake_lock;
 		}
 		io_end->flag |= EXT4_IO_END_DIRECT;
-		iocb->private = io_end;
+		/*
+		 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
+		 */
+		iocb->private = ext4_get_io_end(io_end);
 		/*
 		 * we save the io structure for current async direct
 		 * IO, so that later ext4_map_blocks() could flag the
@@ -3151,33 +3136,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 				   NULL,
 				   dio_flags);
 
-	if (iocb->private)
-		ext4_inode_aio_set(inode, NULL);
 	/*
-	 * The io_end structure takes a reference to the inode, that
-	 * structure needs to be destroyed and the reference to the
-	 * inode need to be dropped, when IO is complete, even with 0
-	 * byte write, or failed.
-	 *
-	 * In the successful AIO DIO case, the io_end structure will
-	 * be destroyed and the reference to the inode will be dropped
-	 * after the end_io call back function is called.
-	 *
-	 * In the case there is 0 byte write, or error case, since VFS
-	 * direct IO won't invoke the end_io call back function, we
-	 * need to free the end_io structure here.
+	 * Put our reference to io_end. This can free the io_end structure e.g.
+	 * in sync IO case or in case of error. It can even perform extent
+	 * conversion if all bios we submitted finished before we got here.
+	 * Note that in that case iocb->private can be already set to NULL
+	 * here.
 	 */
-	if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
-		ext4_free_io_end(iocb->private);
-		iocb->private = NULL;
-	} else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+	if (io_end) {
+		ext4_inode_aio_set(inode, NULL);
+		ext4_put_io_end(io_end);
+		/*
+		 * When no IO was submitted ext4_end_io_dio() was not
+		 * called so we have to put iocb's reference.
+		 */
+		if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
+			WARN_ON(iocb->private != io_end);
+			WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+			WARN_ON(io_end->iocb);
+			/*
+			 * Generic code already did inode_dio_done() so we
+			 * have to clear EXT4_IO_END_DIRECT to not do it for
+			 * the second time.
+			 */
+			io_end->flag = 0;
+			ext4_put_io_end(io_end);
+			iocb->private = NULL;
+		}
+	}
+	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
 						EXT4_STATE_DIO_UNWRITTEN)) {
 		int err;
 		/*
 		 * for non AIO case, since the IO is already
 		 * completed, we could do the conversion right here
 		 */
-		err = ext4_convert_unwritten_extents(inode,
+		err = ext4_convert_unwritten_extents(NULL, inode,
 						     offset, ret);
 		if (err < 0)
 			ret = err;
@@ -3185,9 +3179,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 	}
 
 retake_lock:
+	if (rw == WRITE)
+		inode_dio_done(inode);
 	/* take i_mutex locking again if we do a ovewrite dio */
 	if (overwrite) {
-		inode_dio_done(inode);
 		up_read(&EXT4_I(inode)->i_data_sem);
 		mutex_lock(&inode->i_mutex);
 	}
@@ -3242,27 +3237,13 @@ static int ext4_journalled_set_page_dirty(struct page *page)
 	return __set_page_dirty_nobuffers(page);
 }
 
-static const struct address_space_operations ext4_ordered_aops = {
-	.readpage		= ext4_readpage,
-	.readpages		= ext4_readpages,
-	.writepage		= ext4_writepage,
-	.write_begin		= ext4_write_begin,
-	.write_end		= ext4_ordered_write_end,
-	.bmap			= ext4_bmap,
-	.invalidatepage		= ext4_invalidatepage,
-	.releasepage		= ext4_releasepage,
-	.direct_IO		= ext4_direct_IO,
-	.migratepage		= buffer_migrate_page,
-	.is_partially_uptodate  = block_is_partially_uptodate,
-	.error_remove_page	= generic_error_remove_page,
-};
-
-static const struct address_space_operations ext4_writeback_aops = {
+static const struct address_space_operations ext4_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
+	.writepages		= ext4_writepages,
 	.write_begin		= ext4_write_begin,
-	.write_end		= ext4_writeback_write_end,
+	.write_end		= ext4_write_end,
 	.bmap			= ext4_bmap,
 	.invalidatepage		= ext4_invalidatepage,
 	.releasepage		= ext4_releasepage,
@@ -3276,6 +3257,7 @@ static const struct address_space_operations ext4_journalled_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
+	.writepages		= ext4_writepages,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_journalled_write_end,
 	.set_page_dirty		= ext4_journalled_set_page_dirty,
@@ -3291,7 +3273,7 @@ static const struct address_space_operations ext4_da_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
-	.writepages		= ext4_da_writepages,
+	.writepages		= ext4_writepages,
 	.write_begin		= ext4_da_write_begin,
 	.write_end		= ext4_da_write_end,
 	.bmap			= ext4_bmap,
@@ -3307,108 +3289,73 @@ void ext4_set_aops(struct inode *inode)
 {
 	switch (ext4_inode_journal_mode(inode)) {
 	case EXT4_INODE_ORDERED_DATA_MODE:
-		if (test_opt(inode->i_sb, DELALLOC))
-			inode->i_mapping->a_ops = &ext4_da_aops;
-		else
-			inode->i_mapping->a_ops = &ext4_ordered_aops;
+		ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
 		break;
 	case EXT4_INODE_WRITEBACK_DATA_MODE:
-		if (test_opt(inode->i_sb, DELALLOC))
-			inode->i_mapping->a_ops = &ext4_da_aops;
-		else
-			inode->i_mapping->a_ops = &ext4_writeback_aops;
+		ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
 		break;
 	case EXT4_INODE_JOURNAL_DATA_MODE:
 		inode->i_mapping->a_ops = &ext4_journalled_aops;
-		break;
+		return;
 	default:
 		BUG();
 	}
+	if (test_opt(inode->i_sb, DELALLOC))
+		inode->i_mapping->a_ops = &ext4_da_aops;
+	else
+		inode->i_mapping->a_ops = &ext4_aops;
 }
 
-
 /*
- * ext4_discard_partial_page_buffers()
- * Wrapper function for ext4_discard_partial_page_buffers_no_lock.
- * This function finds and locks the page containing the offset
- * "from" and passes it to ext4_discard_partial_page_buffers_no_lock.
- * Calling functions that already have the page locked should call
- * ext4_discard_partial_page_buffers_no_lock directly.
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
  */
-int ext4_discard_partial_page_buffers(handle_t *handle,
-		struct address_space *mapping, loff_t from,
-		loff_t length, int flags)
+int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from)
 {
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned length;
+	unsigned blocksize;
 	struct inode *inode = mapping->host;
-	struct page *page;
-	int err = 0;
-
-	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
-				   mapping_gfp_mask(mapping) & ~__GFP_FS);
-	if (!page)
-		return -ENOMEM;
 
-	err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
-		from, length, flags);
+	blocksize = inode->i_sb->s_blocksize;
+	length = blocksize - (offset & (blocksize - 1));
 
-	unlock_page(page);
-	page_cache_release(page);
-	return err;
+	return ext4_block_zero_page_range(handle, mapping, from, length);
 }
 
 /*
- * ext4_discard_partial_page_buffers_no_lock()
- * Zeros a page range of length 'length' starting from offset 'from'.
- * Buffer heads that correspond to the block aligned regions of the
- * zeroed range will be unmapped.  Unblock aligned regions
- * will have the corresponding buffer head mapped if needed so that
- * that region of the page can be updated with the partial zero out.
- *
- * This function assumes that the page has already been  locked.  The
- * The range to be discarded must be contained with in the given page.
- * If the specified range exceeds the end of the page it will be shortened
- * to the end of the page that corresponds to 'from'.  This function is
- * appropriate for updating a page and it buffer heads to be unmapped and
- * zeroed for blocks that have been either released, or are going to be
- * released.
- *
- * handle: The journal handle
- * inode:  The files inode
- * page:   A locked page that contains the offset "from"
- * from:   The starting byte offset (from the beginning of the file)
- *         to begin discarding
- * len:    The length of bytes to discard
- * flags:  Optional flags that may be used:
- *
- *         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
- *         Only zero the regions of the page whose buffer heads
- *         have already been unmapped.  This flag is appropriate
- *         for updating the contents of a page whose blocks may
- *         have already been released, and we only want to zero
- *         out the regions that correspond to those released blocks.
- *
- * Returns zero on success or negative on failure.
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'.  The range to be zero'd must
+ * be contained with in one block.  If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that cooresponds to 'from'
  */
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-		struct inode *inode, struct page *page, loff_t from,
-		loff_t length, int flags)
+int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length)
 {
 	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
-	unsigned int offset = from & (PAGE_CACHE_SIZE-1);
-	unsigned int blocksize, max, pos;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned blocksize, max, pos;
 	ext4_lblk_t iblock;
+	struct inode *inode = mapping->host;
 	struct buffer_head *bh;
+	struct page *page;
 	int err = 0;
 
-	blocksize = inode->i_sb->s_blocksize;
-	max = PAGE_CACHE_SIZE - offset;
+	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+				   mapping_gfp_mask(mapping) & ~__GFP_FS);
+	if (!page)
+		return -ENOMEM;
 
-	if (index != page->index)
-		return -EINVAL;
+	blocksize = inode->i_sb->s_blocksize;
+	max = blocksize - (offset & (blocksize - 1));
 
 	/*
 	 * correct length if it does not fall between
-	 * 'from' and the end of the page
+	 * 'from' and the end of the block
 	 */
 	if (length > max || length < 0)
 		length = max;
@@ -3426,106 +3373,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
 		iblock++;
 		pos += blocksize;
 	}
-
-	pos = offset;
-	while (pos < offset + length) {
-		unsigned int end_of_block, range_to_discard;
-
-		err = 0;
-
-		/* The length of space left to zero and unmap */
-		range_to_discard = offset + length - pos;
-
-		/* The length of space until the end of the block */
-		end_of_block = blocksize - (pos & (blocksize-1));
-
-		/*
-		 * Do not unmap or zero past end of block
-		 * for this buffer head
-		 */
-		if (range_to_discard > end_of_block)
-			range_to_discard = end_of_block;
-
-
-		/*
-		 * Skip this buffer head if we are only zeroing unampped
-		 * regions of the page
-		 */
-		if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
-			buffer_mapped(bh))
-				goto next;
-
-		/* If the range is block aligned, unmap */
-		if (range_to_discard == blocksize) {
-			clear_buffer_dirty(bh);
-			bh->b_bdev = NULL;
-			clear_buffer_mapped(bh);
-			clear_buffer_req(bh);
-			clear_buffer_new(bh);
-			clear_buffer_delay(bh);
-			clear_buffer_unwritten(bh);
-			clear_buffer_uptodate(bh);
-			zero_user(page, pos, range_to_discard);
-			BUFFER_TRACE(bh, "Buffer discarded");
-			goto next;
-		}
-
-		/*
-		 * If this block is not completely contained in the range
-		 * to be discarded, then it is not going to be released. Because
-		 * we need to keep this block, we need to make sure this part
-		 * of the page is uptodate before we modify it by writeing
-		 * partial zeros on it.
-		 */
+	if (buffer_freed(bh)) {
+		BUFFER_TRACE(bh, "freed: skip");
+		goto unlock;
+	}
+	if (!buffer_mapped(bh)) {
+		BUFFER_TRACE(bh, "unmapped");
+		ext4_get_block(inode, iblock, bh, 0);
+		/* unmapped? It's a hole - nothing to do */
 		if (!buffer_mapped(bh)) {
-			/*
-			 * Buffer head must be mapped before we can read
-			 * from the block
-			 */
-			BUFFER_TRACE(bh, "unmapped");
-			ext4_get_block(inode, iblock, bh, 0);
-			/* unmapped? It's a hole - nothing to do */
-			if (!buffer_mapped(bh)) {
-				BUFFER_TRACE(bh, "still unmapped");
-				goto next;
-			}
+			BUFFER_TRACE(bh, "still unmapped");
+			goto unlock;
 		}
+	}
 
-		/* Ok, it's mapped. Make sure it's up-to-date */
-		if (PageUptodate(page))
-			set_buffer_uptodate(bh);
+	/* Ok, it's mapped. Make sure it's up-to-date */
+	if (PageUptodate(page))
+		set_buffer_uptodate(bh);
 
-		if (!buffer_uptodate(bh)) {
-			err = -EIO;
-			ll_rw_block(READ, 1, &bh);
-			wait_on_buffer(bh);
-			/* Uhhuh. Read error. Complain and punt.*/
-			if (!buffer_uptodate(bh))
-				goto next;
-		}
+	if (!buffer_uptodate(bh)) {
+		err = -EIO;
+		ll_rw_block(READ, 1, &bh);
+		wait_on_buffer(bh);
+		/* Uhhuh. Read error. Complain and punt. */
+		if (!buffer_uptodate(bh))
+			goto unlock;
+	}
+	if (ext4_should_journal_data(inode)) {
+		BUFFER_TRACE(bh, "get write access");
+		err = ext4_journal_get_write_access(handle, bh);
+		if (err)
+			goto unlock;
+	}
+	zero_user(page, offset, length);
+	BUFFER_TRACE(bh, "zeroed end of block");
 
-		if (ext4_should_journal_data(inode)) {
-			BUFFER_TRACE(bh, "get write access");
-			err = ext4_journal_get_write_access(handle, bh);
-			if (err)
-				goto next;
-		}
+	if (ext4_should_journal_data(inode)) {
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
+	} else {
+		err = 0;
+		mark_buffer_dirty(bh);
+		if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+			err = ext4_jbd2_file_inode(handle, inode);
+	}
 
-		zero_user(page, pos, range_to_discard);
+unlock:
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
 
-		err = 0;
-		if (ext4_should_journal_data(inode)) {
-			err = ext4_handle_dirty_metadata(handle, inode, bh);
-		} else
-			mark_buffer_dirty(bh);
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+			     loff_t lstart, loff_t length)
+{
+	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned partial_start, partial_end;
+	ext4_fsblk_t start, end;
+	loff_t byte_end = (lstart + length - 1);
+	int err = 0;
 
-		BUFFER_TRACE(bh, "Partial buffer zeroed");
-next:
-		bh = bh->b_this_page;
-		iblock++;
-		pos += range_to_discard;
-	}
+	partial_start = lstart & (sb->s_blocksize - 1);
+	partial_end = byte_end & (sb->s_blocksize - 1);
 
+	start = lstart >> sb->s_blocksize_bits;
+	end = byte_end >> sb->s_blocksize_bits;
+
+	/* Handle partial zero within the single block */
+	if (start == end &&
+	    (partial_start || (partial_end != sb->s_blocksize - 1))) {
+		err = ext4_block_zero_page_range(handle, mapping,
+						 lstart, length);
+		return err;
+	}
+	/* Handle partial zero out on the start of the range */
+	if (partial_start) {
+		err = ext4_block_zero_page_range(handle, mapping,
+						 lstart, sb->s_blocksize);
+		if (err)
+			return err;
+	}
+	/* Handle partial zero out on the end of the range */
+	if (partial_end != sb->s_blocksize - 1)
+		err = ext4_block_zero_page_range(handle, mapping,
+						 byte_end - partial_end,
+						 partial_end + 1);
 	return err;
 }
 
@@ -3551,23 +3483,128 @@ int ext4_can_truncate(struct inode *inode)
  * Returns: 0 on success or negative on failure
  */
 
-int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
+int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	ext4_lblk_t first_block, stop_block;
+	struct address_space *mapping = inode->i_mapping;
+	loff_t first_block_offset, last_block_offset;
+	handle_t *handle;
+	unsigned int credits;
+	int ret = 0;
+
 	if (!S_ISREG(inode->i_mode))
 		return -EOPNOTSUPP;
 
-	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-		/* TODO: Add support for non extent hole punching */
+	if (EXT4_SB(sb)->s_cluster_ratio > 1) {
+		/* TODO: Add support for bigalloc file systems */
 		return -EOPNOTSUPP;
 	}
 
-	if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
-		/* TODO: Add support for bigalloc file systems */
-		return -EOPNOTSUPP;
+	trace_ext4_punch_hole(inode, offset, length);
+
+	/*
+	 * Write out all dirty pages to avoid race conditions
+	 * Then release them.
+	 */
+	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		ret = filemap_write_and_wait_range(mapping, offset,
+						   offset + length - 1);
+		if (ret)
+			return ret;
+	}
+
+	mutex_lock(&inode->i_mutex);
+	/* It's not possible punch hole on append only file */
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+		ret = -EPERM;
+		goto out_mutex;
+	}
+	if (IS_SWAPFILE(inode)) {
+		ret = -ETXTBSY;
+		goto out_mutex;
+	}
+
+	/* No need to punch hole beyond i_size */
+	if (offset >= inode->i_size)
+		goto out_mutex;
+
+	/*
+	 * If the hole extends beyond i_size, set the hole
+	 * to end after the page that contains i_size
+	 */
+	if (offset + length > inode->i_size) {
+		length = inode->i_size +
+		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+		   offset;
+	}
+
+	first_block_offset = round_up(offset, sb->s_blocksize);
+	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
+
+	/* Now release the pages and zero block aligned part of pages*/
+	if (last_block_offset > first_block_offset)
+		truncate_pagecache_range(inode, first_block_offset,
+					 last_block_offset);
+
+	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		credits = ext4_writepage_trans_blocks(inode);
+	else
+		credits = ext4_blocks_for_truncate(inode);
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		ext4_std_error(sb, ret);
+		goto out_dio;
+	}
+
+	ret = ext4_zero_partial_blocks(handle, inode, offset,
+				       length);
+	if (ret)
+		goto out_stop;
+
+	first_block = (offset + sb->s_blocksize - 1) >>
+		EXT4_BLOCK_SIZE_BITS(sb);
+	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+	/* If there are no blocks to remove, return now */
+	if (first_block >= stop_block)
+		goto out_stop;
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ext4_discard_preallocations(inode);
+
+	ret = ext4_es_remove_extent(inode, first_block,
+				    stop_block - first_block);
+	if (ret) {
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto out_stop;
 	}
 
-	return ext4_ext_punch_hole(file, offset, length);
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		ret = ext4_ext_remove_space(inode, first_block,
+					    stop_block - 1);
+	else
+		ret = ext4_free_hole_blocks(handle, inode, first_block,
+					    stop_block);
+
+	ext4_discard_preallocations(inode);
+	up_write(&EXT4_I(inode)->i_data_sem);
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+out_stop:
+	ext4_journal_stop(handle);
+out_dio:
+	ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
 }
 
 /*
@@ -3600,6 +3637,18 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
  */
 void ext4_truncate(struct inode *inode)
 {
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned int credits;
+	handle_t *handle;
+	struct address_space *mapping = inode->i_mapping;
+
+	/*
+	 * There is a possibility that we're either freeing the inode
+	 * or it completely new indode. In those cases we might not
+	 * have i_mutex locked because it's not necessary.
+	 */
+	if (!(inode->i_state & (I_NEW|I_FREEING)))
+		WARN_ON(!mutex_is_locked(&inode->i_mutex));
 	trace_ext4_truncate_enter(inode);
 
 	if (!ext4_can_truncate(inode))
@@ -3619,9 +3668,59 @@ void ext4_truncate(struct inode *inode)
 	}
 
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-		ext4_ext_truncate(inode);
+		credits = ext4_writepage_trans_blocks(inode);
 	else
-		ext4_ind_truncate(inode);
+		credits = ext4_blocks_for_truncate(inode);
+
+	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+	if (IS_ERR(handle)) {
+		ext4_std_error(inode->i_sb, PTR_ERR(handle));
+		return;
+	}
+
+	if (inode->i_size & (inode->i_sb->s_blocksize - 1))
+		ext4_block_truncate_page(handle, mapping, inode->i_size);
+
+	/*
+	 * We add the inode to the orphan list, so that if this
+	 * truncate spans multiple transactions, and we crash, we will
+	 * resume the truncate when the filesystem recovers.  It also
+	 * marks the inode dirty, to catch the new size.
+	 *
+	 * Implication: the file must always be in a sane, consistent
+	 * truncatable state while each transaction commits.
+	 */
+	if (ext4_orphan_add(handle, inode))
+		goto out_stop;
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+
+	ext4_discard_preallocations(inode);
+
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		ext4_ext_truncate(handle, inode);
+	else
+		ext4_ind_truncate(handle, inode);
+
+	up_write(&ei->i_data_sem);
+
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+
+out_stop:
+	/*
+	 * If this was a simple ftruncate() and the file will remain alive,
+	 * then we need to clear up the orphan record which we created above.
+	 * However, if this was a real unlink then we were called by
+	 * ext4_delete_inode(), and we allow that function to clean up the
+	 * orphan info for us.
+	 */
+	if (inode->i_nlink)
+		ext4_orphan_del(handle, inode);
+
+	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_journal_stop(handle);
 
 	trace_ext4_truncate_exit(inode);
 }
@@ -3660,11 +3759,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
 	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
 
 	bh = sb_getblk(sb, block);
-	if (!bh) {
-		EXT4_ERROR_INODE_BLOCK(inode, block,
-				       "unable to read itable block");
-		return -EIO;
-	}
+	if (unlikely(!bh))
+		return -ENOMEM;
 	if (!buffer_uptodate(bh)) {
 		lock_buffer(bh);
 
@@ -3696,7 +3792,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
 
 			/* Is the inode bitmap in cache? */
 			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
-			if (!bitmap_bh)
+			if (unlikely(!bitmap_bh))
 				goto make_io;
 
 			/*
@@ -3732,13 +3828,14 @@ make_io:
 		if (EXT4_SB(sb)->s_inode_readahead_blks) {
 			ext4_fsblk_t b, end, table;
 			unsigned num;
+			__u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
 
 			table = ext4_inode_table(sb, gdp);
 			/* s_inode_readahead_blks is always a power of 2 */
-			b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
+			b = block & ~((ext4_fsblk_t) ra_blks - 1);
 			if (table > b)
 				b = table;
-			end = b + EXT4_SB(sb)->s_inode_readahead_blks;
+			end = b + ra_blks;
 			num = EXT4_INODES_PER_GROUP(sb);
 			if (ext4_has_group_desc_csum(sb))
 				num -= ext4_itable_unused_count(sb, gdp);
@@ -3935,8 +4032,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	 * NeilBrown 1999oct15
 	 */
 	if (inode->i_nlink == 0) {
-		if (inode->i_mode == 0 ||
-		    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+		if ((inode->i_mode == 0 ||
+		     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
+		    ino != EXT4_BOOT_LOADER_INO) {
 			/* this inode is deleted */
 			ret = -ESTALE;
 			goto bad_inode;
@@ -3944,7 +4042,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		/* The only unlinked inodes we let through here have
 		 * valid i_mode and are being read by the orphan
 		 * recovery code: that's fine, we're about to complete
-		 * the process of deleting those. */
+		 * the process of deleting those.
+		 * OR it is the EXT4_BOOT_LOADER_INO which is
+		 * not initialized on a new filesystem. */
 	}
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
 	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
@@ -4064,6 +4164,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		else
 			init_special_inode(inode, inode->i_mode,
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+	} else if (ino == EXT4_BOOT_LOADER_INO) {
+		make_bad_inode(inode);
 	} else {
 		ret = -EIO;
 		EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
@@ -4346,7 +4448,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
 				      inode->i_size >> PAGE_CACHE_SHIFT);
 		if (!page)
 			return;
-		ret = __ext4_journalled_invalidatepage(page, offset);
+		ret = __ext4_journalled_invalidatepage(page, offset,
+						PAGE_CACHE_SIZE - offset);
 		unlock_page(page);
 		page_cache_release(page);
 		if (ret != -EBUSY)
@@ -4404,8 +4507,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 
 		/* (user+group)*(old+new) structure, inode write (sb,
 		 * inode block, ? - but truncate inode update has it) */
-		handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
-					EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
+		handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+			(EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
+			 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			goto err_out;
@@ -4440,7 +4544,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	    (attr->ia_size < inode->i_size)) {
 		handle_t *handle;
 
-		handle = ext4_journal_start(inode, 3);
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			goto err_out;
@@ -4460,7 +4564,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 							    attr->ia_size);
 			if (error) {
 				/* Do as much error cleanup as possible */
-				handle = ext4_journal_start(inode, 3);
+				handle = ext4_journal_start(inode,
+							    EXT4_HT_INODE, 3);
 				if (IS_ERR(handle)) {
 					ext4_orphan_del(NULL, inode);
 					goto err_out;
@@ -4526,7 +4631,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		 struct kstat *stat)
 {
 	struct inode *inode;
-	unsigned long delalloc_blocks;
+	unsigned long long delalloc_blocks;
 
 	inode = dentry->d_inode;
 	generic_fillattr(inode, stat);
@@ -4544,15 +4649,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
 				EXT4_I(inode)->i_reserved_data_blocks);
 
-	stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+	stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
 	return 0;
 }
 
-static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
+				   int pextents)
 {
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
-		return ext4_ind_trans_blocks(inode, nrblocks, chunk);
-	return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
+		return ext4_ind_trans_blocks(inode, lblocks);
+	return ext4_ext_index_trans_blocks(inode, pextents);
 }
 
 /*
@@ -4566,7 +4672,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  *
  * Also account for superblock, inode, quota and xattr blocks
  */
-static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+				  int pextents)
 {
 	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
 	int gdpblocks;
@@ -4574,14 +4681,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 	int ret = 0;
 
 	/*
-	 * How many index blocks need to touch to modify nrblocks?
-	 * The "Chunk" flag indicating whether the nrblocks is
-	 * physically contiguous on disk
-	 *
-	 * For Direct IO and fallocate, they calls get_block to allocate
-	 * one single extent at a time, so they could set the "Chunk" flag
+	 * How many index blocks need to touch to map @lblocks logical blocks
+	 * to @pextents physical extents?
 	 */
-	idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
+	idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
 
 	ret = idxblocks;
 
@@ -4589,12 +4692,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 	 * Now let's see how many group bitmaps and group descriptors need
 	 * to account
 	 */
-	groups = idxblocks;
-	if (chunk)
-		groups += 1;
-	else
-		groups += nrblocks;
-
+	groups = idxblocks + pextents;
 	gdpblocks = groups;
 	if (groups > ngroups)
 		groups = ngroups;
@@ -4625,7 +4723,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
 	int bpp = ext4_journal_blocks_per_page(inode);
 	int ret;
 
-	ret = ext4_meta_trans_blocks(inode, bpp, 0);
+	ret = ext4_meta_trans_blocks(inode, bpp, bpp);
 
 	/* Account for data blocks for journalled mode */
 	if (ext4_should_journal_data(inode))
@@ -4801,7 +4899,7 @@ void ext4_dirty_inode(struct inode *inode, int flags)
 {
 	handle_t *handle;
 
-	handle = ext4_journal_start(inode, 2);
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
 	if (IS_ERR(handle))
 		goto out;
 
@@ -4902,7 +5000,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 
 	/* Finally we can mark the inode as dirty. */
 
-	handle = ext4_journal_start(inode, 1);
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -4926,7 +5024,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	unsigned long len;
 	int ret;
 	struct file *file = vma->vm_file;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct address_space *mapping = inode->i_mapping;
 	handle_t *handle;
 	get_block_t *get_block;
@@ -4968,7 +5066,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 					    0, len, NULL,
 					    ext4_bh_unmapped)) {
 			/* Wait so that we don't change page under IO */
-			wait_on_page_writeback(page);
+			wait_for_stable_page(page);
 			ret = VM_FAULT_LOCKED;
 			goto out;
 		}
@@ -4980,7 +5078,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	else
 		get_block = ext4_get_block;
 retry_alloc:
-	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+				    ext4_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = VM_FAULT_SIGBUS;
 		goto out;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5747f52f7c7..9491ac0590f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -17,12 +17,204 @@
 #include <asm/uaccess.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
+#include "ext4_extents.h"
 
 #define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
 
+/**
+ * Swap memory between @a and @b for @len bytes.
+ *
+ * @a:          pointer to first memory area
+ * @b:          pointer to second memory area
+ * @len:        number of bytes to swap
+ *
+ */
+static void memswap(void *a, void *b, size_t len)
+{
+	unsigned char *ap, *bp;
+	unsigned char tmp;
+
+	ap = (unsigned char *)a;
+	bp = (unsigned char *)b;
+	while (len-- > 0) {
+		tmp = *ap;
+		*ap = *bp;
+		*bp = tmp;
+		ap++;
+		bp++;
+	}
+}
+
+/**
+ * Swap i_data and associated attributes between @inode1 and @inode2.
+ * This function is used for the primary swap between inode1 and inode2
+ * and also to revert this primary swap in case of errors.
+ *
+ * Therefore you have to make sure, that calling this method twice
+ * will revert all changes.
+ *
+ * @inode1:     pointer to first inode
+ * @inode2:     pointer to second inode
+ */
+static void swap_inode_data(struct inode *inode1, struct inode *inode2)
+{
+	loff_t isize;
+	struct ext4_inode_info *ei1;
+	struct ext4_inode_info *ei2;
+
+	ei1 = EXT4_I(inode1);
+	ei2 = EXT4_I(inode2);
+
+	memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags));
+	memswap(&inode1->i_version, &inode2->i_version,
+		  sizeof(inode1->i_version));
+	memswap(&inode1->i_blocks, &inode2->i_blocks,
+		  sizeof(inode1->i_blocks));
+	memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes));
+	memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime));
+	memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime));
+
+	memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
+	memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
+	memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
+	memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree));
+	memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr));
+
+	isize = i_size_read(inode1);
+	i_size_write(inode1, i_size_read(inode2));
+	i_size_write(inode2, isize);
+}
+
+/**
+ * Swap the information from the given @inode and the inode
+ * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
+ * important fields of the inodes.
+ *
+ * @sb:         the super block of the filesystem
+ * @inode:      the inode to swap with EXT4_BOOT_LOADER_INO
+ *
+ */
+static long swap_inode_boot_loader(struct super_block *sb,
+				struct inode *inode)
+{
+	handle_t *handle;
+	int err;
+	struct inode *inode_bl;
+	struct ext4_inode_info *ei;
+	struct ext4_inode_info *ei_bl;
+	struct ext4_sb_info *sbi;
+
+	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
+		err = -EINVAL;
+		goto swap_boot_out;
+	}
+
+	if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto swap_boot_out;
+	}
+
+	sbi = EXT4_SB(sb);
+	ei = EXT4_I(inode);
+
+	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
+	if (IS_ERR(inode_bl)) {
+		err = PTR_ERR(inode_bl);
+		goto swap_boot_out;
+	}
+	ei_bl = EXT4_I(inode_bl);
+
+	filemap_flush(inode->i_mapping);
+	filemap_flush(inode_bl->i_mapping);
+
+	/* Protect orig inodes against a truncate and make sure,
+	 * that only 1 swap_inode_boot_loader is running. */
+	ext4_inode_double_lock(inode, inode_bl);
+
+	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages(&inode_bl->i_data, 0);
+
+	/* Wait for all existing dio workers */
+	ext4_inode_block_unlocked_dio(inode);
+	ext4_inode_block_unlocked_dio(inode_bl);
+	inode_dio_wait(inode);
+	inode_dio_wait(inode_bl);
+
+	handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
+	if (IS_ERR(handle)) {
+		err = -EINVAL;
+		goto swap_boot_out;
+	}
+
+	/* Protect extent tree against block allocations via delalloc */
+	ext4_double_down_write_data_sem(inode, inode_bl);
+
+	if (inode_bl->i_nlink == 0) {
+		/* this inode has never been used as a BOOT_LOADER */
+		set_nlink(inode_bl, 1);
+		i_uid_write(inode_bl, 0);
+		i_gid_write(inode_bl, 0);
+		inode_bl->i_flags = 0;
+		ei_bl->i_flags = 0;
+		inode_bl->i_version = 1;
+		i_size_write(inode_bl, 0);
+		inode_bl->i_mode = S_IFREG;
+		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+					      EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+			ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
+			ext4_ext_tree_init(handle, inode_bl);
+		} else
+			memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
+	}
+
+	swap_inode_data(inode, inode_bl);
+
+	inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
+
+	spin_lock(&sbi->s_next_gen_lock);
+	inode->i_generation = sbi->s_next_generation++;
+	inode_bl->i_generation = sbi->s_next_generation++;
+	spin_unlock(&sbi->s_next_gen_lock);
+
+	ext4_discard_preallocations(inode);
+
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (err < 0) {
+		ext4_warning(inode->i_sb,
+			"couldn't mark inode #%lu dirty (err %d)",
+			inode->i_ino, err);
+		/* Revert all changes: */
+		swap_inode_data(inode, inode_bl);
+	} else {
+		err = ext4_mark_inode_dirty(handle, inode_bl);
+		if (err < 0) {
+			ext4_warning(inode_bl->i_sb,
+				"couldn't mark inode #%lu dirty (err %d)",
+				inode_bl->i_ino, err);
+			/* Revert all changes: */
+			swap_inode_data(inode, inode_bl);
+			ext4_mark_inode_dirty(handle, inode);
+		}
+	}
+
+	ext4_journal_stop(handle);
+
+	ext4_double_up_write_data_sem(inode, inode_bl);
+
+	ext4_inode_resume_unlocked_dio(inode);
+	ext4_inode_resume_unlocked_dio(inode_bl);
+
+	ext4_inode_double_unlock(inode, inode_bl);
+
+	iput(inode_bl);
+
+swap_boot_out:
+	return err;
+}
+
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct super_block *sb = inode->i_sb;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int flags;
@@ -83,17 +275,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			if (!capable(CAP_SYS_RESOURCE))
 				goto flags_out;
 		}
-		if (oldflags & EXT4_EXTENTS_FL) {
-			/* We don't support clearning extent flags */
-			if (!(flags & EXT4_EXTENTS_FL)) {
-				err = -EOPNOTSUPP;
-				goto flags_out;
-			}
-		} else if (flags & EXT4_EXTENTS_FL) {
-			/* migrate the file */
+		if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
 			migrate = 1;
-			flags &= ~EXT4_EXTENTS_FL;
-		}
 
 		if (flags & EXT4_EOFBLOCKS_FL) {
 			/* we don't support adding EOFBLOCKS flag */
@@ -104,7 +287,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		} else if (oldflags & EXT4_EOFBLOCKS_FL)
 			ext4_truncate(inode);
 
-		handle = ext4_journal_start(inode, 1);
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 		if (IS_ERR(handle)) {
 			err = PTR_ERR(handle);
 			goto flags_out;
@@ -137,8 +320,13 @@ flags_err:
 			err = ext4_change_inode_journal_flag(inode, jflag);
 		if (err)
 			goto flags_out;
-		if (migrate)
-			err = ext4_ext_migrate(inode);
+		if (migrate) {
+			if (flags & EXT4_EXTENTS_FL)
+				err = ext4_ext_migrate(inode);
+			else
+				err = ext4_ind_migrate(inode);
+		}
+
 flags_out:
 		mutex_unlock(&inode->i_mutex);
 		mnt_drop_write_file(filp);
@@ -173,7 +361,7 @@ flags_out:
 		}
 
 		mutex_lock(&inode->i_mutex);
-		handle = ext4_journal_start(inode, 1);
+		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 		if (IS_ERR(handle)) {
 			err = PTR_ERR(handle);
 			goto unlock_out;
@@ -313,6 +501,9 @@ mext_out:
 		if (err == 0)
 			err = err2;
 		mnt_drop_write_file(filp);
+		if (!err && ext4_has_group_desc_csum(sb) &&
+		    test_opt(sb, INIT_INODE_TABLE))
+			err = ext4_register_li_request(sb, input.group);
 group_add_out:
 		ext4_resize_end(sb);
 		return err;
@@ -354,10 +545,15 @@ group_add_out:
 		return err;
 	}
 
+	case EXT4_IOC_SWAP_BOOT:
+		if (!(filp->f_mode & FMODE_WRITE))
+			return -EBADF;
+		return swap_inode_boot_loader(sb, inode);
+
 	case EXT4_IOC_RESIZE_FS: {
 		ext4_fsblk_t n_blocks_count;
-		struct super_block *sb = inode->i_sb;
 		int err = 0, err2 = 0;
+		ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
 
 		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
@@ -388,6 +584,11 @@ group_add_out:
 		if (err == 0)
 			err = err2;
 		mnt_drop_write_file(filp);
+		if (!err && (o_group > EXT4_SB(sb)->s_groups_count) &&
+		    ext4_has_group_desc_csum(sb) &&
+		    test_opt(sb, INIT_INODE_TABLE))
+			err = ext4_register_li_request(sb, o_group);
+
 resizefs_out:
 		ext4_resize_end(sb);
 		return err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1bf6fe785c4..a9ff5e5137c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -23,11 +23,18 @@
 
 #include "ext4_jbd2.h"
 #include "mballoc.h"
-#include <linux/debugfs.h>
 #include <linux/log2.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <trace/events/ext4.h>
 
+#ifdef CONFIG_EXT4_DEBUG
+ushort ext4_mballoc_debug __read_mostly;
+
+module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
+MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
+#endif
+
 /*
  * MUSTDO:
  *   - test ext4_ext_search_left() and ext4_ext_search_right()
@@ -398,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr)
 	ext4_clear_bit(bit, addr);
 }
 
+static inline int mb_test_and_clear_bit(int bit, void *addr)
+{
+	addr = mb_correct_addr_and_bit(&bit, addr);
+	return ext4_test_and_clear_bit(bit, addr);
+}
+
 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 {
 	int fix = 0, ret, tmpmax;
@@ -757,6 +770,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 	spin_unlock(&EXT4_SB(sb)->s_bal_lock);
 }
 
+static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+{
+	int count;
+	int order = 1;
+	void *buddy;
+
+	while ((buddy = mb_find_buddy(e4b, order++, &count))) {
+		ext4_set_bits(buddy, 0, count);
+	}
+	e4b->bd_info->bb_fragments = 0;
+	memset(e4b->bd_info->bb_counters, 0,
+		sizeof(*e4b->bd_info->bb_counters) *
+		(e4b->bd_sb->s_blocksize_bits + 2));
+
+	ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
+		e4b->bd_bitmap, e4b->bd_group);
+}
+
 /* The buddy information is attached the buddy cache inode
  * for convenience. The information regarding each group
  * is loaded via ext4_mb_load_buddy. The information involve
@@ -853,8 +884,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
 	first_block = page->index * blocks_per_page;
 	for (i = 0; i < blocks_per_page; i++) {
-		int group;
-
 		group = (first_block + i) >> 1;
 		if (group >= ngroups)
 			break;
@@ -1004,6 +1033,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 	struct page *page;
 	int ret = 0;
 
+	might_sleep();
 	mb_debug(1, "init group %u\n", group);
 	this_grp = ext4_get_group_info(sb, group);
 	/*
@@ -1075,6 +1105,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct inode *inode = sbi->s_buddy_cache;
 
+	might_sleep();
 	mb_debug(1, "load group %u\n", group);
 
 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
@@ -1237,6 +1268,33 @@ static void mb_clear_bits(void *bm, int cur, int len)
 	}
 }
 
+/* clear bits in given range
+ * will return first found zero bit if any, -1 otherwise
+ */
+static int mb_test_and_clear_bits(void *bm, int cur, int len)
+{
+	__u32 *addr;
+	int zero_bit = -1;
+
+	len = cur + len;
+	while (cur < len) {
+		if ((cur & 31) == 0 && (len - cur) >= 32) {
+			/* fast path: clear whole word at once */
+			addr = bm + (cur >> 3);
+			if (*addr != (__u32)(-1) && zero_bit == -1)
+				zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
+			*addr = 0;
+			cur += 32;
+			continue;
+		}
+		if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
+			zero_bit = cur;
+		cur++;
+	}
+
+	return zero_bit;
+}
+
 void ext4_set_bits(void *bm, int cur, int len)
 {
 	__u32 *addr;
@@ -1255,17 +1313,90 @@ void ext4_set_bits(void *bm, int cur, int len)
 	}
 }
 
+/*
+ * _________________________________________________________________ */
+
+static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
+{
+	if (mb_test_bit(*bit + side, bitmap)) {
+		mb_clear_bit(*bit, bitmap);
+		(*bit) -= side;
+		return 1;
+	}
+	else {
+		(*bit) += side;
+		mb_set_bit(*bit, bitmap);
+		return -1;
+	}
+}
+
+static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
+{
+	int max;
+	int order = 1;
+	void *buddy = mb_find_buddy(e4b, order, &max);
+
+	while (buddy) {
+		void *buddy2;
+
+		/* Bits in range [first; last] are known to be set since
+		 * corresponding blocks were allocated. Bits in range
+		 * (first; last) will stay set because they form buddies on
+		 * upper layer. We just deal with borders if they don't
+		 * align with upper layer and then go up.
+		 * Releasing entire group is all about clearing
+		 * single bit of highest order buddy.
+		 */
+
+		/* Example:
+		 * ---------------------------------
+		 * |   1   |   1   |   1   |   1   |
+		 * ---------------------------------
+		 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
+		 * ---------------------------------
+		 *   0   1   2   3   4   5   6   7
+		 *      \_____________________/
+		 *
+		 * Neither [1] nor [6] is aligned to above layer.
+		 * Left neighbour [0] is free, so mark it busy,
+		 * decrease bb_counters and extend range to
+		 * [0; 6]
+		 * Right neighbour [7] is busy. It can't be coaleasced with [6], so
+		 * mark [6] free, increase bb_counters and shrink range to
+		 * [0; 5].
+		 * Then shift range to [0; 2], go up and do the same.
+		 */
+
+
+		if (first & 1)
+			e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
+		if (!(last & 1))
+			e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
+		if (first > last)
+			break;
+		order++;
+
+		if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
+			mb_clear_bits(buddy, first, last - first + 1);
+			e4b->bd_info->bb_counters[order - 1] += last - first + 1;
+			break;
+		}
+		first >>= 1;
+		last >>= 1;
+		buddy = buddy2;
+	}
+}
+
 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
-			  int first, int count)
+			   int first, int count)
 {
-	int block = 0;
-	int max = 0;
-	int order;
-	void *buddy;
-	void *buddy2;
+	int left_is_free = 0;
+	int right_is_free = 0;
+	int block;
+	int last = first + count - 1;
 	struct super_block *sb = e4b->bd_sb;
 
-	BUG_ON(first + count > (sb->s_blocksize << 3));
+	BUG_ON(last >= (sb->s_blocksize << 3));
 	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
 	mb_check_buddy(e4b);
 	mb_free_blocks_double(inode, e4b, first, count);
@@ -1274,67 +1405,54 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 	if (first < e4b->bd_info->bb_first_free)
 		e4b->bd_info->bb_first_free = first;
 
-	/* let's maintain fragments counter */
+	/* access memory sequentially: check left neighbour,
+	 * clear range and then check right neighbour
+	 */
 	if (first != 0)
-		block = !mb_test_bit(first - 1, e4b->bd_bitmap);
-	if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
-		max = !mb_test_bit(first + count, e4b->bd_bitmap);
-	if (block && max)
-		e4b->bd_info->bb_fragments--;
-	else if (!block && !max)
-		e4b->bd_info->bb_fragments++;
+		left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
+	block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
+	if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
+		right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
 
-	/* let's maintain buddy itself */
-	while (count-- > 0) {
-		block = first++;
-		order = 0;
-
-		if (!mb_test_bit(block, e4b->bd_bitmap)) {
-			ext4_fsblk_t blocknr;
-
-			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
-			blocknr += EXT4_C2B(EXT4_SB(sb), block);
-			ext4_grp_locked_error(sb, e4b->bd_group,
-					      inode ? inode->i_ino : 0,
-					      blocknr,
-					      "freeing already freed block "
-					      "(bit %u)", block);
-		}
-		mb_clear_bit(block, e4b->bd_bitmap);
-		e4b->bd_info->bb_counters[order]++;
+	if (unlikely(block != -1)) {
+		ext4_fsblk_t blocknr;
 
-		/* start of the buddy */
-		buddy = mb_find_buddy(e4b, order, &max);
-
-		do {
-			block &= ~1UL;
-			if (mb_test_bit(block, buddy) ||
-					mb_test_bit(block + 1, buddy))
-				break;
-
-			/* both the buddies are free, try to coalesce them */
-			buddy2 = mb_find_buddy(e4b, order + 1, &max);
+		blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
+		blocknr += EXT4_C2B(EXT4_SB(sb), block);
+		ext4_grp_locked_error(sb, e4b->bd_group,
+				      inode ? inode->i_ino : 0,
+				      blocknr,
+				      "freeing already freed block "
+				      "(bit %u)", block);
+		mb_regenerate_buddy(e4b);
+		goto done;
+	}
 
-			if (!buddy2)
-				break;
+	/* let's maintain fragments counter */
+	if (left_is_free && right_is_free)
+		e4b->bd_info->bb_fragments--;
+	else if (!left_is_free && !right_is_free)
+		e4b->bd_info->bb_fragments++;
 
-			if (order > 0) {
-				/* for special purposes, we don't set
-				 * free bits in bitmap */
-				mb_set_bit(block, buddy);
-				mb_set_bit(block + 1, buddy);
-			}
-			e4b->bd_info->bb_counters[order]--;
-			e4b->bd_info->bb_counters[order]--;
+	/* buddy[0] == bd_bitmap is a special case, so handle
+	 * it right away and let mb_buddy_mark_free stay free of
+	 * zero order checks.
+	 * Check if neighbours are to be coaleasced,
+	 * adjust bitmap bb_counters and borders appropriately.
+	 */
+	if (first & 1) {
+		first += !left_is_free;
+		e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
+	}
+	if (!(last & 1)) {
+		last -= !right_is_free;
+		e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
+	}
 
-			block = block >> 1;
-			order++;
-			e4b->bd_info->bb_counters[order]++;
+	if (first <= last)
+		mb_buddy_mark_free(e4b, first >> 1, last >> 1);
 
-			mb_clear_bit(block, buddy2);
-			buddy = buddy2;
-		} while (1);
-	}
+done:
 	mb_set_largest_free_order(sb, e4b->bd_info);
 	mb_check_buddy(e4b);
 }
@@ -1884,15 +2002,19 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	case 0:
 		BUG_ON(ac->ac_2order == 0);
 
-		if (grp->bb_largest_free_order < ac->ac_2order)
-			return 0;
-
 		/* Avoid using the first bg of a flexgroup for data files */
 		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
 		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
 		    ((group % flex_size) == 0))
 			return 0;
 
+		if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
+		    (free / fragments) >= ac->ac_g_ex.fe_len)
+			return 1;
+
+		if (grp->bb_largest_free_order < ac->ac_2order)
+			return 0;
+
 		return 1;
 	case 1:
 		if ((free / fragments) >= ac->ac_g_ex.fe_len)
@@ -1983,7 +2105,12 @@ repeat:
 		group = ac->ac_g_ex.fe_group;
 
 		for (i = 0; i < ngroups; group++, i++) {
-			if (group == ngroups)
+			cond_resched();
+			/*
+			 * Artificially restricted ngroups for non-extent
+			 * files makes group > ngroups possible on first loop.
+			 */
+			if (group >= ngroups)
 				group = 0;
 
 			/* This now checks without needing the buddy page */
@@ -2007,7 +2134,7 @@ repeat:
 			}
 
 			ac->ac_groups_scanned++;
-			if (cr == 0)
+			if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
 				ext4_mb_simple_scan_group(ac, &e4b);
 			else if (cr == 1 && sbi->s_stripe &&
 					!(ac->ac_g_ex.fe_len % sbi->s_stripe))
@@ -2138,7 +2265,7 @@ static const struct seq_operations ext4_mb_seq_groups_ops = {
 
 static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
 {
-	struct super_block *sb = PDE(inode)->data;
+	struct super_block *sb = PDE_DATA(inode);
 	int rc;
 
 	rc = seq_open(file, &ext4_mb_seq_groups_ops);
@@ -2656,40 +2783,6 @@ static void ext4_free_data_callback(struct super_block *sb,
 	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
 }
 
-#ifdef CONFIG_EXT4_DEBUG
-u8 mb_enable_debug __read_mostly;
-
-static struct dentry *debugfs_dir;
-static struct dentry *debugfs_debug;
-
-static void __init ext4_create_debugfs_entry(void)
-{
-	debugfs_dir = debugfs_create_dir("ext4", NULL);
-	if (debugfs_dir)
-		debugfs_debug = debugfs_create_u8("mballoc-debug",
-						  S_IRUGO | S_IWUSR,
-						  debugfs_dir,
-						  &mb_enable_debug);
-}
-
-static void ext4_remove_debugfs_entry(void)
-{
-	debugfs_remove(debugfs_debug);
-	debugfs_remove(debugfs_dir);
-}
-
-#else
-
-static void __init ext4_create_debugfs_entry(void)
-{
-}
-
-static void ext4_remove_debugfs_entry(void)
-{
-}
-
-#endif
-
 int __init ext4_init_mballoc(void)
 {
 	ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
@@ -2711,7 +2804,6 @@ int __init ext4_init_mballoc(void)
 		kmem_cache_destroy(ext4_ac_cachep);
 		return -ENOMEM;
 	}
-	ext4_create_debugfs_entry();
 	return 0;
 }
 
@@ -2726,7 +2818,6 @@ void ext4_exit_mballoc(void)
 	kmem_cache_destroy(ext4_ac_cachep);
 	kmem_cache_destroy(ext4_free_data_cachep);
 	ext4_groupinfo_destroy_slabs();
-	ext4_remove_debugfs_entry();
 }
 
 
@@ -2829,8 +2920,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi,
 							  ac->ac_b_ex.fe_group);
-		atomic_sub(ac->ac_b_ex.fe_len,
-			   &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic64_sub(ac->ac_b_ex.fe_len,
+			     &sbi->s_flex_groups[flex_group].free_clusters);
 	}
 
 	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3367,7 +3458,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 	if (pa->pa_type == MB_GROUP_PA)
 		grp_blk--;
 
-	ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
+	grp = ext4_get_group_number(sb, grp_blk);
 
 	/*
 	 * possible race:
@@ -3444,7 +3535,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 			win = offs;
 
 		ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
-			EXT4_B2C(sbi, win);
+			EXT4_NUM_B2C(sbi, win);
 		BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
 		BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
 	}
@@ -3717,11 +3808,7 @@ repeat:
 	if (free < needed && busy) {
 		busy = 0;
 		ext4_unlock_group(sb, group);
-		/*
-		 * Yield the CPU here so that we don't get soft lockup
-		 * in non preempt case.
-		 */
-		yield();
+		cond_resched();
 		goto repeat;
 	}
 
@@ -3836,7 +3923,7 @@ repeat:
 
 	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
 		BUG_ON(pa->pa_type != MB_INODE_PA);
-		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+		group = ext4_get_group_number(sb, pa->pa_pstart);
 
 		err = ext4_mb_load_buddy(sb, group, &e4b);
 		if (err) {
@@ -3872,7 +3959,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 	struct super_block *sb = ac->ac_sb;
 	ext4_group_t ngroups, i;
 
-	if (!mb_enable_debug ||
+	if (!ext4_mballoc_debug ||
 	    (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
 		return;
 
@@ -4005,8 +4092,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 	len = ar->len;
 
 	/* just a dirty hack to filter too big requests  */
-	if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10)
-		len = EXT4_CLUSTERS_PER_GROUP(sb) - 10;
+	if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
+		len = EXT4_CLUSTERS_PER_GROUP(sb);
 
 	/* start searching from the goal */
 	goal = ar->goal;
@@ -4098,7 +4185,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 
 	list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
 
-		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+		group = ext4_get_group_number(sb, pa->pa_pstart);
 		if (ext4_mb_load_buddy(sb, group, &e4b)) {
 			ext4_error(sb, "Error loading buddy information for %u",
 					group);
@@ -4136,7 +4223,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
 		/* The max size of hash table is PREALLOC_TB_SIZE */
 		order = PREALLOC_TB_SIZE - 1;
 	/* Add the prealloc space to lg */
-	rcu_read_lock();
+	spin_lock(&lg->lg_prealloc_lock);
 	list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
 						pa_inode_list) {
 		spin_lock(&tmp_pa->pa_lock);
@@ -4160,12 +4247,12 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
 	if (!added)
 		list_add_tail_rcu(&pa->pa_inode_list,
 					&lg->lg_prealloc_list[order]);
-	rcu_read_unlock();
+	spin_unlock(&lg->lg_prealloc_lock);
 
 	/* Now trim the list to be not more than 8 elements */
 	if (lg_prealloc_count > 8) {
 		ext4_mb_discard_lg_preallocations(sb, lg,
-						order, lg_prealloc_count);
+						  order, lg_prealloc_count);
 		return;
 	}
 	return ;
@@ -4246,6 +4333,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	unsigned int inquota = 0;
 	unsigned int reserv_clstrs = 0;
 
+	might_sleep();
 	sb = ar->inode->i_sb;
 	sbi = EXT4_SB(sb);
 
@@ -4271,7 +4359,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 			ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
 
 			/* let others to free the space */
-			yield();
+			cond_resched();
 			ar->len = ar->len >> 1;
 		}
 		if (!ar->len) {
@@ -4318,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 repeat:
 		/* allocate space in core */
 		*errp = ext4_mb_regular_allocator(ac);
-		if (*errp) {
-			ext4_discard_allocated_blocks(ac);
-			goto errout;
-		}
+		if (*errp)
+			goto discard_and_exit;
 
 		/* as we've just preallocated more space than
-		 * user requested orinally, we store allocated
+		 * user requested originally, we store allocated
 		 * space in a special descriptor */
 		if (ac->ac_status == AC_STATUS_FOUND &&
-				ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
-			ext4_mb_new_preallocation(ac);
+		    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+			*errp = ext4_mb_new_preallocation(ac);
+		if (*errp) {
+		discard_and_exit:
+			ext4_discard_allocated_blocks(ac);
+			goto errout;
+		}
 	}
 	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
 		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4449,11 +4540,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	node = rb_prev(new_node);
 	if (node) {
 		entry = rb_entry(node, struct ext4_free_data, efd_node);
-		if (can_merge(entry, new_entry)) {
+		if (can_merge(entry, new_entry) &&
+		    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
 			new_entry->efd_start_cluster = entry->efd_start_cluster;
 			new_entry->efd_count += entry->efd_count;
 			rb_erase(node, &(db->bb_free_root));
-			ext4_journal_callback_del(handle, &entry->efd_jce);
 			kmem_cache_free(ext4_free_data_cachep, entry);
 		}
 	}
@@ -4461,10 +4552,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	node = rb_next(new_node);
 	if (node) {
 		entry = rb_entry(node, struct ext4_free_data, efd_node);
-		if (can_merge(new_entry, entry)) {
+		if (can_merge(new_entry, entry) &&
+		    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
 			new_entry->efd_count += entry->efd_count;
 			rb_erase(node, &(db->bb_free_root));
-			ext4_journal_callback_del(handle, &entry->efd_jce);
 			kmem_cache_free(ext4_free_data_cachep, entry);
 		}
 	}
@@ -4489,7 +4580,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	struct buffer_head *bitmap_bh = NULL;
 	struct super_block *sb = inode->i_sb;
 	struct ext4_group_desc *gdp;
-	unsigned long freed = 0;
 	unsigned int overflow;
 	ext4_grpblk_t bit;
 	struct buffer_head *gd_bh;
@@ -4500,6 +4590,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	int err = 0;
 	int ret;
 
+	might_sleep();
 	if (bh) {
 		if (block)
 			BUG_ON(block != bh->b_blocknr);
@@ -4525,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 		BUG_ON(bh && (count > 1));
 
 		for (i = 0; i < count; i++) {
+			cond_resched();
 			if (!bh)
 				tbh = sb_find_get_block(inode->i_sb,
 							block + i);
-			if (unlikely(!tbh))
+			if (!tbh)
 				continue;
 			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
 				    inode, tbh, block + i);
@@ -4590,7 +4682,7 @@ do_more:
 			EXT4_BLOCKS_PER_GROUP(sb);
 		count -= overflow;
 	}
-	count_clusters = EXT4_B2C(sbi, count);
+	count_clusters = EXT4_NUM_B2C(sbi, count);
 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
 	if (!bitmap_bh) {
 		err = -EIO;
@@ -4691,14 +4783,12 @@ do_more:
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		atomic_add(count_clusters,
-			   &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic64_add(count_clusters,
+			     &sbi->s_flex_groups[flex_group].free_clusters);
 	}
 
 	ext4_mb_unload_buddy(&e4b);
 
-	freed += count;
-
 	if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
 		dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
 
@@ -4832,12 +4922,12 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
 	ext4_group_desc_csum_set(sb, block_group, desc);
 	ext4_unlock_group(sb, block_group);
 	percpu_counter_add(&sbi->s_freeclusters_counter,
-			   EXT4_B2C(sbi, blocks_freed));
+			   EXT4_NUM_B2C(sbi, blocks_freed));
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		atomic_add(EXT4_B2C(sbi, blocks_freed),
-			   &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
+			     &sbi->s_flex_groups[flex_group].free_clusters);
 	}
 
 	ext4_mb_unload_buddy(&e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 3ccd889ba95..08481ee84cd 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,11 +37,11 @@
 /*
  */
 #ifdef CONFIG_EXT4_DEBUG
-extern u8 mb_enable_debug;
+extern ushort ext4_mballoc_debug;
 
 #define mb_debug(n, fmt, a...)	                                        \
 	do {								\
-		if ((n) <= mb_enable_debug) {		        	\
+		if ((n) <= ext4_mballoc_debug) {		        \
 			printk(KERN_DEBUG "(%s, %d): %s: ",		\
 			       __FILE__, __LINE__, __func__);		\
 			printk(fmt, ## a);				\
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index db8226d595f..49e8bdff916 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
 			return retval;
 	}
 	return retval;
-
 }
 
 int ext4_ext_migrate(struct inode *inode)
@@ -456,11 +455,14 @@ int ext4_ext_migrate(struct inode *inode)
 		 */
 		return retval;
 
-	handle = ext4_journal_start(inode,
-					EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
-					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
-					+ 1);
+	/*
+	 * Worst case we can touch the allocation bitmaps, a bgd
+	 * block, and a block to link in the orphan list.  We do need
+	 * need to worry about credits for modifying the quota inode.
+	 */
+	handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
+		4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
+
 	if (IS_ERR(handle)) {
 		retval = PTR_ERR(handle);
 		return retval;
@@ -507,7 +509,7 @@ int ext4_ext_migrate(struct inode *inode)
 	ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
 	up_read((&EXT4_I(inode)->i_data_sem));
 
-	handle = ext4_journal_start(inode, 1);
+	handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
 	if (IS_ERR(handle)) {
 		/*
 		 * It is impossible to update on-disk structures without
@@ -603,3 +605,64 @@ out:
 
 	return retval;
 }
+
+/*
+ * Migrate a simple extent-based inode to use the i_blocks[] array
+ */
+int ext4_ind_migrate(struct inode *inode)
+{
+	struct ext4_extent_header	*eh;
+	struct ext4_super_block		*es = EXT4_SB(inode->i_sb)->s_es;
+	struct ext4_inode_info		*ei = EXT4_I(inode);
+	struct ext4_extent		*ex;
+	unsigned int			i, len;
+	ext4_fsblk_t			blk;
+	handle_t			*handle;
+	int				ret;
+
+	if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+				       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+	    (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+		return -EINVAL;
+
+	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+				       EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+		return -EOPNOTSUPP;
+
+	handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ret = ext4_ext_check_inode(inode);
+	if (ret)
+		goto errout;
+
+	eh = ext_inode_hdr(inode);
+	ex  = EXT_FIRST_EXTENT(eh);
+	if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS ||
+	    eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) {
+		ret = -EOPNOTSUPP;
+		goto errout;
+	}
+	if (eh->eh_entries == 0)
+		blk = len = 0;
+	else {
+		len = le16_to_cpu(ex->ee_len);
+		blk = ext4_ext_pblock(ex);
+		if (len > EXT4_NDIR_BLOCKS) {
+			ret = -EOPNOTSUPP;
+			goto errout;
+		}
+	}
+
+	ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+	memset(ei->i_data, 0, sizeof(ei->i_data));
+	for (i=0; i < len; i++)
+		ei->i_data[i] = cpu_to_le32(blk++);
+	ext4_mark_inode_dirty(handle, inode);
+errout:
+	ext4_journal_stop(handle);
+	up_write(&EXT4_I(inode)->i_data_sem);
+	return ret;
+}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index fe7c63f4717..214461e42a0 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -7,7 +7,7 @@
 #include "ext4.h"
 
 /* Checksumming functions */
-static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
+static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int offset = offsetof(struct mmp_struct, mmp_checksum);
@@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
 	lock_buffer(bh);
 	bh->b_end_io = end_buffer_write_sync;
 	get_bh(bh);
-	submit_bh(WRITE_SYNC, bh);
+	submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
 	wait_on_buffer(bh);
 	sb_end_write(sb);
 	if (unlikely(!buffer_uptodate(bh)))
@@ -80,18 +80,20 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
 	 * is not blocked in the elevator. */
 	if (!*bh)
 		*bh = sb_getblk(sb, mmp_block);
+	if (!*bh)
+		return -ENOMEM;
 	if (*bh) {
 		get_bh(*bh);
 		lock_buffer(*bh);
 		(*bh)->b_end_io = end_buffer_read_sync;
-		submit_bh(READ_SYNC, *bh);
+		submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
 		wait_on_buffer(*bh);
 		if (!buffer_uptodate(*bh)) {
 			brelse(*bh);
 			*bh = NULL;
 		}
 	}
-	if (!*bh) {
+	if (unlikely(!*bh)) {
 		ext4_warning(sb, "Error while reading MMP block %llu",
 			     mmp_block);
 		return -EIO;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d9cc5ee42f5..e86dddbd829 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -32,16 +32,18 @@
  */
 static inline int
 get_ext_path(struct inode *inode, ext4_lblk_t lblock,
-		struct ext4_ext_path **path)
+		struct ext4_ext_path **orig_path)
 {
 	int ret = 0;
+	struct ext4_ext_path *path;
 
-	*path = ext4_ext_find_extent(inode, lblock, *path);
-	if (IS_ERR(*path)) {
-		ret = PTR_ERR(*path);
-		*path = NULL;
-	} else if ((*path)[ext_depth(inode)].p_ext == NULL)
+	path = ext4_ext_find_extent(inode, lblock, *orig_path);
+	if (IS_ERR(path))
+		ret = PTR_ERR(path);
+	else if (path[ext_depth(inode)].p_ext == NULL)
 		ret = -ENODATA;
+	else
+		*orig_path = path;
 
 	return ret;
 }
@@ -142,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 }
 
 /**
- * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
+ * ext4_double_down_write_data_sem - Acquire two inodes' write lock
+ *                                   of i_data_sem
  *
  * Acquire write lock of i_data_sem of the two inodes
  */
-static void
-double_down_write_data_sem(struct inode *first, struct inode *second)
+void
+ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
 {
 	if (first < second) {
 		down_write(&EXT4_I(first)->i_data_sem);
@@ -160,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second)
 }
 
 /**
- * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
+ * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
  *
  * @orig_inode:		original inode structure to be released its lock first
  * @donor_inode:	donor inode structure to be released its lock second
  * Release write lock of i_data_sem of two inodes (orig and donor).
  */
-static void
-double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+void
+ext4_double_up_write_data_sem(struct inode *orig_inode,
+			      struct inode *donor_inode)
 {
 	up_write(&EXT4_I(orig_inode)->i_data_sem);
 	up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -405,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode,
 		mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
 						end_ext, eh, range_to_move);
 
-	if (depth) {
-		ret = ext4_handle_dirty_metadata(handle, orig_inode,
-						 orig_path->p_bh);
-		if (ret)
-			return ret;
-	} else {
-		ret = ext4_mark_inode_dirty(handle, orig_inode);
-		if (ret < 0)
-			return ret;
-	}
-
-	return 0;
+	return ext4_ext_dirty(handle, orig_inode, orig_path);
 }
 
 /**
@@ -611,24 +604,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent *ext;
+	int ret = 0;
 	ext4_lblk_t last = from + count;
 	while (from < last) {
 		*err = get_ext_path(inode, from, &path);
 		if (*err)
-			return 0;
+			goto out;
 		ext = path[ext_depth(inode)].p_ext;
-		if (!ext) {
-			ext4_ext_drop_refs(path);
-			return 0;
-		}
-		if (uninit != ext4_ext_is_uninitialized(ext)) {
-			ext4_ext_drop_refs(path);
-			return 0;
-		}
+		if (uninit != ext4_ext_is_uninitialized(ext))
+			goto out;
 		from += ext4_ext_get_actual_len(ext);
 		ext4_ext_drop_refs(path);
 	}
-	return 1;
+	ret = 1;
+out:
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+	return ret;
 }
 
 /**
@@ -666,6 +660,14 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	int replaced_count = 0;
 	int dext_alen;
 
+	*err = ext4_es_remove_extent(orig_inode, from, count);
+	if (*err)
+		goto out;
+
+	*err = ext4_es_remove_extent(donor_inode, from, count);
+	if (*err)
+		goto out;
+
 	/* Get the original extent for the block "orig_off" */
 	*err = get_ext_path(orig_inode, orig_off, &orig_path);
 	if (*err)
@@ -681,6 +683,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 
 	depth = ext_depth(donor_inode);
 	dext = donor_path[depth].p_ext;
+	if (unlikely(!dext))
+		goto missing_donor_extent;
 	tmp_dext = *dext;
 
 	*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
@@ -691,7 +695,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	/* Loop for the donor extents */
 	while (1) {
 		/* The extent for donor must be found. */
-		if (!dext) {
+		if (unlikely(!dext)) {
+		missing_donor_extent:
 			EXT4_ERROR_INODE(donor_inode,
 				   "The extent for donor must be found");
 			*err = -EIO;
@@ -723,6 +728,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 		donor_off += dext_alen;
 		orig_off += dext_alen;
 
+		BUG_ON(replaced_count > count);
 		/* Already moved the expected blocks */
 		if (replaced_count >= count)
 			break;
@@ -761,9 +767,6 @@ out:
 		kfree(donor_path);
 	}
 
-	ext4_ext_invalidate_cache(orig_inode);
-	ext4_ext_invalidate_cache(donor_inode);
-
 	return replaced_count;
 }
 
@@ -803,7 +806,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
 		page_cache_release(page[0]);
 		return -ENOMEM;
 	}
-
+	/*
+	 * grab_cache_page_write_begin() may not wait on page's writeback if
+	 * BDI not demand that. But it is reasonable to be very conservative
+	 * here and explicitly wait on page's writeback
+	 */
+	wait_on_page_writeback(page[0]);
+	wait_on_page_writeback(page[1]);
 	if (inode1 > inode2) {
 		struct page *tmp;
 		tmp = page[0];
@@ -845,7 +854,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
 		if (buffer_uptodate(bh))
 			continue;
 		if (!buffer_mapped(bh)) {
-			int err = 0;
 			err = ext4_get_block(inode, block, bh, 0);
 			if (err) {
 				SetPageError(page);
@@ -900,11 +908,10 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 		  pgoff_t orig_page_offset, int data_offset_in_page,
 		  int block_len_in_page, int uninit, int *err)
 {
-	struct inode *orig_inode = o_filp->f_dentry->d_inode;
+	struct inode *orig_inode = file_inode(o_filp);
 	struct page *pagep[2] = {NULL, NULL};
 	handle_t *handle;
 	ext4_lblk_t orig_blk_offset;
-	long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
 	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
 	unsigned int w_flags = 0;
 	unsigned int tmp_data_size, data_size, replaced_size;
@@ -920,7 +927,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 again:
 	*err = 0;
 	jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
-	handle = ext4_journal_start(orig_inode, jblocks);
+	handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
 	if (IS_ERR(handle)) {
 		*err = PTR_ERR(handle);
 		return 0;
@@ -932,8 +939,6 @@ again:
 	orig_blk_offset = orig_page_offset * blocks_per_page +
 		data_offset_in_page;
 
-	offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
-
 	/* Calculate data_size */
 	if ((orig_blk_offset + block_len_in_page - 1) ==
 	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
@@ -965,7 +970,7 @@ again:
 	 * necessary, just swap data blocks between orig and donor.
 	 */
 	if (uninit) {
-		double_down_write_data_sem(orig_inode, donor_inode);
+		ext4_double_down_write_data_sem(orig_inode, donor_inode);
 		/* If any of extents in range became initialized we have to
 		 * fallback to data copying */
 		uninit = mext_check_coverage(orig_inode, orig_blk_offset,
@@ -979,7 +984,7 @@ again:
 			goto drop_data_sem;
 
 		if (!uninit) {
-			double_up_write_data_sem(orig_inode, donor_inode);
+			ext4_double_up_write_data_sem(orig_inode, donor_inode);
 			goto data_copy;
 		}
 		if ((page_has_private(pagep[0]) &&
@@ -993,7 +998,7 @@ again:
 						donor_inode, orig_blk_offset,
 						block_len_in_page, err);
 	drop_data_sem:
-		double_up_write_data_sem(orig_inode, donor_inode);
+		ext4_double_up_write_data_sem(orig_inode, donor_inode);
 		goto unlock_pages;
 	}
 data_copy:
@@ -1022,7 +1027,7 @@ data_copy:
 	}
 	/* Perform all necessary steps similar write_begin()/write_end()
 	 * but keeping in mind that i_size will not change */
-	*err = __block_write_begin(pagep[0], from, from + replaced_size,
+	*err = __block_write_begin(pagep[0], from, replaced_size,
 				   ext4_get_block);
 	if (!*err)
 		*err = block_commit_write(pagep[0], from, from + replaced_size);
@@ -1054,11 +1059,11 @@ repair_branches:
 	 * Extents are swapped already, but we are not able to copy data.
 	 * Try to swap extents to it's original places
 	 */
-	double_down_write_data_sem(orig_inode, donor_inode);
+	ext4_double_down_write_data_sem(orig_inode, donor_inode);
 	replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
 					       orig_blk_offset,
 					       block_len_in_page, &err2);
-	double_up_write_data_sem(orig_inode, donor_inode);
+	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	if (replaced_count != block_len_in_page) {
 		EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
 				       "Unable to copy data block,"
@@ -1198,15 +1203,15 @@ mext_check_arguments(struct inode *orig_inode,
 }
 
 /**
- * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
+ * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
  *
  * @inode1:	the inode structure
  * @inode2:	the inode structure
  *
  * Lock two inodes' i_mutex
  */
-static void
-mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
+void
+ext4_inode_double_lock(struct inode *inode1, struct inode *inode2)
 {
 	BUG_ON(inode1 == inode2);
 	if (inode1 < inode2) {
@@ -1219,15 +1224,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
 }
 
 /**
- * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
+ * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
  *
  * @inode1:     the inode that is released first
  * @inode2:     the inode that is released second
  *
  */
 
-static void
-mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
+void
+ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2)
 {
 	mutex_unlock(&inode1->i_mutex);
 	mutex_unlock(&inode2->i_mutex);
@@ -1279,8 +1284,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		 __u64 orig_start, __u64 donor_start, __u64 len,
 		 __u64 *moved_len)
 {
-	struct inode *orig_inode = o_filp->f_dentry->d_inode;
-	struct inode *donor_inode = d_filp->f_dentry->d_inode;
+	struct inode *orig_inode = file_inode(o_filp);
+	struct inode *donor_inode = file_inode(d_filp);
 	struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
 	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
 	ext4_lblk_t block_start = orig_start;
@@ -1322,7 +1327,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		return -EINVAL;
 	}
 	/* Protect orig and donor inodes against a truncate */
-	mext_inode_double_lock(orig_inode, donor_inode);
+	ext4_inode_double_lock(orig_inode, donor_inode);
 
 	/* Wait for all existing dio workers */
 	ext4_inode_block_unlocked_dio(orig_inode);
@@ -1331,7 +1336,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	inode_dio_wait(donor_inode);
 
 	/* Protect extent tree against block allocations via delalloc */
-	double_down_write_data_sem(orig_inode, donor_inode);
+	ext4_double_down_write_data_sem(orig_inode, donor_inode);
 	/* Check the filesystem environment whether move_extent can be done */
 	ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
 				    donor_start, &len);
@@ -1455,7 +1460,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		 * b. racing with ->readpage, ->write_begin, and ext4_get_block
 		 *    in move_extent_per_page
 		 */
-		double_up_write_data_sem(orig_inode, donor_inode);
+		ext4_double_up_write_data_sem(orig_inode, donor_inode);
 
 		while (orig_page_offset <= seq_end_page) {
 
@@ -1489,7 +1494,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 				block_len_in_page = rest_blocks;
 		}
 
-		double_down_write_data_sem(orig_inode, donor_inode);
+		ext4_double_down_write_data_sem(orig_inode, donor_inode);
 		if (ret < 0)
 			break;
 
@@ -1527,10 +1532,10 @@ out:
 		ext4_ext_drop_refs(holecheck_path);
 		kfree(holecheck_path);
 	}
-	double_up_write_data_sem(orig_inode, donor_inode);
+	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	ext4_inode_resume_unlocked_dio(orig_inode);
 	ext4_inode_resume_unlocked_dio(donor_inode);
-	mext_inode_double_unlock(orig_inode, donor_inode);
+	ext4_inode_double_unlock(orig_inode, donor_inode);
 
 	return ret;
 }
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f9ed946a448..234b834d5a9 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -47,38 +47,111 @@
 #define NAMEI_RA_CHUNKS  2
 #define NAMEI_RA_BLOCKS  4
 #define NAMEI_RA_SIZE	     (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
 
 static struct buffer_head *ext4_append(handle_t *handle,
 					struct inode *inode,
-					ext4_lblk_t *block, int *err)
+					ext4_lblk_t *block)
 {
 	struct buffer_head *bh;
+	int err = 0;
 
 	if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
 		     ((inode->i_size >> 10) >=
-		      EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) {
-		*err = -ENOSPC;
-		return NULL;
-	}
+		      EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
+		return ERR_PTR(-ENOSPC);
 
 	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
 
-	bh = ext4_bread(handle, inode, *block, 1, err);
-	if (bh) {
-		inode->i_size += inode->i_sb->s_blocksize;
-		EXT4_I(inode)->i_disksize = inode->i_size;
-		*err = ext4_journal_get_write_access(handle, bh);
-		if (*err) {
+	bh = ext4_bread(handle, inode, *block, 1, &err);
+	if (!bh)
+		return ERR_PTR(err);
+	inode->i_size += inode->i_sb->s_blocksize;
+	EXT4_I(inode)->i_disksize = inode->i_size;
+	err = ext4_journal_get_write_access(handle, bh);
+	if (err) {
+		brelse(bh);
+		ext4_std_error(inode->i_sb, err);
+		return ERR_PTR(err);
+	}
+	return bh;
+}
+
+static int ext4_dx_csum_verify(struct inode *inode,
+			       struct ext4_dir_entry *dirent);
+
+typedef enum {
+	EITHER, INDEX, DIRENT
+} dirblock_type_t;
+
+#define ext4_read_dirblock(inode, block, type) \
+	__ext4_read_dirblock((inode), (block), (type), __LINE__)
+
+static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
+					      ext4_lblk_t block,
+					      dirblock_type_t type,
+					      unsigned int line)
+{
+	struct buffer_head *bh;
+	struct ext4_dir_entry *dirent;
+	int err = 0, is_dx_block = 0;
+
+	bh = ext4_bread(NULL, inode, block, 0, &err);
+	if (!bh) {
+		if (err == 0) {
+			ext4_error_inode(inode, __func__, line, block,
+					       "Directory hole found");
+			return ERR_PTR(-EIO);
+		}
+		__ext4_warning(inode->i_sb, __func__, line,
+			       "error reading directory block "
+			       "(ino %lu, block %lu)", inode->i_ino,
+			       (unsigned long) block);
+		return ERR_PTR(err);
+	}
+	dirent = (struct ext4_dir_entry *) bh->b_data;
+	/* Determine whether or not we have an index block */
+	if (is_dx(inode)) {
+		if (block == 0)
+			is_dx_block = 1;
+		else if (ext4_rec_len_from_disk(dirent->rec_len,
+						inode->i_sb->s_blocksize) ==
+			 inode->i_sb->s_blocksize)
+			is_dx_block = 1;
+	}
+	if (!is_dx_block && type == INDEX) {
+		ext4_error_inode(inode, __func__, line, block,
+		       "directory leaf block found instead of index block");
+		return ERR_PTR(-EIO);
+	}
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+					EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) ||
+	    buffer_verified(bh))
+		return bh;
+
+	/*
+	 * An empty leaf block can get mistaken for a index block; for
+	 * this reason, we can only check the index checksum when the
+	 * caller is sure it should be an index block.
+	 */
+	if (is_dx_block && type == INDEX) {
+		if (ext4_dx_csum_verify(inode, dirent))
+			set_buffer_verified(bh);
+		else {
+			ext4_error_inode(inode, __func__, line, block,
+				"Directory index failed checksum");
 			brelse(bh);
-			bh = NULL;
+			return ERR_PTR(-EIO);
 		}
 	}
-	if (!bh && !(*err)) {
-		*err = -EIO;
-		ext4_error(inode->i_sb,
-			   "Directory hole detected on inode %lu\n",
-			   inode->i_ino);
+	if (!is_dx_block) {
+		if (ext4_dirent_csum_verify(inode, dirent))
+			set_buffer_verified(bh);
+		else {
+			ext4_error_inode(inode, __func__, line, block,
+				"Directory block failed checksum");
+			brelse(bh);
+			return ERR_PTR(-EIO);
+		}
 	}
 	return bh;
 }
@@ -343,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	__u32 csum, old_csum;
+	__u32 csum;
+	__le32 save_csum;
 	int size;
 
 	size = count_offset + (count * sizeof(struct dx_entry));
-	old_csum = t->dt_checksum;
+	save_csum = t->dt_checksum;
 	t->dt_checksum = 0;
 	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
 	csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
-	t->dt_checksum = old_csum;
+	t->dt_checksum = save_csum;
 
 	return cpu_to_le32(csum);
 }
@@ -604,9 +678,9 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 	u32 hash;
 
 	frame->bh = NULL;
-	if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) {
-		if (*err == 0)
-			*err = ERR_BAD_DX_DIR;
+	bh = ext4_read_dirblock(dir, 0, INDEX);
+	if (IS_ERR(bh)) {
+		*err = PTR_ERR(bh);
 		goto fail;
 	}
 	root = (struct dx_root *) bh->b_data;
@@ -643,15 +717,6 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 		goto fail;
 	}
 
-	if (!buffer_verified(bh) &&
-	    !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) {
-		ext4_warning(dir->i_sb, "Root failed checksum");
-		brelse(bh);
-		*err = ERR_BAD_DX_DIR;
-		goto fail;
-	}
-	set_buffer_verified(bh);
-
 	entries = (struct dx_entry *) (((char *)&root->info) +
 				       root->info.info_length);
 
@@ -709,22 +774,12 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 		frame->entries = entries;
 		frame->at = at;
 		if (!indirect--) return frame;
-		if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) {
-			if (!(*err))
-				*err = ERR_BAD_DX_DIR;
+		bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+		if (IS_ERR(bh)) {
+			*err = PTR_ERR(bh);
 			goto fail2;
 		}
-		at = entries = ((struct dx_node *) bh->b_data)->entries;
-
-		if (!buffer_verified(bh) &&
-		    !ext4_dx_csum_verify(dir,
-					 (struct ext4_dir_entry *)bh->b_data)) {
-			ext4_warning(dir->i_sb, "Node failed checksum");
-			brelse(bh);
-			*err = ERR_BAD_DX_DIR;
-			goto fail2;
-		}
-		set_buffer_verified(bh);
+		entries = ((struct dx_node *) bh->b_data)->entries;
 
 		if (dx_get_limit(entries) != dx_node_limit (dir)) {
 			ext4_warning(dir->i_sb,
@@ -783,7 +838,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
 {
 	struct dx_frame *p;
 	struct buffer_head *bh;
-	int err, num_frames = 0;
+	int num_frames = 0;
 	__u32 bhash;
 
 	p = frame;
@@ -822,25 +877,9 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
 	 * block so no check is necessary
 	 */
 	while (num_frames--) {
-		if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
-				      0, &err))) {
-			if (!err) {
-				ext4_error(dir->i_sb,
-					   "Directory hole detected on inode %lu\n",
-					   dir->i_ino);
-				return -EIO;
-			}
-			return err; /* Failure */
-		}
-
-		if (!buffer_verified(bh) &&
-		    !ext4_dx_csum_verify(dir,
-					 (struct ext4_dir_entry *)bh->b_data)) {
-			ext4_warning(dir->i_sb, "Node failed checksum");
-			return -EIO;
-		}
-		set_buffer_verified(bh);
-
+		bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
+		if (IS_ERR(bh))
+			return PTR_ERR(bh);
 		p++;
 		brelse(p->bh);
 		p->bh = bh;
@@ -866,20 +905,9 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 
 	dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
 							(unsigned long)block));
-	if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) {
-		if (!err) {
-			err = -EIO;
-			ext4_error(dir->i_sb,
-				   "Directory hole detected on inode %lu\n",
-				   dir->i_ino);
-		}
-		return err;
-	}
-
-	if (!buffer_verified(bh) &&
-	    !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
-		return -EIO;
-	set_buffer_verified(bh);
+	bh = ext4_read_dirblock(dir, block, DIRENT);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	top = (struct ext4_dir_entry_2 *) ((char *) de +
@@ -890,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 				bh->b_data, bh->b_size,
 				(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
 					 + ((char *)de - bh->b_data))) {
-			/* On error, skip the f_pos to the next block. */
-			dir_file->f_pos = (dir_file->f_pos |
-					(dir->i_sb->s_blocksize - 1)) + 1;
-			brelse(bh);
-			return count;
+			/* silently ignore the rest of the block */
+			break;
 		}
 		ext4fs_dirhash(de->name, de->name_len, hinfo);
 		if ((hinfo->hash < start_hash) ||
@@ -937,13 +962,24 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 
 	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
 		       start_hash, start_minor_hash));
-	dir = dir_file->f_path.dentry->d_inode;
+	dir = file_inode(dir_file);
 	if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
 		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
 		if (hinfo.hash_version <= DX_HASH_TEA)
 			hinfo.hash_version +=
 				EXT4_SB(dir->i_sb)->s_hash_unsigned;
 		hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+		if (ext4_has_inline_data(dir)) {
+			int has_inline_data = 1;
+			count = htree_inlinedir_to_tree(dir_file, dir, 0,
+							&hinfo, start_hash,
+							start_minor_hash,
+							&has_inline_data);
+			if (has_inline_data) {
+				*next_hash = ~0;
+				return count;
+			}
+		}
 		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
 					       start_hash, start_minor_hash);
 		*next_hash = ~0;
@@ -1333,26 +1369,11 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
 		return NULL;
 	do {
 		block = dx_get_block(frame->at);
-		if (!(bh = ext4_bread(NULL, dir, block, 0, err))) {
-			if (!(*err)) {
-				*err = -EIO;
-				ext4_error(dir->i_sb,
-					   "Directory hole detected on inode %lu\n",
-					   dir->i_ino);
-			}
+		bh = ext4_read_dirblock(dir, block, DIRENT);
+		if (IS_ERR(bh)) {
+			*err = PTR_ERR(bh);
 			goto errout;
 		}
-
-		if (!buffer_verified(bh) &&
-		    !ext4_dirent_csum_verify(dir,
-				(struct ext4_dir_entry *)bh->b_data)) {
-			EXT4_ERROR_INODE(dir, "checksumming directory "
-					 "block %lu", (unsigned long)block);
-			brelse(bh);
-			*err = -EIO;
-			goto errout;
-		}
-		set_buffer_verified(bh);
 		retval = search_dirblock(bh, dir, d_name,
 					 block << EXT4_BLOCK_SIZE_BITS(sb),
 					 res_dir);
@@ -1443,24 +1464,6 @@ struct dentry *ext4_get_parent(struct dentry *child)
 	return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
 }
 
-#define S_SHIFT 12
-static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
-	[S_IFREG >> S_SHIFT]	= EXT4_FT_REG_FILE,
-	[S_IFDIR >> S_SHIFT]	= EXT4_FT_DIR,
-	[S_IFCHR >> S_SHIFT]	= EXT4_FT_CHRDEV,
-	[S_IFBLK >> S_SHIFT]	= EXT4_FT_BLKDEV,
-	[S_IFIFO >> S_SHIFT]	= EXT4_FT_FIFO,
-	[S_IFSOCK >> S_SHIFT]	= EXT4_FT_SOCK,
-	[S_IFLNK >> S_SHIFT]	= EXT4_FT_SYMLINK,
-};
-
-static inline void ext4_set_de_type(struct super_block *sb,
-				struct ext4_dir_entry_2 *de,
-				umode_t mode) {
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
-		de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
-}
-
 /*
  * Move count entries from end of map between two memory locations.
  * Returns pointer to last entry moved.
@@ -1536,11 +1539,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
 		csum_size = sizeof(struct ext4_dir_entry_tail);
 
-	bh2 = ext4_append (handle, dir, &newblock, &err);
-	if (!(bh2)) {
+	bh2 = ext4_append(handle, dir, &newblock);
+	if (IS_ERR(bh2)) {
 		brelse(*bh);
 		*bh = NULL;
-		goto errout;
+		*error = PTR_ERR(bh2);
+		return NULL;
 	}
 
 	BUFFER_TRACE(*bh, "get_write_access");
@@ -1621,7 +1625,6 @@ journal_error:
 	brelse(bh2);
 	*bh = NULL;
 	ext4_std_error(dir->i_sb, err);
-errout:
 	*error = err;
 	return NULL;
 }
@@ -1699,7 +1702,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	const char	*name = dentry->d_name.name;
 	int		namelen = dentry->d_name.len;
 	unsigned int	blocksize = dir->i_sb->s_blocksize;
-	unsigned short	reclen;
 	int		csum_size = 0;
 	int		err;
 
@@ -1707,7 +1709,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 				       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
 		csum_size = sizeof(struct ext4_dir_entry_tail);
 
-	reclen = EXT4_DIR_REC_LEN(namelen);
 	if (!de) {
 		err = ext4_find_dest_de(dir, inode,
 					bh, bh->b_data, blocksize - csum_size,
@@ -1798,10 +1799,10 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	len = ((char *) root) + (blocksize - csum_size) - (char *) de;
 
 	/* Allocate new block for the 0th block's dirents */
-	bh2 = ext4_append(handle, dir, &block, &retval);
-	if (!(bh2)) {
+	bh2 = ext4_append(handle, dir, &block);
+	if (IS_ERR(bh2)) {
 		brelse(bh);
-		return retval;
+		return PTR_ERR(bh2);
 	}
 	ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
 	data1 = bh2->b_data;
@@ -1918,20 +1919,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	}
 	blocks = dir->i_size >> sb->s_blocksize_bits;
 	for (block = 0; block < blocks; block++) {
-		if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) {
-			if (!retval) {
-				retval = -EIO;
-				ext4_error(inode->i_sb,
-					   "Directory hole detected on inode %lu\n",
-					   inode->i_ino);
-			}
-			return retval;
-		}
-		if (!buffer_verified(bh) &&
-		    !ext4_dirent_csum_verify(dir,
-				(struct ext4_dir_entry *)bh->b_data))
-			return -EIO;
-		set_buffer_verified(bh);
+		bh = ext4_read_dirblock(dir, block, DIRENT);
+		if (IS_ERR(bh))
+			return PTR_ERR(bh);
+
 		retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
 		if (retval != -ENOSPC) {
 			brelse(bh);
@@ -1943,9 +1934,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 			return make_indexed_dir(handle, dentry, inode, bh);
 		brelse(bh);
 	}
-	bh = ext4_append(handle, dir, &block, &retval);
-	if (!bh)
-		return retval;
+	bh = ext4_append(handle, dir, &block);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
 	de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
@@ -1982,22 +1973,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 		return err;
 	entries = frame->entries;
 	at = frame->at;
-
-	if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) {
-		if (!err) {
-			err = -EIO;
-			ext4_error(dir->i_sb,
-				   "Directory hole detected on inode %lu\n",
-				   dir->i_ino);
-		}
+	bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT);
+	if (IS_ERR(bh)) {
+		err = PTR_ERR(bh);
+		bh = NULL;
 		goto cleanup;
 	}
 
-	if (!buffer_verified(bh) &&
-	    !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
-		goto journal_error;
-	set_buffer_verified(bh);
-
 	BUFFER_TRACE(bh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, bh);
 	if (err)
@@ -2025,9 +2007,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			err = -ENOSPC;
 			goto cleanup;
 		}
-		bh2 = ext4_append (handle, dir, &newblock, &err);
-		if (!(bh2))
+		bh2 = ext4_append(handle, dir, &newblock);
+		if (IS_ERR(bh2)) {
+			err = PTR_ERR(bh2);
 			goto cleanup;
+		}
 		node2 = (struct dx_node *)(bh2->b_data);
 		entries2 = node2->entries;
 		memset(&node2->fake, 0, sizeof(struct fake_dirent));
@@ -2106,8 +2090,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 journal_error:
 	ext4_std_error(dir->i_sb, err);
 cleanup:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 	dx_release(frames);
 	return err;
 }
@@ -2254,29 +2237,27 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 {
 	handle_t *handle;
 	struct inode *inode;
-	int err, retries = 0;
+	int err, credits, retries = 0;
 
 	dquot_initialize(dir);
 
+	credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
 retry:
-	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	if (IS_DIRSYNC(dir))
-		ext4_handle_sync(handle);
-
-	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
+	inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
+					    NULL, EXT4_HT_DIR, credits);
+	handle = ext4_journal_current_handle();
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(inode);
 		err = ext4_add_nondir(handle, dentry, inode);
+		if (!err && IS_DIRSYNC(dir))
+			ext4_handle_sync(handle);
 	}
-	ext4_journal_stop(handle);
+	if (handle)
+		ext4_journal_stop(handle);
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
 	return err;
@@ -2287,34 +2268,71 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
 {
 	handle_t *handle;
 	struct inode *inode;
-	int err, retries = 0;
+	int err, credits, retries = 0;
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
 	dquot_initialize(dir);
 
+	credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
 retry:
-	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	if (IS_DIRSYNC(dir))
-		ext4_handle_sync(handle);
-
-	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
+	inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
+					    NULL, EXT4_HT_DIR, credits);
+	handle = ext4_journal_current_handle();
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
 		inode->i_op = &ext4_special_inode_operations;
 		err = ext4_add_nondir(handle, dentry, inode);
+		if (!err && IS_DIRSYNC(dir))
+			ext4_handle_sync(handle);
 	}
-	ext4_journal_stop(handle);
+	if (handle)
+		ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int err, retries = 0;
+
+	dquot_initialize(dir);
+
+retry:
+	inode = ext4_new_inode_start_handle(dir, mode,
+					    NULL, 0, NULL,
+					    EXT4_HT_DIR,
+			EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+			  4 + EXT4_XATTR_TRANS_BLOCKS);
+	handle = ext4_journal_current_handle();
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		inode->i_op = &ext4_file_inode_operations;
+		inode->i_fop = &ext4_file_operations;
+		ext4_set_aops(inode);
+		err = ext4_orphan_add(handle, inode);
+		if (err)
+			goto err_drop_inode;
+		mark_inode_dirty(inode);
+		d_tmpfile(dentry, inode);
+		unlock_new_inode(inode);
+	}
+	if (handle)
+		ext4_journal_stop(handle);
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
 	return err;
+err_drop_inode:
+	ext4_journal_stop(handle);
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
 }
 
 struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
@@ -2351,6 +2369,7 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
 	struct buffer_head *dir_block = NULL;
 	struct ext4_dir_entry_2 *de;
 	struct ext4_dir_entry_tail *t;
+	ext4_lblk_t block = 0;
 	unsigned int blocksize = dir->i_sb->s_blocksize;
 	int csum_size = 0;
 	int err;
@@ -2367,16 +2386,10 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
 			goto out;
 	}
 
-	inode->i_size = EXT4_I(inode)->i_disksize = blocksize;
-	if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
-		if (!err) {
-			err = -EIO;
-			ext4_error(inode->i_sb,
-				   "Directory hole detected on inode %lu\n",
-				   inode->i_ino);
-		}
-		goto out;
-	}
+	inode->i_size = 0;
+	dir_block = ext4_append(handle, inode, &block);
+	if (IS_ERR(dir_block))
+		return PTR_ERR(dir_block);
 	BUFFER_TRACE(dir_block, "get_write_access");
 	err = ext4_journal_get_write_access(handle, dir_block);
 	if (err)
@@ -2403,25 +2416,20 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	handle_t *handle;
 	struct inode *inode;
-	int err, retries = 0;
+	int err, credits, retries = 0;
 
 	if (EXT4_DIR_LINK_MAX(dir))
 		return -EMLINK;
 
 	dquot_initialize(dir);
 
+	credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
 retry:
-	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	if (IS_DIRSYNC(dir))
-		ext4_handle_sync(handle);
-
-	inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
-			       &dentry->d_name, 0, NULL);
+	inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
+					    &dentry->d_name,
+					    0, NULL, EXT4_HT_DIR, credits);
+	handle = ext4_journal_current_handle();
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
@@ -2449,8 +2457,12 @@ out_clear_inode:
 		goto out_clear_inode;
 	unlock_new_inode(inode);
 	d_instantiate(dentry, inode);
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
 out_stop:
-	ext4_journal_stop(handle);
+	if (handle)
+		ext4_journal_stop(handle);
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
 	return err;
@@ -2476,25 +2488,14 @@ static int empty_dir(struct inode *inode)
 	}
 
 	sb = inode->i_sb;
-	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
-	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
-		if (err)
-			EXT4_ERROR_INODE(inode,
-				"error %d reading directory lblock 0", err);
-		else
-			ext4_warning(inode->i_sb,
-				     "bad directory (dir #%lu) - no data block",
-				     inode->i_ino);
+	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
+		EXT4_ERROR_INODE(inode, "invalid size");
 		return 1;
 	}
-	if (!buffer_verified(bh) &&
-	    !ext4_dirent_csum_verify(inode,
-			(struct ext4_dir_entry *)bh->b_data)) {
-		EXT4_ERROR_INODE(inode, "checksum error reading directory "
-				 "lblock 0");
-		return -EIO;
-	}
-	set_buffer_verified(bh);
+	bh = ext4_read_dirblock(inode, 0, EITHER);
+	if (IS_ERR(bh))
+		return 1;
+
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	de1 = ext4_next_entry(de, sb->s_blocksize);
 	if (le32_to_cpu(de->inode) != inode->i_ino ||
@@ -2517,28 +2518,9 @@ static int empty_dir(struct inode *inode)
 			err = 0;
 			brelse(bh);
 			lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
-			bh = ext4_bread(NULL, inode, lblock, 0, &err);
-			if (!bh) {
-				if (err)
-					EXT4_ERROR_INODE(inode,
-						"error %d reading directory "
-						"lblock %u", err, lblock);
-				else
-					ext4_warning(inode->i_sb,
-						"bad directory (dir #%lu) - no data block",
-						inode->i_ino);
-
-				offset += sb->s_blocksize;
-				continue;
-			}
-			if (!buffer_verified(bh) &&
-			    !ext4_dirent_csum_verify(inode,
-					(struct ext4_dir_entry *)bh->b_data)) {
-				EXT4_ERROR_INODE(inode, "checksum error "
-						 "reading directory lblock 0");
-				return -EIO;
-			}
-			set_buffer_verified(bh);
+			bh = ext4_read_dirblock(inode, lblock, EITHER);
+			if (IS_ERR(bh))
+				return 1;
 			de = (struct ext4_dir_entry_2 *) bh->b_data;
 		}
 		if (ext4_check_dir_entry(inode, NULL, de, bh,
@@ -2717,25 +2699,18 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 	struct inode *inode;
 	struct buffer_head *bh;
 	struct ext4_dir_entry_2 *de;
-	handle_t *handle;
+	handle_t *handle = NULL;
 
 	/* Initialize quotas before so that eventual writes go in
 	 * separate transaction */
 	dquot_initialize(dir);
 	dquot_initialize(dentry->d_inode);
 
-	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
 	retval = -ENOENT;
 	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
 	if (!bh)
 		goto end_rmdir;
 
-	if (IS_DIRSYNC(dir))
-		ext4_handle_sync(handle);
-
 	inode = dentry->d_inode;
 
 	retval = -EIO;
@@ -2746,6 +2721,17 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!empty_dir(inode))
 		goto end_rmdir;
 
+	handle = ext4_journal_start(dir, EXT4_HT_DIR,
+				    EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle)) {
+		retval = PTR_ERR(handle);
+		handle = NULL;
+		goto end_rmdir;
+	}
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
 	retval = ext4_delete_entry(handle, dir, de, bh);
 	if (retval)
 		goto end_rmdir;
@@ -2767,8 +2753,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 	ext4_mark_inode_dirty(handle, dir);
 
 end_rmdir:
-	ext4_journal_stop(handle);
 	brelse(bh);
+	if (handle)
+		ext4_journal_stop(handle);
 	return retval;
 }
 
@@ -2778,7 +2765,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 	struct inode *inode;
 	struct buffer_head *bh;
 	struct ext4_dir_entry_2 *de;
-	handle_t *handle;
+	handle_t *handle = NULL;
 
 	trace_ext4_unlink_enter(dir, dentry);
 	/* Initialize quotas before so that eventual writes go
@@ -2786,13 +2773,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 	dquot_initialize(dir);
 	dquot_initialize(dentry->d_inode);
 
-	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	if (IS_DIRSYNC(dir))
-		ext4_handle_sync(handle);
-
 	retval = -ENOENT;
 	bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
 	if (!bh)
@@ -2804,6 +2784,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 	if (le32_to_cpu(de->inode) != inode->i_ino)
 		goto end_unlink;
 
+	handle = ext4_journal_start(dir, EXT4_HT_DIR,
+				    EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle)) {
+		retval = PTR_ERR(handle);
+		handle = NULL;
+		goto end_unlink;
+	}
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
 	if (!inode->i_nlink) {
 		ext4_warning(inode->i_sb,
 			     "Deleting nonexistent file (%lu), %d",
@@ -2824,8 +2815,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 	retval = 0;
 
 end_unlink:
-	ext4_journal_stop(handle);
 	brelse(bh);
+	if (handle)
+		ext4_journal_stop(handle);
 	trace_ext4_unlink_exit(dentry, retval);
 	return retval;
 }
@@ -2861,19 +2853,13 @@ static int ext4_symlink(struct inode *dir,
 		 * quota blocks, sb is already counted in previous macros).
 		 */
 		credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-			  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-			  EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+			  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
 	}
 retry:
-	handle = ext4_journal_start(dir, credits);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-
-	if (IS_DIRSYNC(dir))
-		ext4_handle_sync(handle);
-
-	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
-			       &dentry->d_name, 0, NULL);
+	inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
+					    &dentry->d_name, 0, NULL,
+					    EXT4_HT_DIR, credits);
+	handle = ext4_journal_current_handle();
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
@@ -2903,7 +2889,7 @@ retry:
 		 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
 		 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
 		 */
-		handle = ext4_journal_start(dir,
+		handle = ext4_journal_start(dir, EXT4_HT_DIR,
 				EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
 				EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
 		if (IS_ERR(handle)) {
@@ -2926,8 +2912,12 @@ retry:
 	}
 	EXT4_I(inode)->i_disksize = inode->i_size;
 	err = ext4_add_nondir(handle, dentry, inode);
+	if (!err && IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
 out_stop:
-	ext4_journal_stop(handle);
+	if (handle)
+		ext4_journal_stop(handle);
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
 	return err;
@@ -2950,8 +2940,9 @@ static int ext4_link(struct dentry *old_dentry,
 	dquot_initialize(dir);
 
 retry:
-	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
-					EXT4_INDEX_EXTRA_TRANS_BLOCKS);
+	handle = ext4_journal_start(dir, EXT4_HT_DIR,
+		(EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+		 EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -2965,6 +2956,11 @@ retry:
 	err = ext4_add_entry(handle, dentry, inode);
 	if (!err) {
 		ext4_mark_inode_dirty(handle, inode);
+		/* this can happen only for tmpfile being
+		 * linked the first time
+		 */
+		if (inode->i_nlink == 1)
+			ext4_orphan_del(handle, inode);
 		d_instantiate(dentry, inode);
 	} else {
 		drop_nlink(inode);
@@ -2991,13 +2987,9 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
 	struct buffer_head *bh;
 
 	if (!ext4_has_inline_data(inode)) {
-		if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) {
-			if (!*retval) {
-				*retval = -EIO;
-				ext4_error(inode->i_sb,
-					   "Directory hole detected on inode %lu\n",
-					   inode->i_ino);
-			}
+		bh = ext4_read_dirblock(inode, 0, EITHER);
+		if (IS_ERR(bh)) {
+			*retval = PTR_ERR(bh);
 			return NULL;
 		}
 		*parent_de = ext4_next_entry(
@@ -3034,9 +3026,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 * in separate transaction */
 	if (new_dentry->d_inode)
 		dquot_initialize(new_dentry->d_inode);
-	handle = ext4_journal_start(old_dir, 2 *
-					EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
-					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
+	handle = ext4_journal_start(old_dir, EXT4_HT_DIR,
+		(2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+		 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -3076,11 +3068,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 						  &inlined);
 		if (!dir_bh)
 			goto end_rename;
-		if (!inlined && !buffer_verified(dir_bh) &&
-		    !ext4_dirent_csum_verify(old_inode,
-				(struct ext4_dir_entry *)dir_bh->b_data))
-			goto end_rename;
-		set_buffer_verified(dir_bh);
 		if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)
 			goto end_rename;
 		retval = -EMLINK;
@@ -3226,6 +3213,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.mkdir		= ext4_mkdir,
 	.rmdir		= ext4_rmdir,
 	.mknod		= ext4_mknod,
+	.tmpfile	= ext4_tmpfile,
 	.rename		= ext4_rename,
 	.setattr	= ext4_setattr,
 	.setxattr	= generic_setxattr,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 0016fbca2a4..48786cdb5e6 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -18,84 +18,149 @@
 #include <linux/pagevec.h>
 #include <linux/mpage.h>
 #include <linux/namei.h>
+#include <linux/aio.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
 
-static struct kmem_cache *io_page_cachep, *io_end_cachep;
+static struct kmem_cache *io_end_cachep;
 
 int __init ext4_init_pageio(void)
 {
-	io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
-	if (io_page_cachep == NULL)
-		return -ENOMEM;
 	io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
-	if (io_end_cachep == NULL) {
-		kmem_cache_destroy(io_page_cachep);
+	if (io_end_cachep == NULL)
 		return -ENOMEM;
-	}
 	return 0;
 }
 
 void ext4_exit_pageio(void)
 {
 	kmem_cache_destroy(io_end_cachep);
-	kmem_cache_destroy(io_page_cachep);
 }
 
-void ext4_ioend_wait(struct inode *inode)
+/*
+ * Print an buffer I/O error compatible with the fs/buffer.c.  This
+ * provides compatibility with dmesg scrapers that look for a specific
+ * buffer I/O error message.  We really need a unified error reporting
+ * structure to userspace ala Digital Unix's uerf system, but it's
+ * probably not going to happen in my lifetime, due to LKML politics...
+ */
+static void buffer_io_error(struct buffer_head *bh)
+{
+	char b[BDEVNAME_SIZE];
+	printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+			bdevname(bh->b_bdev, b),
+			(unsigned long long)bh->b_blocknr);
+}
+
+static void ext4_finish_bio(struct bio *bio)
 {
-	wait_queue_head_t *wq = ext4_ioend_wq(inode);
+	int i;
+	int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+
+	for (i = 0; i < bio->bi_vcnt; i++) {
+		struct bio_vec *bvec = &bio->bi_io_vec[i];
+		struct page *page = bvec->bv_page;
+		struct buffer_head *bh, *head;
+		unsigned bio_start = bvec->bv_offset;
+		unsigned bio_end = bio_start + bvec->bv_len;
+		unsigned under_io = 0;
+		unsigned long flags;
 
-	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+		if (!page)
+			continue;
+
+		if (error) {
+			SetPageError(page);
+			set_bit(AS_EIO, &page->mapping->flags);
+		}
+		bh = head = page_buffers(page);
+		/*
+		 * We check all buffers in the page under BH_Uptodate_Lock
+		 * to avoid races with other end io clearing async_write flags
+		 */
+		local_irq_save(flags);
+		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+		do {
+			if (bh_offset(bh) < bio_start ||
+			    bh_offset(bh) + bh->b_size > bio_end) {
+				if (buffer_async_write(bh))
+					under_io++;
+				continue;
+			}
+			clear_buffer_async_write(bh);
+			if (error)
+				buffer_io_error(bh);
+		} while ((bh = bh->b_this_page) != head);
+		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+		local_irq_restore(flags);
+		if (!under_io)
+			end_page_writeback(page);
+	}
 }
 
-static void put_io_page(struct ext4_io_page *io_page)
+static void ext4_release_io_end(ext4_io_end_t *io_end)
 {
-	if (atomic_dec_and_test(&io_page->p_count)) {
-		end_page_writeback(io_page->p_page);
-		put_page(io_page->p_page);
-		kmem_cache_free(io_page_cachep, io_page);
+	struct bio *bio, *next_bio;
+
+	BUG_ON(!list_empty(&io_end->list));
+	BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+	WARN_ON(io_end->handle);
+
+	if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
+		wake_up_all(ext4_ioend_wq(io_end->inode));
+
+	for (bio = io_end->bio; bio; bio = next_bio) {
+		next_bio = bio->bi_private;
+		ext4_finish_bio(bio);
+		bio_put(bio);
 	}
+	if (io_end->flag & EXT4_IO_END_DIRECT)
+		inode_dio_done(io_end->inode);
+	if (io_end->iocb)
+		aio_complete(io_end->iocb, io_end->result, 0);
+	kmem_cache_free(io_end_cachep, io_end);
 }
 
-void ext4_free_io_end(ext4_io_end_t *io)
+static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 {
-	int i;
+	struct inode *inode = io_end->inode;
 
-	BUG_ON(!io);
-	BUG_ON(!list_empty(&io->list));
-	BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
-
-	if (io->page)
-		put_page(io->page);
-	for (i = 0; i < io->num_io_pages; i++)
-		put_io_page(io->pages[i]);
-	io->num_io_pages = 0;
-	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
-		wake_up_all(ext4_ioend_wq(io->inode));
-	kmem_cache_free(io_end_cachep, io);
+	io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
+	/* Wake up anyone waiting on unwritten extent conversion */
+	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+		wake_up_all(ext4_ioend_wq(inode));
 }
 
-/* check a range of space and convert unwritten extents to written. */
+/*
+ * Check a range of space and convert unwritten extents to written. Note that
+ * we are protected from truncate touching same part of extent tree by the
+ * fact that truncate code waits for all DIO to finish (thus exclusion from
+ * direct IO is achieved) and also waits for PageWriteback bits. Thus we
+ * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
+ * completed (happens from ext4_free_ioend()).
+ */
 static int ext4_end_io(ext4_io_end_t *io)
 {
 	struct inode *inode = io->inode;
 	loff_t offset = io->offset;
 	ssize_t size = io->size;
+	handle_t *handle = io->handle;
 	int ret = 0;
 
 	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
 		   "list->prev 0x%p\n",
 		   io, inode->i_ino, io->list.next, io->list.prev);
 
-	ret = ext4_convert_unwritten_extents(inode, offset, size);
+	io->handle = NULL;	/* Following call will use up the handle */
+	ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
 	if (ret < 0) {
 		ext4_msg(inode->i_sb, KERN_EMERG,
 			 "failed to convert unwritten extents to written "
@@ -103,32 +168,22 @@ static int ext4_end_io(ext4_io_end_t *io)
 			 "(inode %lu, offset %llu, size %zd, error %d)",
 			 inode->i_ino, offset, size, ret);
 	}
-	if (io->iocb)
-		aio_complete(io->iocb, io->result, 0);
-
-	if (io->flag & EXT4_IO_END_DIRECT)
-		inode_dio_done(inode);
-	/* Wake up anyone waiting on unwritten extent conversion */
-	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
-		wake_up_all(ext4_ioend_wq(inode));
+	ext4_clear_io_unwritten_flag(io);
+	ext4_release_io_end(io);
 	return ret;
 }
 
-static void dump_completed_IO(struct inode *inode)
+static void dump_completed_IO(struct inode *inode, struct list_head *head)
 {
 #ifdef	EXT4FS_DEBUG
 	struct list_head *cur, *before, *after;
 	ext4_io_end_t *io, *io0, *io1;
-	unsigned long flags;
 
-	if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
-		ext4_debug("inode %lu completed_io list is empty\n",
-			   inode->i_ino);
+	if (list_empty(head))
 		return;
-	}
 
-	ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
-	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
+	ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
+	list_for_each_entry(io, head, list) {
 		cur = &io->list;
 		before = cur->prev;
 		io0 = container_of(before, ext4_io_end_t, list);
@@ -142,39 +197,40 @@ static void dump_completed_IO(struct inode *inode)
 }
 
 /* Add the io_end to per-inode completed end_io list. */
-void ext4_add_complete_io(ext4_io_end_t *io_end)
+static void ext4_add_complete_io(ext4_io_end_t *io_end)
 {
 	struct ext4_inode_info *ei = EXT4_I(io_end->inode);
 	struct workqueue_struct *wq;
 	unsigned long flags;
 
 	BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
-	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
-
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-	if (list_empty(&ei->i_completed_io_list)) {
-		io_end->flag |= EXT4_IO_END_QUEUED;
-		queue_work(wq, &io_end->work);
+	if (io_end->handle) {
+		wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
+		if (list_empty(&ei->i_rsv_conversion_list))
+			queue_work(wq, &ei->i_rsv_conversion_work);
+		list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
+	} else {
+		wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
+		if (list_empty(&ei->i_unrsv_conversion_list))
+			queue_work(wq, &ei->i_unrsv_conversion_work);
+		list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
 	}
-	list_add_tail(&io_end->list, &ei->i_completed_io_list);
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 }
 
 static int ext4_do_flush_completed_IO(struct inode *inode,
-				      ext4_io_end_t *work_io)
+				      struct list_head *head)
 {
 	ext4_io_end_t *io;
-	struct list_head unwritten, complete, to_free;
+	struct list_head unwritten;
 	unsigned long flags;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	int err, ret = 0;
 
-	INIT_LIST_HEAD(&complete);
-	INIT_LIST_HEAD(&to_free);
-
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-	dump_completed_IO(inode);
-	list_replace_init(&ei->i_completed_io_list, &unwritten);
+	dump_completed_IO(inode, head);
+	list_replace_init(head, &unwritten);
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 
 	while (!list_empty(&unwritten)) {
@@ -185,54 +241,25 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
 		err = ext4_end_io(io);
 		if (unlikely(!ret && err))
 			ret = err;
-
-		list_add_tail(&io->list, &complete);
-	}
-	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-	while (!list_empty(&complete)) {
-		io = list_entry(complete.next, ext4_io_end_t, list);
-		io->flag &= ~EXT4_IO_END_UNWRITTEN;
-		/* end_io context can not be destroyed now because it still
-		 * used by queued worker. Worker thread will destroy it later */
-		if (io->flag & EXT4_IO_END_QUEUED)
-			list_del_init(&io->list);
-		else
-			list_move(&io->list, &to_free);
-	}
-	/* If we are called from worker context, it is time to clear queued
-	 * flag, and destroy it's end_io if it was converted already */
-	if (work_io) {
-		work_io->flag &= ~EXT4_IO_END_QUEUED;
-		if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
-			list_add_tail(&work_io->list, &to_free);
-	}
-	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-
-	while (!list_empty(&to_free)) {
-		io = list_entry(to_free.next, ext4_io_end_t, list);
-		list_del_init(&io->list);
-		ext4_free_io_end(io);
 	}
 	return ret;
 }
 
 /*
- * work on completed aio dio IO, to convert unwritten extents to extents
+ * work on completed IO, to convert unwritten extents to extents
  */
-static void ext4_end_io_work(struct work_struct *work)
+void ext4_end_io_rsv_work(struct work_struct *work)
 {
-	ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
-	ext4_do_flush_completed_IO(io->inode, io);
+	struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+						  i_rsv_conversion_work);
+	ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
 }
 
-int ext4_flush_unwritten_io(struct inode *inode)
+void ext4_end_io_unrsv_work(struct work_struct *work)
 {
-	int ret;
-	WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
-		     !(inode->i_state & I_FREEING));
-	ret = ext4_do_flush_completed_IO(inode, NULL);
-	ext4_unwritten_wait(inode);
-	return ret;
+	struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+						  i_unrsv_conversion_work);
+	ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
 }
 
 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -241,74 +268,71 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
 	if (io) {
 		atomic_inc(&EXT4_I(inode)->i_ioend_count);
 		io->inode = inode;
-		INIT_WORK(&io->work, ext4_end_io_work);
 		INIT_LIST_HEAD(&io->list);
+		atomic_set(&io->count, 1);
 	}
 	return io;
 }
 
-/*
- * Print an buffer I/O error compatible with the fs/buffer.c.  This
- * provides compatibility with dmesg scrapers that look for a specific
- * buffer I/O error message.  We really need a unified error reporting
- * structure to userspace ala Digital Unix's uerf system, but it's
- * probably not going to happen in my lifetime, due to LKML politics...
- */
-static void buffer_io_error(struct buffer_head *bh)
+void ext4_put_io_end_defer(ext4_io_end_t *io_end)
 {
-	char b[BDEVNAME_SIZE];
-	printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
-			bdevname(bh->b_bdev, b),
-			(unsigned long long)bh->b_blocknr);
+	if (atomic_dec_and_test(&io_end->count)) {
+		if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+			ext4_release_io_end(io_end);
+			return;
+		}
+		ext4_add_complete_io(io_end);
+	}
+}
+
+int ext4_put_io_end(ext4_io_end_t *io_end)
+{
+	int err = 0;
+
+	if (atomic_dec_and_test(&io_end->count)) {
+		if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+			err = ext4_convert_unwritten_extents(io_end->handle,
+						io_end->inode, io_end->offset,
+						io_end->size);
+			io_end->handle = NULL;
+			ext4_clear_io_unwritten_flag(io_end);
+		}
+		ext4_release_io_end(io_end);
+	}
+	return err;
+}
+
+ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
+{
+	atomic_inc(&io_end->count);
+	return io_end;
 }
 
 static void ext4_end_bio(struct bio *bio, int error)
 {
 	ext4_io_end_t *io_end = bio->bi_private;
-	struct inode *inode;
-	int i;
 	sector_t bi_sector = bio->bi_sector;
 
 	BUG_ON(!io_end);
-	bio->bi_private = NULL;
 	bio->bi_end_io = NULL;
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = 0;
-	bio_put(bio);
-
-	for (i = 0; i < io_end->num_io_pages; i++) {
-		struct page *page = io_end->pages[i]->p_page;
-		struct buffer_head *bh, *head;
-		loff_t offset;
-		loff_t io_end_offset;
 
-		if (error) {
-			SetPageError(page);
-			set_bit(AS_EIO, &page->mapping->flags);
-			head = page_buffers(page);
-			BUG_ON(!head);
-
-			io_end_offset = io_end->offset + io_end->size;
-
-			offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
-			bh = head;
-			do {
-				if ((offset >= io_end->offset) &&
-				    (offset+bh->b_size <= io_end_offset))
-					buffer_io_error(bh);
-
-				offset += bh->b_size;
-				bh = bh->b_this_page;
-			} while (bh != head);
-		}
-
-		put_io_page(io_end->pages[i]);
+	if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+		/*
+		 * Link bio into list hanging from io_end. We have to do it
+		 * atomically as bio completions can be racing against each
+		 * other.
+		 */
+		bio->bi_private = xchg(&io_end->bio, bio);
+	} else {
+		ext4_finish_bio(bio);
+		bio_put(bio);
 	}
-	io_end->num_io_pages = 0;
-	inode = io_end->inode;
 
 	if (error) {
-		io_end->flag |= EXT4_IO_END_ERROR;
+		struct inode *inode = io_end->inode;
+
 		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
 			     "(offset %llu size %ld starting block %llu)",
 			     inode->i_ino,
@@ -317,13 +341,7 @@ static void ext4_end_bio(struct bio *bio, int error)
 			     (unsigned long long)
 			     bi_sector >> (inode->i_blkbits - 9));
 	}
-
-	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
-		ext4_free_io_end(io_end);
-		return;
-	}
-
-	ext4_add_complete_io(io_end);
+	ext4_put_io_end_defer(io_end);
 }
 
 void ext4_io_submit(struct ext4_io_submit *io)
@@ -337,84 +355,53 @@ void ext4_io_submit(struct ext4_io_submit *io)
 		bio_put(io->io_bio);
 	}
 	io->io_bio = NULL;
-	io->io_op = 0;
+}
+
+void ext4_io_submit_init(struct ext4_io_submit *io,
+			 struct writeback_control *wbc)
+{
+	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
+	io->io_bio = NULL;
 	io->io_end = NULL;
 }
 
-static int io_submit_init(struct ext4_io_submit *io,
-			  struct inode *inode,
-			  struct writeback_control *wbc,
-			  struct buffer_head *bh)
+static int io_submit_init_bio(struct ext4_io_submit *io,
+			      struct buffer_head *bh)
 {
-	ext4_io_end_t *io_end;
-	struct page *page = bh->b_page;
 	int nvecs = bio_get_nr_vecs(bh->b_bdev);
 	struct bio *bio;
 
-	io_end = ext4_init_io_end(inode, GFP_NOFS);
-	if (!io_end)
-		return -ENOMEM;
 	bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
+	if (!bio)
+		return -ENOMEM;
 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
-	bio->bi_private = io->io_end = io_end;
 	bio->bi_end_io = ext4_end_bio;
-
-	io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
-
+	bio->bi_private = ext4_get_io_end(io->io_end);
 	io->io_bio = bio;
-	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
 	io->io_next_block = bh->b_blocknr;
 	return 0;
 }
 
 static int io_submit_add_bh(struct ext4_io_submit *io,
-			    struct ext4_io_page *io_page,
 			    struct inode *inode,
-			    struct writeback_control *wbc,
 			    struct buffer_head *bh)
 {
-	ext4_io_end_t *io_end;
 	int ret;
 
-	if (buffer_new(bh)) {
-		clear_buffer_new(bh);
-		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
-	}
-
-	if (!buffer_mapped(bh) || buffer_delay(bh)) {
-		if (!buffer_mapped(bh))
-			clear_buffer_dirty(bh);
-		if (io->io_bio)
-			ext4_io_submit(io);
-		return 0;
-	}
-
 	if (io->io_bio && bh->b_blocknr != io->io_next_block) {
 submit_and_retry:
 		ext4_io_submit(io);
 	}
 	if (io->io_bio == NULL) {
-		ret = io_submit_init(io, inode, wbc, bh);
+		ret = io_submit_init_bio(io, bh);
 		if (ret)
 			return ret;
 	}
-	io_end = io->io_end;
-	if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
-	    (io_end->pages[io_end->num_io_pages-1] != io_page))
-		goto submit_and_retry;
-	if (buffer_uninit(bh))
-		ext4_set_io_unwritten_flag(inode, io_end);
-	io->io_end->size += bh->b_size;
-	io->io_next_block++;
 	ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
 	if (ret != bh->b_size)
 		goto submit_and_retry;
-	if ((io_end->num_io_pages == 0) ||
-	    (io_end->pages[io_end->num_io_pages-1] != io_page)) {
-		io_end->pages[io_end->num_io_pages++] = io_page;
-		atomic_inc(&io_page->p_count);
-	}
+	io->io_next_block++;
 	return 0;
 }
 
@@ -424,33 +411,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 			struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
-	unsigned block_start, block_end, blocksize;
-	struct ext4_io_page *io_page;
+	unsigned block_start, blocksize;
 	struct buffer_head *bh, *head;
 	int ret = 0;
+	int nr_submitted = 0;
 
 	blocksize = 1 << inode->i_blkbits;
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(PageWriteback(page));
 
-	io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
-	if (!io_page) {
-		set_page_dirty(page);
-		unlock_page(page);
-		return -ENOMEM;
-	}
-	io_page->p_page = page;
-	atomic_set(&io_page->p_count, 1);
-	get_page(page);
 	set_page_writeback(page);
 	ClearPageError(page);
 
-	for (bh = head = page_buffers(page), block_start = 0;
-	     bh != head || !block_start;
-	     block_start = block_end, bh = bh->b_this_page) {
-
-		block_end = block_start + blocksize;
+	/*
+	 * In the first loop we prepare and mark buffers to submit. We have to
+	 * mark all buffers in the page before submitting so that
+	 * end_page_writeback() cannot be called from ext4_bio_end_io() when IO
+	 * on the first buffer finishes and we are still working on submitting
+	 * the second buffer.
+	 */
+	bh = head = page_buffers(page);
+	do {
+		block_start = bh_offset(bh);
 		if (block_start >= len) {
 			/*
 			 * Comments copied from block_write_full_page_endio:
@@ -463,32 +446,57 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 			 * mapped, and writes to that region are not written
 			 * out to the file."
 			 */
-			zero_user_segment(page, block_start, block_end);
+			zero_user_segment(page, block_start,
+					  block_start + blocksize);
 			clear_buffer_dirty(bh);
 			set_buffer_uptodate(bh);
 			continue;
 		}
-		clear_buffer_dirty(bh);
-		ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
+		if (!buffer_dirty(bh) || buffer_delay(bh) ||
+		    !buffer_mapped(bh) || buffer_unwritten(bh)) {
+			/* A hole? We can safely clear the dirty bit */
+			if (!buffer_mapped(bh))
+				clear_buffer_dirty(bh);
+			if (io->io_bio)
+				ext4_io_submit(io);
+			continue;
+		}
+		if (buffer_new(bh)) {
+			clear_buffer_new(bh);
+			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+		}
+		set_buffer_async_write(bh);
+	} while ((bh = bh->b_this_page) != head);
+
+	/* Now submit buffers to write */
+	bh = head = page_buffers(page);
+	do {
+		if (!buffer_async_write(bh))
+			continue;
+		ret = io_submit_add_bh(io, inode, bh);
 		if (ret) {
 			/*
 			 * We only get here on ENOMEM.  Not much else
 			 * we can do but mark the page as dirty, and
 			 * better luck next time.
 			 */
-			set_page_dirty(page);
+			redirty_page_for_writepage(wbc, page);
 			break;
 		}
+		nr_submitted++;
+		clear_buffer_dirty(bh);
+	} while ((bh = bh->b_this_page) != head);
+
+	/* Error stopped previous loop? Clean up buffers... */
+	if (ret) {
+		do {
+			clear_buffer_async_write(bh);
+			bh = bh->b_this_page;
+		} while (bh != head);
 	}
 	unlock_page(page);
-	/*
-	 * If the page was truncated before we could do the writeback,
-	 * or we had a memory allocation error while trying to write
-	 * the first buffer head, we won't have submitted any pages for
-	 * I/O.  In that case we need to make sure we've cleared the
-	 * PageWriteback bit from the page to prevent the system from
-	 * wedging later on.
-	 */
-	put_io_page(io_page);
+	/* Nothing submitted - we have to end page writeback */
+	if (!nr_submitted)
+		end_page_writeback(page);
 	return ret;
 }
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index d99387b89ed..c5adbb318a9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb,
 	ext4_fsblk_t end = start + input->blocks_count;
 	ext4_group_t group = input->group;
 	ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
-	unsigned overhead = ext4_group_overhead_blocks(sb, group);
-	ext4_fsblk_t metaend = start + overhead;
+	unsigned overhead;
+	ext4_fsblk_t metaend;
 	struct buffer_head *bh = NULL;
 	ext4_grpblk_t free_blocks_count, offset;
 	int err = -EINVAL;
 
+	if (group != sbi->s_groups_count) {
+		ext4_warning(sb, "Cannot add at group %u (only %u groups)",
+			     input->group, sbi->s_groups_count);
+		return -EINVAL;
+	}
+
+	overhead = ext4_group_overhead_blocks(sb, group);
+	metaend = start + overhead;
 	input->free_blocks_count = free_blocks_count =
 		input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
 
@@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb,
 		       free_blocks_count, input->reserved_blocks);
 
 	ext4_get_group_no_and_offset(sb, start, NULL, &offset);
-	if (group != sbi->s_groups_count)
-		ext4_warning(sb, "Cannot add at group %u (only %u groups)",
-			     input->group, sbi->s_groups_count);
-	else if (offset != 0)
+	if (offset != 0)
 			ext4_warning(sb, "Last group not full");
 	else if (input->reserved_blocks > input->blocks_count / 5)
 		ext4_warning(sb, "Reserved blocks too high (%u)",
@@ -272,7 +277,7 @@ next_group:
 		if (start_blk >= last_blk)
 			goto next_group;
 		group_data[bb_index].block_bitmap = start_blk++;
-		ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+		group = ext4_get_group_number(sb, start_blk - 1);
 		group -= group_data[0].group;
 		group_data[group].free_blocks_count--;
 		if (flexbg_size > 1)
@@ -284,7 +289,7 @@ next_group:
 		if (start_blk >= last_blk)
 			goto next_group;
 		group_data[ib_index].inode_bitmap = start_blk++;
-		ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+		group = ext4_get_group_number(sb, start_blk - 1);
 		group -= group_data[0].group;
 		group_data[group].free_blocks_count--;
 		if (flexbg_size > 1)
@@ -296,7 +301,7 @@ next_group:
 		if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
 			goto next_group;
 		group_data[it_index].inode_table = start_blk;
-		ext4_get_group_no_and_offset(sb, start_blk, &group, NULL);
+		group = ext4_get_group_number(sb, start_blk - 1);
 		group -= group_data[0].group;
 		group_data[group].free_blocks_count -=
 					EXT4_SB(sb)->s_itb_per_group;
@@ -333,8 +338,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
 	int err;
 
 	bh = sb_getblk(sb, blk);
-	if (!bh)
-		return ERR_PTR(-EIO);
+	if (unlikely(!bh))
+		return ERR_PTR(-ENOMEM);
 	if ((err = ext4_journal_get_write_access(handle, bh))) {
 		brelse(bh);
 		bh = ERR_PTR(err);
@@ -392,7 +397,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
 		ext4_group_t group;
 		int err;
 
-		ext4_get_group_no_and_offset(sb, block, &group, NULL);
+		group = ext4_get_group_number(sb, block);
 		start = ext4_group_first_block_no(sb, group);
 		group -= flex_gd->groups[0].group;
 
@@ -410,8 +415,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
 			return err;
 
 		bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
-		if (!bh)
-			return -EIO;
+		if (unlikely(!bh))
+			return -ENOMEM;
 
 		err = ext4_journal_get_write_access(handle, bh);
 		if (err)
@@ -466,7 +471,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
 	meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
 
 	/* This transaction may be extended/restarted along the way */
-	handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -500,8 +505,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
 				goto out;
 
 			gdb = sb_getblk(sb, block);
-			if (!gdb) {
-				err = -EIO;
+			if (unlikely(!gdb)) {
+				err = -ENOMEM;
 				goto out;
 			}
 
@@ -1031,7 +1036,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
 	handle_t *handle;
 	int err = 0, err2;
 
-	handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
 	if (IS_ERR(handle)) {
 		group = 1;
 		err = PTR_ERR(handle);
@@ -1064,8 +1069,8 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
 					ext4_bg_has_super(sb, group));
 
 		bh = sb_getblk(sb, backup_block);
-		if (!bh) {
-			err = -EIO;
+		if (unlikely(!bh)) {
+			err = -ENOMEM;
 			break;
 		}
 		ext4_debug("update metadata backup %llu(+%llu)\n",
@@ -1168,7 +1173,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
 static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
 {
 	struct buffer_head *bh = sb_getblk(sb, block);
-	if (!bh)
+	if (unlikely(!bh))
 		return NULL;
 	if (!bh_uptodate_or_lock(bh)) {
 		if (bh_submit_read(bh) < 0) {
@@ -1247,7 +1252,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
 
 		ext4_inode_table_set(sb, gdp, group_data->inode_table);
 		ext4_free_group_clusters_set(sb, gdp,
-					     EXT4_B2C(sbi, group_data->free_blocks_count));
+			EXT4_NUM_B2C(sbi, group_data->free_blocks_count));
 		ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
 		if (ext4_has_group_desc_csum(sb))
 			ext4_itable_unused_set(sb, gdp,
@@ -1341,6 +1346,8 @@ static void ext4_update_super(struct super_block *sb,
 
 	/* Update the global fs size fields */
 	sbi->s_groups_count += flex_gd->count;
+	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
 
 	/* Update the reserved block counts only once the new group is
 	 * active. */
@@ -1349,7 +1356,7 @@ static void ext4_update_super(struct super_block *sb,
 
 	/* Update the free space counts */
 	percpu_counter_add(&sbi->s_freeclusters_counter,
-			   EXT4_B2C(sbi, free_blocks));
+			   EXT4_NUM_B2C(sbi, free_blocks));
 	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
 
@@ -1360,8 +1367,8 @@ static void ext4_update_super(struct super_block *sb,
 	    sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group;
 		flex_group = ext4_flex_group(sbi, group_data[0].group);
-		atomic_add(EXT4_B2C(sbi, free_blocks),
-			   &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
+			     &sbi->s_flex_groups[flex_group].free_clusters);
 		atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
 			   &sbi->s_flex_groups[flex_group].free_inodes);
 	}
@@ -1412,7 +1419,7 @@ static int ext4_flex_group_add(struct super_block *sb,
 	 * modify each of the reserved GDT dindirect blocks.
 	 */
 	credit = flex_gd->count * 4 + reserved_gdb;
-	handle = ext4_journal_start_sb(sb, credit);
+	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit);
 	if (IS_ERR(handle)) {
 		err = PTR_ERR(handle);
 		goto exit;
@@ -1506,10 +1513,12 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
 		group_data[i].blocks_count = blocks_per_group;
 		overhead = ext4_group_overhead_blocks(sb, group + i);
 		group_data[i].free_blocks_count = blocks_per_group - overhead;
-		if (ext4_has_group_desc_csum(sb))
+		if (ext4_has_group_desc_csum(sb)) {
 			flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
 					       EXT4_BG_INODE_UNINIT;
-		else
+			if (!test_opt(sb, INIT_INODE_TABLE))
+				flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED;
+		} else
 			flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
 	}
 
@@ -1547,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
 		le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
 	struct inode *inode = NULL;
-	int gdb_off, gdb_num;
+	int gdb_off;
 	int err;
 	__u16 bg_flags = 0;
 
-	gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
 	gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
 
 	if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -1594,7 +1602,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	err = ext4_alloc_flex_bg_array(sb, input->group + 1);
 	if (err)
-		return err;
+		goto out;
 
 	err = ext4_mb_alloc_groupinfo(sb, input->group + 1);
 	if (err)
@@ -1622,7 +1630,7 @@ static int ext4_group_extend_no_check(struct super_block *sb,
 	/* We will update the superblock, one block bitmap, and
 	 * one group descriptor via ext4_group_add_blocks().
 	 */
-	handle = ext4_journal_start_sb(sb, 3);
+	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3);
 	if (IS_ERR(handle)) {
 		err = PTR_ERR(handle);
 		ext4_warning(sb, "error %d on journal start", err);
@@ -1652,12 +1660,10 @@ errout:
 		err = err2;
 
 	if (!err) {
-		ext4_fsblk_t first_block;
-		first_block = ext4_group_first_block_no(sb, 0);
 		if (test_opt(sb, DEBUG))
 			printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
 			       "blocks\n", ext4_blocks_count(es));
-		update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block,
+		update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
 			       (char *)es, sizeof(struct ext4_super_block), 0);
 	}
 	return err;
@@ -1786,7 +1792,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
 		credits += 3;	/* block bitmap, bg descriptor, resize inode */
 	}
 
-	handle = ext4_journal_start_sb(sb, credits);
+	handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -1877,7 +1883,11 @@ retry:
 		/* Nothing need to do */
 		return 0;
 
-	ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
+	n_group = ext4_get_group_number(sb, n_blocks_count - 1);
+	if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
+		ext4_warning(sb, "resize would cause inodes_count overflow");
+		return -EINVAL;
+	}
 	ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
 
 	n_desc_blocks = num_desc_blocks(sb, n_group + 1);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3d4fb81bacd..85b3dd60169 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -69,8 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 static void ext4_clear_journal_err(struct super_block *sb,
 				   struct ext4_super_block *es);
 static int ext4_sync_fs(struct super_block *sb, int wait);
-static const char *ext4_decode_error(struct super_block *sb, int errno,
-				     char nbuf[16]);
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
 static int ext4_remount(struct super_block *sb, int *flags, char *data);
 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int ext4_unfreeze(struct super_block *sb);
@@ -83,6 +82,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
 static void ext4_destroy_lazyinit_thread(void);
 static void ext4_unregister_li_request(struct super_block *sb);
 static void ext4_clear_request_list(void);
+static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
 
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext2_fs_type = {
@@ -92,6 +92,8 @@ static struct file_system_type ext2_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("ext2");
+MODULE_ALIAS("ext2");
 #define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
 #else
 #define IS_EXT2_SB(sb) (0)
@@ -106,6 +108,8 @@ static struct file_system_type ext3_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("ext3");
+MODULE_ALIAS("ext3");
 #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
 #else
 #define IS_EXT3_SB(sb) (0)
@@ -296,107 +300,6 @@ void ext4_itable_unused_set(struct super_block *sb,
 }
 
 
-/* Just increment the non-pointer handle value */
-static handle_t *ext4_get_nojournal(void)
-{
-	handle_t *handle = current->journal_info;
-	unsigned long ref_cnt = (unsigned long)handle;
-
-	BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
-
-	ref_cnt++;
-	handle = (handle_t *)ref_cnt;
-
-	current->journal_info = handle;
-	return handle;
-}
-
-
-/* Decrement the non-pointer handle value */
-static void ext4_put_nojournal(handle_t *handle)
-{
-	unsigned long ref_cnt = (unsigned long)handle;
-
-	BUG_ON(ref_cnt == 0);
-
-	ref_cnt--;
-	handle = (handle_t *)ref_cnt;
-
-	current->journal_info = handle;
-}
-
-/*
- * Wrappers for jbd2_journal_start/end.
- */
-handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
-{
-	journal_t *journal;
-
-	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
-	if (sb->s_flags & MS_RDONLY)
-		return ERR_PTR(-EROFS);
-
-	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
-	journal = EXT4_SB(sb)->s_journal;
-	if (!journal)
-		return ext4_get_nojournal();
-	/*
-	 * Special case here: if the journal has aborted behind our
-	 * backs (eg. EIO in the commit thread), then we still need to
-	 * take the FS itself readonly cleanly.
-	 */
-	if (is_journal_aborted(journal)) {
-		ext4_abort(sb, "Detected aborted journal");
-		return ERR_PTR(-EROFS);
-	}
-	return jbd2_journal_start(journal, nblocks);
-}
-
-int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
-{
-	struct super_block *sb;
-	int err;
-	int rc;
-
-	if (!ext4_handle_valid(handle)) {
-		ext4_put_nojournal(handle);
-		return 0;
-	}
-	sb = handle->h_transaction->t_journal->j_private;
-	err = handle->h_err;
-	rc = jbd2_journal_stop(handle);
-
-	if (!err)
-		err = rc;
-	if (err)
-		__ext4_std_error(sb, where, line, err);
-	return err;
-}
-
-void ext4_journal_abort_handle(const char *caller, unsigned int line,
-			       const char *err_fn, struct buffer_head *bh,
-			       handle_t *handle, int err)
-{
-	char nbuf[16];
-	const char *errstr = ext4_decode_error(NULL, err, nbuf);
-
-	BUG_ON(!ext4_handle_valid(handle));
-
-	if (bh)
-		BUFFER_TRACE(bh, "abort");
-
-	if (!handle->h_err)
-		handle->h_err = err;
-
-	if (is_handle_aborted(handle))
-		return;
-
-	printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
-	       caller, line, errstr, err_fn);
-
-	jbd2_journal_abort_handle(handle);
-}
-
 static void __save_error_info(struct super_block *sb, const char *func,
 			    unsigned int line)
 {
@@ -452,10 +355,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
 	struct super_block		*sb = journal->j_private;
 	struct ext4_sb_info		*sbi = EXT4_SB(sb);
 	int				error = is_journal_aborted(journal);
-	struct ext4_journal_cb_entry	*jce, *tmp;
+	struct ext4_journal_cb_entry	*jce;
 
+	BUG_ON(txn->t_state == T_FINISHED);
 	spin_lock(&sbi->s_md_lock);
-	list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
+	while (!list_empty(&txn->t_private_list)) {
+		jce = list_entry(txn->t_private_list.next,
+				 struct ext4_journal_cb_entry, jce_list);
 		list_del_init(&jce->jce_list);
 		spin_unlock(&sbi->s_md_lock);
 		jce->jce_func(sb, jce, error);
@@ -493,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb)
 	}
 	if (test_opt(sb, ERRORS_RO)) {
 		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+		/*
+		 * Make sure updated value of ->s_mount_flags will be visible
+		 * before ->s_flags update
+		 */
+		smp_wmb();
 		sb->s_flags |= MS_RDONLY;
 	}
 	if (test_opt(sb, ERRORS_PANIC))
@@ -517,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function,
 	ext4_handle_error(sb);
 }
 
-void ext4_error_inode(struct inode *inode, const char *function,
-		      unsigned int line, ext4_fsblk_t block,
-		      const char *fmt, ...)
+void __ext4_error_inode(struct inode *inode, const char *function,
+			unsigned int line, ext4_fsblk_t block,
+			const char *fmt, ...)
 {
 	va_list args;
 	struct va_format vaf;
@@ -546,14 +457,14 @@ void ext4_error_inode(struct inode *inode, const char *function,
 	ext4_handle_error(inode->i_sb);
 }
 
-void ext4_error_file(struct file *file, const char *function,
-		     unsigned int line, ext4_fsblk_t block,
-		     const char *fmt, ...)
+void __ext4_error_file(struct file *file, const char *function,
+		       unsigned int line, ext4_fsblk_t block,
+		       const char *fmt, ...)
 {
 	va_list args;
 	struct va_format vaf;
 	struct ext4_super_block *es;
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	char pathname[80], *path;
 
 	es = EXT4_SB(inode->i_sb)->s_es;
@@ -582,8 +493,8 @@ void ext4_error_file(struct file *file, const char *function,
 	ext4_handle_error(inode->i_sb);
 }
 
-static const char *ext4_decode_error(struct super_block *sb, int errno,
-				     char nbuf[16])
+const char *ext4_decode_error(struct super_block *sb, int errno,
+			      char nbuf[16])
 {
 	char *errstr = NULL;
 
@@ -665,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function,
 
 	if ((sb->s_flags & MS_RDONLY) == 0) {
 		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
-		sb->s_flags |= MS_RDONLY;
 		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+		/*
+		 * Make sure updated value of ->s_mount_flags will be visible
+		 * before ->s_flags update
+		 */
+		smp_wmb();
+		sb->s_flags |= MS_RDONLY;
 		if (EXT4_SB(sb)->s_journal)
 			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 		save_error_info(sb, function, line);
@@ -675,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function,
 		panic("EXT4-fs panic from previous error\n");
 }
 
-void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+void __ext4_msg(struct super_block *sb,
+		const char *prefix, const char *fmt, ...)
 {
 	struct va_format vaf;
 	va_list args;
@@ -798,22 +715,19 @@ fail:
 /*
  * Release the journal device
  */
-static int ext4_blkdev_put(struct block_device *bdev)
+static void ext4_blkdev_put(struct block_device *bdev)
 {
-	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
-static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
+static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
 {
 	struct block_device *bdev;
-	int ret = -ENODEV;
-
 	bdev = sbi->journal_bdev;
 	if (bdev) {
-		ret = ext4_blkdev_put(bdev);
+		ext4_blkdev_put(bdev);
 		sbi->journal_bdev = NULL;
 	}
-	return ret;
 }
 
 static inline struct inode *orphan_list_entry(struct list_head *l)
@@ -848,8 +762,10 @@ static void ext4_put_super(struct super_block *sb)
 	ext4_unregister_li_request(sb);
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
-	flush_workqueue(sbi->dio_unwritten_wq);
-	destroy_workqueue(sbi->dio_unwritten_wq);
+	flush_workqueue(sbi->unrsv_conversion_wq);
+	flush_workqueue(sbi->rsv_conversion_wq);
+	destroy_workqueue(sbi->unrsv_conversion_wq);
+	destroy_workqueue(sbi->rsv_conversion_wq);
 
 	if (sbi->s_journal) {
 		err = jbd2_journal_destroy(sbi->s_journal);
@@ -858,6 +774,7 @@ static void ext4_put_super(struct super_block *sb)
 			ext4_abort(sb, "Couldn't clean up the journal");
 	}
 
+	ext4_es_unregister_shrinker(sbi);
 	del_timer(&sbi->s_err_report);
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
@@ -885,6 +802,7 @@ static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+	percpu_counter_destroy(&sbi->s_extent_cache_cnt);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -939,11 +857,13 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 		return NULL;
 
 	ei->vfs_inode.i_version = 1;
-	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
 	INIT_LIST_HEAD(&ei->i_prealloc_list);
 	spin_lock_init(&ei->i_prealloc_lock);
 	ext4_es_init_tree(&ei->i_es_tree);
 	rwlock_init(&ei->i_es_lock);
+	INIT_LIST_HEAD(&ei->i_es_lru);
+	ei->i_es_lru_nr = 0;
+	ei->i_touch_when = 0;
 	ei->i_reserved_data_blocks = 0;
 	ei->i_reserved_meta_blocks = 0;
 	ei->i_allocated_meta_blocks = 0;
@@ -954,12 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_reserved_quota = 0;
 #endif
 	ei->jinode = NULL;
-	INIT_LIST_HEAD(&ei->i_completed_io_list);
+	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+	INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
 	spin_lock_init(&ei->i_completed_io_lock);
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
 	atomic_set(&ei->i_ioend_count, 0);
 	atomic_set(&ei->i_unwritten, 0);
+	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+	INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
 
 	return &ei->vfs_inode;
 }
@@ -1031,6 +954,7 @@ void ext4_clear_inode(struct inode *inode)
 	dquot_drop(inode);
 	ext4_discard_preallocations(inode);
 	ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+	ext4_es_lru_del(inode);
 	if (EXT4_I(inode)->jinode) {
 		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
 					       EXT4_I(inode)->jinode);
@@ -1186,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = {
 	.dirty_inode	= ext4_dirty_inode,
 	.drop_inode	= ext4_drop_inode,
 	.evict_inode	= ext4_evict_inode,
+	.sync_fs	= ext4_sync_fs_nojournal,
 	.put_super	= ext4_put_super,
 	.statfs		= ext4_statfs,
 	.remount_fs	= ext4_remount,
@@ -1280,8 +1205,8 @@ static const match_table_t tokens = {
 	{Opt_stripe, "stripe=%u"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
-	{Opt_mblk_io_submit, "mblk_io_submit"},
-	{Opt_nomblk_io_submit, "nomblk_io_submit"},
+	{Opt_removed, "mblk_io_submit"},
+	{Opt_removed, "nomblk_io_submit"},
 	{Opt_block_validity, "block_validity"},
 	{Opt_noblock_validity, "noblock_validity"},
 	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1337,6 +1262,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	char *qname;
+	int ret = -1;
 
 	if (sb_any_quota_loaded(sb) &&
 		!sbi->s_qf_names[qtype]) {
@@ -1345,29 +1271,37 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
 			"quota options when quota turned on");
 		return -1;
 	}
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+		ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
+			 "when QUOTA feature is enabled");
+		return -1;
+	}
 	qname = match_strdup(args);
 	if (!qname) {
 		ext4_msg(sb, KERN_ERR,
 			"Not enough memory for storing quotafile name");
 		return -1;
 	}
-	if (sbi->s_qf_names[qtype] &&
-		strcmp(sbi->s_qf_names[qtype], qname)) {
-		ext4_msg(sb, KERN_ERR,
-			"%s quota file already specified", QTYPE2NAME(qtype));
-		kfree(qname);
-		return -1;
+	if (sbi->s_qf_names[qtype]) {
+		if (strcmp(sbi->s_qf_names[qtype], qname) == 0)
+			ret = 1;
+		else
+			ext4_msg(sb, KERN_ERR,
+				 "%s quota file already specified",
+				 QTYPE2NAME(qtype));
+		goto errout;
 	}
-	sbi->s_qf_names[qtype] = qname;
-	if (strchr(sbi->s_qf_names[qtype], '/')) {
+	if (strchr(qname, '/')) {
 		ext4_msg(sb, KERN_ERR,
 			"quotafile must be on filesystem root");
-		kfree(sbi->s_qf_names[qtype]);
-		sbi->s_qf_names[qtype] = NULL;
-		return -1;
+		goto errout;
 	}
+	sbi->s_qf_names[qtype] = qname;
 	set_opt(sb, QUOTA);
 	return 1;
+errout:
+	kfree(qname);
+	return ret;
 }
 
 static int clear_qf_name(struct super_block *sb, int qtype)
@@ -1381,10 +1315,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
 			" when quota turned on");
 		return -1;
 	}
-	/*
-	 * The space will be released later when all options are confirmed
-	 * to be correct
-	 */
+	kfree(sbi->s_qf_names[qtype]);
 	sbi->s_qf_names[qtype] = NULL;
 	return 1;
 }
@@ -1404,6 +1335,9 @@ static int clear_qf_name(struct super_block *sb, int qtype)
 #define MOPT_QFMT	MOPT_NOSUPPORT
 #endif
 #define MOPT_DATAJ	0x0080
+#define MOPT_NO_EXT2	0x0100
+#define MOPT_NO_EXT3	0x0200
+#define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
 
 static const struct mount_opts {
 	int	token;
@@ -1414,25 +1348,31 @@ static const struct mount_opts {
 	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
 	{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
 	{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
-	{Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET},
-	{Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR},
 	{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
 	{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
-	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET},
-	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR},
+	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
+	 MOPT_EXT4_ONLY | MOPT_SET},
+	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
+	 MOPT_EXT4_ONLY | MOPT_CLEAR},
 	{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
 	{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
-	{Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT},
-	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT},
-	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET},
+	{Opt_delalloc, EXT4_MOUNT_DELALLOC,
+	 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
+	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
+	 MOPT_EXT4_ONLY | MOPT_CLEAR | MOPT_EXPLICIT},
+	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
+	 MOPT_EXT4_ONLY | MOPT_SET},
 	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
-				    EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET},
-	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET},
+				    EXT4_MOUNT_JOURNAL_CHECKSUM),
+	 MOPT_EXT4_ONLY | MOPT_SET},
+	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
 	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
 	{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
 	{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
-	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET},
-	{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR},
+	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
+	 MOPT_NO_EXT2 | MOPT_SET},
+	{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
+	 MOPT_NO_EXT2 | MOPT_CLEAR},
 	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
 	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
 	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
@@ -1444,9 +1384,14 @@ static const struct mount_opts {
 	{Opt_inode_readahead_blks, 0, MOPT_GTE0},
 	{Opt_init_itable, 0, MOPT_GTE0},
 	{Opt_stripe, 0, MOPT_GTE0},
-	{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
-	{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
-	{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
+	{Opt_resuid, 0, MOPT_GTE0},
+	{Opt_resgid, 0, MOPT_GTE0},
+	{Opt_journal_dev, 0, MOPT_GTE0},
+	{Opt_journal_ioprio, 0, MOPT_GTE0},
+	{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
+	{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
+	{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
+	 MOPT_NO_EXT2 | MOPT_DATAJ},
 	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
 	{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -1496,8 +1441,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 	else if (token == Opt_offgrpjquota)
 		return clear_qf_name(sb, GRPQUOTA);
 #endif
-	if (args->from && match_int(args, &arg))
-		return -1;
 	switch (token) {
 	case Opt_noacl:
 	case Opt_nouser_xattr:
@@ -1506,138 +1449,156 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 	case Opt_sb:
 		return 1;	/* handled by get_sb_block() */
 	case Opt_removed:
-		ext4_msg(sb, KERN_WARNING,
-			 "Ignoring removed %s option", opt);
+		ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
 		return 1;
-	case Opt_resuid:
+	case Opt_abort:
+		sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
+		return 1;
+	case Opt_i_version:
+		sb->s_flags |= MS_I_VERSION;
+		return 1;
+	}
+
+	for (m = ext4_mount_opts; m->token != Opt_err; m++)
+		if (token == m->token)
+			break;
+
+	if (m->token == Opt_err) {
+		ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
+			 "or missing value", opt);
+		return -1;
+	}
+
+	if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Mount option \"%s\" incompatible with ext2", opt);
+		return -1;
+	}
+	if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Mount option \"%s\" incompatible with ext3", opt);
+		return -1;
+	}
+
+	if (args->from && match_int(args, &arg))
+		return -1;
+	if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
+		return -1;
+	if (m->flags & MOPT_EXPLICIT)
+		set_opt2(sb, EXPLICIT_DELALLOC);
+	if (m->flags & MOPT_CLEAR_ERR)
+		clear_opt(sb, ERRORS_MASK);
+	if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
+		ext4_msg(sb, KERN_ERR, "Cannot change quota "
+			 "options when quota turned on");
+		return -1;
+	}
+
+	if (m->flags & MOPT_NOSUPPORT) {
+		ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
+	} else if (token == Opt_commit) {
+		if (arg == 0)
+			arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
+		sbi->s_commit_interval = HZ * arg;
+	} else if (token == Opt_max_batch_time) {
+		if (arg == 0)
+			arg = EXT4_DEF_MAX_BATCH_TIME;
+		sbi->s_max_batch_time = arg;
+	} else if (token == Opt_min_batch_time) {
+		sbi->s_min_batch_time = arg;
+	} else if (token == Opt_inode_readahead_blks) {
+		if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
+			ext4_msg(sb, KERN_ERR,
+				 "EXT4-fs: inode_readahead_blks must be "
+				 "0 or a power of 2 smaller than 2^31");
+			return -1;
+		}
+		sbi->s_inode_readahead_blks = arg;
+	} else if (token == Opt_init_itable) {
+		set_opt(sb, INIT_INODE_TABLE);
+		if (!args->from)
+			arg = EXT4_DEF_LI_WAIT_MULT;
+		sbi->s_li_wait_mult = arg;
+	} else if (token == Opt_max_dir_size_kb) {
+		sbi->s_max_dir_size_kb = arg;
+	} else if (token == Opt_stripe) {
+		sbi->s_stripe = arg;
+	} else if (token == Opt_resuid) {
 		uid = make_kuid(current_user_ns(), arg);
 		if (!uid_valid(uid)) {
 			ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
 			return -1;
 		}
 		sbi->s_resuid = uid;
-		return 1;
-	case Opt_resgid:
+	} else if (token == Opt_resgid) {
 		gid = make_kgid(current_user_ns(), arg);
 		if (!gid_valid(gid)) {
 			ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
 			return -1;
 		}
 		sbi->s_resgid = gid;
-		return 1;
-	case Opt_abort:
-		sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
-		return 1;
-	case Opt_i_version:
-		sb->s_flags |= MS_I_VERSION;
-		return 1;
-	case Opt_journal_dev:
+	} else if (token == Opt_journal_dev) {
 		if (is_remount) {
 			ext4_msg(sb, KERN_ERR,
 				 "Cannot specify journal on remount");
 			return -1;
 		}
 		*journal_devnum = arg;
-		return 1;
-	case Opt_journal_ioprio:
-		if (arg < 0 || arg > 7)
-			return -1;
-		*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
-		return 1;
-	}
-
-	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
-		if (token != m->token)
-			continue;
-		if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
-			return -1;
-		if (m->flags & MOPT_EXPLICIT)
-			set_opt2(sb, EXPLICIT_DELALLOC);
-		if (m->flags & MOPT_CLEAR_ERR)
-			clear_opt(sb, ERRORS_MASK);
-		if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
-			ext4_msg(sb, KERN_ERR, "Cannot change quota "
-				 "options when quota turned on");
+	} else if (token == Opt_journal_ioprio) {
+		if (arg > 7) {
+			ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
+				 " (must be 0-7)");
 			return -1;
 		}
-
-		if (m->flags & MOPT_NOSUPPORT) {
-			ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
-		} else if (token == Opt_commit) {
-			if (arg == 0)
-				arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
-			sbi->s_commit_interval = HZ * arg;
-		} else if (token == Opt_max_batch_time) {
-			if (arg == 0)
-				arg = EXT4_DEF_MAX_BATCH_TIME;
-			sbi->s_max_batch_time = arg;
-		} else if (token == Opt_min_batch_time) {
-			sbi->s_min_batch_time = arg;
-		} else if (token == Opt_inode_readahead_blks) {
-			if (arg > (1 << 30))
-				return -1;
-			if (arg && !is_power_of_2(arg)) {
+		*journal_ioprio =
+			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
+	} else if (m->flags & MOPT_DATAJ) {
+		if (is_remount) {
+			if (!sbi->s_journal)
+				ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
+			else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
 				ext4_msg(sb, KERN_ERR,
-					 "EXT4-fs: inode_readahead_blks"
-					 " must be a power of 2");
-				return -1;
-			}
-			sbi->s_inode_readahead_blks = arg;
-		} else if (token == Opt_init_itable) {
-			set_opt(sb, INIT_INODE_TABLE);
-			if (!args->from)
-				arg = EXT4_DEF_LI_WAIT_MULT;
-			sbi->s_li_wait_mult = arg;
-		} else if (token == Opt_max_dir_size_kb) {
-			sbi->s_max_dir_size_kb = arg;
-		} else if (token == Opt_stripe) {
-			sbi->s_stripe = arg;
-		} else if (m->flags & MOPT_DATAJ) {
-			if (is_remount) {
-				if (!sbi->s_journal)
-					ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
-				else if (test_opt(sb, DATA_FLAGS) !=
-					 m->mount_opt) {
-					ext4_msg(sb, KERN_ERR,
 					 "Cannot change data mode on remount");
-					return -1;
-				}
-			} else {
-				clear_opt(sb, DATA_FLAGS);
-				sbi->s_mount_opt |= m->mount_opt;
-			}
-#ifdef CONFIG_QUOTA
-		} else if (m->flags & MOPT_QFMT) {
-			if (sb_any_quota_loaded(sb) &&
-			    sbi->s_jquota_fmt != m->mount_opt) {
-				ext4_msg(sb, KERN_ERR, "Cannot "
-					 "change journaled quota options "
-					 "when quota turned on");
 				return -1;
 			}
-			sbi->s_jquota_fmt = m->mount_opt;
-#endif
 		} else {
-			if (!args->from)
-				arg = 1;
-			if (m->flags & MOPT_CLEAR)
-				arg = !arg;
-			else if (unlikely(!(m->flags & MOPT_SET))) {
-				ext4_msg(sb, KERN_WARNING,
-					 "buggy handling of option %s", opt);
-				WARN_ON(1);
-				return -1;
-			}
-			if (arg != 0)
-				sbi->s_mount_opt |= m->mount_opt;
-			else
-				sbi->s_mount_opt &= ~m->mount_opt;
+			clear_opt(sb, DATA_FLAGS);
+			sbi->s_mount_opt |= m->mount_opt;
 		}
-		return 1;
+#ifdef CONFIG_QUOTA
+	} else if (m->flags & MOPT_QFMT) {
+		if (sb_any_quota_loaded(sb) &&
+		    sbi->s_jquota_fmt != m->mount_opt) {
+			ext4_msg(sb, KERN_ERR, "Cannot change journaled "
+				 "quota options when quota turned on");
+			return -1;
+		}
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					       EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Cannot set journaled quota options "
+				 "when QUOTA feature is enabled");
+			return -1;
+		}
+		sbi->s_jquota_fmt = m->mount_opt;
+#endif
+	} else {
+		if (!args->from)
+			arg = 1;
+		if (m->flags & MOPT_CLEAR)
+			arg = !arg;
+		else if (unlikely(!(m->flags & MOPT_SET))) {
+			ext4_msg(sb, KERN_WARNING,
+				 "buggy handling of option %s", opt);
+			WARN_ON(1);
+			return -1;
+		}
+		if (arg != 0)
+			sbi->s_mount_opt |= m->mount_opt;
+		else
+			sbi->s_mount_opt &= ~m->mount_opt;
 	}
-	ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
-		 "or missing value", opt);
-	return -1;
+	return 1;
 }
 
 static int parse_options(char *options, struct super_block *sb,
@@ -1667,6 +1628,12 @@ static int parse_options(char *options, struct super_block *sb,
 			return 0;
 	}
 #ifdef CONFIG_QUOTA
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+	    (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
+		ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
+			 "feature is enabled");
+		return 0;
+	}
 	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
 		if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
 			clear_opt(sb, USRQUOTA);
@@ -1854,7 +1821,7 @@ static int options_seq_show(struct seq_file *seq, void *offset)
 
 static int options_open_fs(struct inode *inode, struct file *file)
 {
-	return single_open(file, options_seq_show, PDE(inode)->data);
+	return single_open(file, options_seq_show, PDE_DATA(inode));
 }
 
 static const struct file_operations ext4_seq_options_fops = {
@@ -1959,7 +1926,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_desc *gdp = NULL;
 	ext4_group_t flex_group;
-	unsigned int groups_per_flex = 0;
 	int i, err;
 
 	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
@@ -1967,7 +1933,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
 		sbi->s_log_groups_per_flex = 0;
 		return 1;
 	}
-	groups_per_flex = 1U << sbi->s_log_groups_per_flex;
 
 	err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
 	if (err)
@@ -1979,8 +1944,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
 		flex_group = ext4_flex_group(sbi, i);
 		atomic_add(ext4_free_inodes_count(sb, gdp),
 			   &sbi->s_flex_groups[flex_group].free_inodes);
-		atomic_add(ext4_free_group_clusters(sb, gdp),
-			   &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic64_add(ext4_free_group_clusters(sb, gdp),
+			     &sbi->s_flex_groups[flex_group].free_clusters);
 		atomic_add(ext4_used_dirs_count(sb, gdp),
 			   &sbi->s_flex_groups[flex_group].used_dirs);
 	}
@@ -2000,16 +1965,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
 	if ((sbi->s_es->s_feature_ro_compat &
 	     cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
 		/* Use new metadata_csum algorithm */
-		__u16 old_csum;
+		__le16 save_csum;
 		__u32 csum32;
 
-		old_csum = gdp->bg_checksum;
+		save_csum = gdp->bg_checksum;
 		gdp->bg_checksum = 0;
 		csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
 				     sizeof(le_group));
 		csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
 				     sbi->s_desc_size);
-		gdp->bg_checksum = old_csum;
+		gdp->bg_checksum = save_csum;
 
 		crc = csum32 & 0xFFFF;
 		goto out;
@@ -2215,19 +2180,22 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
 		dquot_initialize(inode);
 		if (inode->i_nlink) {
-			ext4_msg(sb, KERN_DEBUG,
-				"%s: truncating inode %lu to %lld bytes",
-				__func__, inode->i_ino, inode->i_size);
+			if (test_opt(sb, DEBUG))
+				ext4_msg(sb, KERN_DEBUG,
+					"%s: truncating inode %lu to %lld bytes",
+					__func__, inode->i_ino, inode->i_size);
 			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 				  inode->i_ino, inode->i_size);
 			mutex_lock(&inode->i_mutex);
+			truncate_inode_pages(inode->i_mapping, inode->i_size);
 			ext4_truncate(inode);
 			mutex_unlock(&inode->i_mutex);
 			nr_truncates++;
 		} else {
-			ext4_msg(sb, KERN_DEBUG,
-				"%s: deleting unreferenced inode %lu",
-				__func__, inode->i_ino);
+			if (test_opt(sb, DEBUG))
+				ext4_msg(sb, KERN_DEBUG,
+					"%s: deleting unreferenced inode %lu",
+					__func__, inode->i_ino);
 			jbd_debug(2, "deleting unreferenced inode %lu\n",
 				  inode->i_ino);
 			nr_orphans++;
@@ -2428,20 +2396,21 @@ struct ext4_attr {
 	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
 	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
 			 const char *, size_t);
-	int offset;
+	union {
+		int offset;
+		int deprecated_val;
+	} u;
 };
 
-static int parse_strtoul(const char *buf,
-		unsigned long max, unsigned long *value)
+static int parse_strtoull(const char *buf,
+		unsigned long long max, unsigned long long *value)
 {
-	char *endp;
-
-	*value = simple_strtoul(skip_spaces(buf), &endp, 0);
-	endp = skip_spaces(endp);
-	if (*endp || *value > max)
-		return -EINVAL;
+	int ret;
 
-	return 0;
+	ret = kstrtoull(skip_spaces(buf), 0, value);
+	if (!ret && *value > max)
+		ret = -EINVAL;
+	return ret;
 }
 
 static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
@@ -2483,11 +2452,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 					  const char *buf, size_t count)
 {
 	unsigned long t;
+	int ret;
 
-	if (parse_strtoul(buf, 0x40000000, &t))
-		return -EINVAL;
+	ret = kstrtoul(skip_spaces(buf), 0, &t);
+	if (ret)
+		return ret;
 
-	if (t && !is_power_of_2(t))
+	if (t && (!is_power_of_2(t) || t > 0x40000000))
 		return -EINVAL;
 
 	sbi->s_inode_readahead_blks = t;
@@ -2497,7 +2468,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 static ssize_t sbi_ui_show(struct ext4_attr *a,
 			   struct ext4_sb_info *sbi, char *buf)
 {
-	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
 
 	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
 }
@@ -2506,15 +2477,38 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
 			    struct ext4_sb_info *sbi,
 			    const char *buf, size_t count)
 {
-	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
 	unsigned long t;
+	int ret;
 
-	if (parse_strtoul(buf, 0xffffffff, &t))
-		return -EINVAL;
+	ret = kstrtoul(skip_spaces(buf), 0, &t);
+	if (ret)
+		return ret;
 	*ui = t;
 	return count;
 }
 
+static ssize_t reserved_clusters_show(struct ext4_attr *a,
+				  struct ext4_sb_info *sbi, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(unsigned long long) atomic64_read(&sbi->s_resv_clusters));
+}
+
+static ssize_t reserved_clusters_store(struct ext4_attr *a,
+				   struct ext4_sb_info *sbi,
+				   const char *buf, size_t count)
+{
+	unsigned long long val;
+	int ret;
+
+	if (parse_strtoull(buf, -1ULL, &val))
+		return -EINVAL;
+	ret = ext4_reserve_clusters(sbi, val);
+
+	return ret ? ret : count;
+}
+
 static ssize_t trigger_test_error(struct ext4_attr *a,
 				  struct ext4_sb_info *sbi,
 				  const char *buf, size_t count)
@@ -2532,12 +2526,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a,
 	return count;
 }
 
+static ssize_t sbi_deprecated_show(struct ext4_attr *a,
+				   struct ext4_sb_info *sbi, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
+}
+
 #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
 static struct ext4_attr ext4_attr_##_name = {			\
 	.attr = {.name = __stringify(_name), .mode = _mode },	\
 	.show	= _show,					\
 	.store	= _store,					\
-	.offset = offsetof(struct ext4_sb_info, _elname),	\
+	.u = {							\
+		.offset = offsetof(struct ext4_sb_info, _elname),\
+	},							\
 }
 #define EXT4_ATTR(name, mode, show, store) \
 static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
@@ -2548,10 +2550,19 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 #define EXT4_RW_ATTR_SBI_UI(name, elname)	\
 	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
 #define ATTR_LIST(name) &ext4_attr_##name.attr
+#define EXT4_DEPRECATED_ATTR(_name, _val)	\
+static struct ext4_attr ext4_attr_##_name = {			\
+	.attr = {.name = __stringify(_name), .mode = 0444 },	\
+	.show	= sbi_deprecated_show,				\
+	.u = {							\
+		.deprecated_val = _val,				\
+	},							\
+}
 
 EXT4_RO_ATTR(delayed_allocation_blocks);
 EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RW_ATTR(reserved_clusters);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
 		 inode_readahead_blks_store, s_inode_readahead_blks);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2561,7 +2572,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
 EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
 EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
 
@@ -2569,6 +2580,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(delayed_allocation_blocks),
 	ATTR_LIST(session_write_kbytes),
 	ATTR_LIST(lifetime_write_kbytes),
+	ATTR_LIST(reserved_clusters),
 	ATTR_LIST(inode_readahead_blks),
 	ATTR_LIST(inode_goal),
 	ATTR_LIST(mb_stats),
@@ -2776,7 +2788,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
 			break;
 	}
 
-	if (group == ngroups)
+	if (group >= ngroups)
 		ret = 1;
 
 	if (!ret) {
@@ -3016,33 +3028,34 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
 	return elr;
 }
 
-static int ext4_register_li_request(struct super_block *sb,
-				    ext4_group_t first_not_zeroed)
+int ext4_register_li_request(struct super_block *sb,
+			     ext4_group_t first_not_zeroed)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_li_request *elr;
+	struct ext4_li_request *elr = NULL;
 	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
 	int ret = 0;
 
+	mutex_lock(&ext4_li_mtx);
 	if (sbi->s_li_request != NULL) {
 		/*
 		 * Reset timeout so it can be computed again, because
 		 * s_li_wait_mult might have changed.
 		 */
 		sbi->s_li_request->lr_timeout = 0;
-		return 0;
+		goto out;
 	}
 
 	if (first_not_zeroed == ngroups ||
 	    (sb->s_flags & MS_RDONLY) ||
 	    !test_opt(sb, INIT_INODE_TABLE))
-		return 0;
+		goto out;
 
 	elr = ext4_li_request_new(sb, first_not_zeroed);
-	if (!elr)
-		return -ENOMEM;
-
-	mutex_lock(&ext4_li_mtx);
+	if (!elr) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	if (NULL == ext4_li_info) {
 		ret = ext4_li_info_new();
@@ -3235,7 +3248,7 @@ int ext4_calculate_overhead(struct super_block *sb)
 	}
 	/* Add the journal blocks as well */
 	if (sbi->s_journal)
-		overhead += EXT4_B2C(sbi, sbi->s_journal->j_maxlen);
+		overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
 
 	sbi->s_overhead = overhead;
 	smp_wmb();
@@ -3243,6 +3256,40 @@ int ext4_calculate_overhead(struct super_block *sb)
 	return 0;
 }
 
+
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
+{
+	ext4_fsblk_t resv_clusters;
+
+	/*
+	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
+	 * This should cover the situations where we can not afford to run
+	 * out of space like for example punch hole, or converting
+	 * uninitialized extents in delalloc path. In most cases such
+	 * allocation would require 1, or 2 blocks, higher numbers are
+	 * very rare.
+	 */
+	resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+
+	do_div(resv_clusters, 50);
+	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
+
+	return resv_clusters;
+}
+
+
+static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
+{
+	ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
+				sbi->s_cluster_bits;
+
+	if (count >= clusters)
+		return -EINVAL;
+
+	atomic64_set(&sbi->s_resv_clusters, count);
+	return 0;
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 {
 	char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -3379,7 +3426,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	set_opt(sb, POSIX_ACL);
 #endif
-	set_opt(sb, MBLK_IO_SUBMIT);
 	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
 		set_opt(sb, JOURNAL_DATA);
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
@@ -3578,6 +3624,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
 	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
 
+	/* Do we have standard group size of blocksize * 8 blocks ? */
+	if (sbi->s_blocks_per_group == blocksize << 3)
+		set_opt2(sb, STD_GROUP_SIZE);
+
 	for (i = 0; i < 4; i++)
 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 	sbi->s_def_hash_version = es->s_def_hash_version;
@@ -3750,6 +3800,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_err_report.function = print_daily_error_info;
 	sbi->s_err_report.data = (unsigned long) sb;
 
+	/* Register extent status tree shrinker */
+	ext4_es_register_shrinker(sbi);
+
 	err = percpu_counter_init(&sbi->s_freeclusters_counter,
 			ext4_count_free_clusters(sb));
 	if (!err) {
@@ -3763,13 +3816,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (!err) {
 		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
 	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0);
+	}
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "insufficient memory");
 		goto failed_mount3;
 	}
 
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
-	sbi->s_max_writeback_mb_bump = 128;
 	sbi->s_extent_max_zeroout_kb = 32;
 
 	/*
@@ -3783,13 +3838,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_export_op = &ext4_export_ops;
 	sb->s_xattr = ext4_xattr_handlers;
 #ifdef CONFIG_QUOTA
-	sb->s_qcop = &ext4_qctl_operations;
 	sb->dq_op = &ext4_quota_operations;
-
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
-		/* Use qctl operations for hidden quota files. */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
 		sb->s_qcop = &ext4_qctl_sysfile_operations;
-	}
+	else
+		sb->s_qcop = &ext4_qctl_operations;
 #endif
 	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
 
@@ -3899,12 +3952,20 @@ no_journal:
 	 * The maximum number of concurrent works can be high and
 	 * concurrency isn't really necessary.  Limit it to 1.
 	 */
-	EXT4_SB(sb)->dio_unwritten_wq =
-		alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
-	if (!EXT4_SB(sb)->dio_unwritten_wq) {
-		printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+	EXT4_SB(sb)->rsv_conversion_wq =
+		alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!EXT4_SB(sb)->rsv_conversion_wq) {
+		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
 		ret = -ENOMEM;
-		goto failed_mount_wq;
+		goto failed_mount4;
+	}
+
+	EXT4_SB(sb)->unrsv_conversion_wq =
+		alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!EXT4_SB(sb)->unrsv_conversion_wq) {
+		printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
+		ret = -ENOMEM;
+		goto failed_mount4;
 	}
 
 	/*
@@ -3959,6 +4020,13 @@ no_journal:
 			 "available");
 	}
 
+	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
+			 "reserved pool", ext4_calculate_resv_clusters(sbi));
+		goto failed_mount4a;
+	}
+
 	err = ext4_setup_system_zone(sb);
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "failed to initialize system "
@@ -3985,6 +4053,16 @@ no_journal:
 	if (err)
 		goto failed_mount7;
 
+#ifdef CONFIG_QUOTA
+	/* Enable quota usage during mount. */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+	    !(sb->s_flags & MS_RDONLY)) {
+		err = ext4_enable_quotas(sb);
+		if (err)
+			goto failed_mount8;
+	}
+#endif  /* CONFIG_QUOTA */
+
 	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
 	ext4_orphan_cleanup(sb, es);
 	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
@@ -4002,16 +4080,6 @@ no_journal:
 	} else
 		descr = "out journal";
 
-#ifdef CONFIG_QUOTA
-	/* Enable quota usage during mount. */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
-	    !(sb->s_flags & MS_RDONLY)) {
-		err = ext4_enable_quotas(sb);
-		if (err)
-			goto failed_mount7;
-	}
-#endif  /* CONFIG_QUOTA */
-
 	if (test_opt(sb, DISCARD)) {
 		struct request_queue *q = bdev_get_queue(sb->s_bdev);
 		if (!blk_queue_discard(q))
@@ -4035,6 +4103,10 @@ cantfind_ext4:
 		ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
 	goto failed_mount;
 
+#ifdef CONFIG_QUOTA
+failed_mount8:
+	kobject_del(&sbi->s_kobj);
+#endif
 failed_mount7:
 	ext4_unregister_li_request(sb);
 failed_mount6:
@@ -4047,13 +4119,17 @@ failed_mount4a:
 	sb->s_root = NULL;
 failed_mount4:
 	ext4_msg(sb, KERN_ERR, "mount failed");
-	destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+	if (EXT4_SB(sb)->rsv_conversion_wq)
+		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+	if (EXT4_SB(sb)->unrsv_conversion_wq)
+		destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
 failed_mount_wq:
 	if (sbi->s_journal) {
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
 failed_mount3:
+	ext4_es_unregister_shrinker(sbi);
 	del_timer(&sbi->s_err_report);
 	if (sbi->s_flex_groups)
 		ext4_kvfree(sbi->s_flex_groups);
@@ -4061,6 +4137,7 @@ failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+	percpu_counter_destroy(&sbi->s_extent_cache_cnt);
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
@@ -4220,7 +4297,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 		goto out_bdev;
 	}
 	journal->j_private = sb;
-	ll_rw_block(READ, 1, &journal->j_sb_buffer);
+	ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
 	wait_on_buffer(journal->j_sb_buffer);
 	if (!buffer_uptodate(journal->j_sb_buffer)) {
 		ext4_msg(sb, KERN_ERR, "I/O error on journal device");
@@ -4476,35 +4553,64 @@ static void ext4_clear_journal_err(struct super_block *sb,
 int ext4_force_commit(struct super_block *sb)
 {
 	journal_t *journal;
-	int ret = 0;
 
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
 	journal = EXT4_SB(sb)->s_journal;
-	if (journal)
-		ret = ext4_journal_force_commit(journal);
-
-	return ret;
+	return ext4_journal_force_commit(journal);
 }
 
 static int ext4_sync_fs(struct super_block *sb, int wait)
 {
 	int ret = 0;
 	tid_t target;
+	bool needs_barrier = false;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	trace_ext4_sync_fs(sb, wait);
-	flush_workqueue(sbi->dio_unwritten_wq);
+	flush_workqueue(sbi->rsv_conversion_wq);
+	flush_workqueue(sbi->unrsv_conversion_wq);
 	/*
 	 * Writeback quota in non-journalled quota case - journalled quota has
 	 * no dirty dquots
 	 */
 	dquot_writeback_dquots(sb, -1);
+	/*
+	 * Data writeback is possible w/o journal transaction, so barrier must
+	 * being sent at the end of the function. But we can skip it if
+	 * transaction_commit will do it for us.
+	 */
+	target = jbd2_get_latest_transaction(sbi->s_journal);
+	if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
+	    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
+		needs_barrier = true;
+
 	if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
 		if (wait)
-			jbd2_log_wait_commit(sbi->s_journal, target);
+			ret = jbd2_log_wait_commit(sbi->s_journal, target);
 	}
+	if (needs_barrier) {
+		int err;
+		err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+		if (!ret)
+			ret = err;
+	}
+
+	return ret;
+}
+
+static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
+{
+	int ret = 0;
+
+	trace_ext4_sync_fs(sb, wait);
+	flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
+	flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
+	dquot_writeback_dquots(sb, -1);
+	if (wait && test_opt(sb, BARRIER))
+		ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+
 	return ret;
 }
 
@@ -4588,7 +4694,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
 	int err = 0;
 #ifdef CONFIG_QUOTA
-	int i;
+	int i, j;
 #endif
 	char *orig_data = kstrdup(data, GFP_KERNEL);
 
@@ -4604,7 +4710,17 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #ifdef CONFIG_QUOTA
 	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
 	for (i = 0; i < MAXQUOTAS; i++)
-		old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+		if (sbi->s_qf_names[i]) {
+			old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+							 GFP_KERNEL);
+			if (!old_opts.s_qf_names[i]) {
+				for (j = 0; j < i; j++)
+					kfree(old_opts.s_qf_names[j]);
+				kfree(orig_data);
+				return -ENOMEM;
+			}
+		} else
+			old_opts.s_qf_names[i] = NULL;
 #endif
 	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
 		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
@@ -4737,9 +4853,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #ifdef CONFIG_QUOTA
 	/* Release old quota file names */
 	for (i = 0; i < MAXQUOTAS; i++)
-		if (old_opts.s_qf_names[i] &&
-		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
-			kfree(old_opts.s_qf_names[i]);
+		kfree(old_opts.s_qf_names[i]);
 	if (enable_quota) {
 		if (sb_any_quota_suspended(sb))
 			dquot_resume(sb, -1);
@@ -4768,9 +4882,7 @@ restore_opts:
 #ifdef CONFIG_QUOTA
 	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
 	for (i = 0; i < MAXQUOTAS; i++) {
-		if (sbi->s_qf_names[i] &&
-		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
-			kfree(sbi->s_qf_names[i]);
+		kfree(sbi->s_qf_names[i]);
 		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
 	}
 #endif
@@ -4783,9 +4895,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct super_block *sb = dentry->d_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
-	ext4_fsblk_t overhead = 0;
+	ext4_fsblk_t overhead = 0, resv_blocks;
 	u64 fsid;
 	s64 bfree;
+	resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
 
 	if (!test_opt(sb, MINIX_DF))
 		overhead = sbi->s_overhead;
@@ -4797,8 +4910,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 		percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
 	/* prevent underflow in case that few free space is available */
 	buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
-	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
-	if (buf->f_bfree < ext4_r_blocks_count(es))
+	buf->f_bavail = buf->f_bfree -
+			(ext4_r_blocks_count(es) + resv_blocks);
+	if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
 		buf->f_bavail = 0;
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
 	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
@@ -4835,7 +4949,7 @@ static int ext4_write_dquot(struct dquot *dquot)
 	struct inode *inode;
 
 	inode = dquot_to_inode(dquot);
-	handle = ext4_journal_start(inode,
+	handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
 				    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -4851,7 +4965,7 @@ static int ext4_acquire_dquot(struct dquot *dquot)
 	int ret, err;
 	handle_t *handle;
 
-	handle = ext4_journal_start(dquot_to_inode(dquot),
+	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
 				    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -4867,7 +4981,7 @@ static int ext4_release_dquot(struct dquot *dquot)
 	int ret, err;
 	handle_t *handle;
 
-	handle = ext4_journal_start(dquot_to_inode(dquot),
+	handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
 				    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
 	if (IS_ERR(handle)) {
 		/* Release dquot anyway to avoid endless cycle in dqput() */
@@ -4883,9 +4997,12 @@ static int ext4_release_dquot(struct dquot *dquot)
 
 static int ext4_mark_dquot_dirty(struct dquot *dquot)
 {
+	struct super_block *sb = dquot->dq_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
 	/* Are we journaling quotas? */
-	if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
-	    EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
+	    sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
 		dquot_mark_dquot_dirty(dquot);
 		return ext4_write_dquot(dquot);
 	} else {
@@ -4899,7 +5016,7 @@ static int ext4_write_info(struct super_block *sb, int type)
 	handle_t *handle;
 
 	/* Data block + inode block */
-	handle = ext4_journal_start(sb->s_root->d_inode, 2);
+	handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	ret = dquot_commit_info(sb, type);
@@ -4983,6 +5100,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
 		return PTR_ERR(qf_inode);
 	}
 
+	/* Don't account quota for quota files to avoid recursion */
+	qf_inode->i_flags |= S_NOQUOTA;
 	err = dquot_enable(qf_inode, type, format_id, flags);
 	iput(qf_inode);
 
@@ -5005,9 +5124,9 @@ static int ext4_enable_quotas(struct super_block *sb)
 						DQUOT_USAGE_ENABLED);
 			if (err) {
 				ext4_warning(sb,
-					"Failed to enable quota (type=%d) "
-					"tracking. Please run e2fsck to fix.",
-					type);
+					"Failed to enable quota tracking "
+					"(type=%d, err=%d). Please run "
+					"e2fsck to fix.", type, err);
 				return err;
 			}
 		}
@@ -5045,7 +5164,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
 
 	/* Update modification times of quota files when userspace can
 	 * start looking at them */
-	handle = ext4_journal_start(inode, 1);
+	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
 	if (IS_ERR(handle))
 		goto out;
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -5194,7 +5313,6 @@ static inline int ext2_feature_set_ok(struct super_block *sb)
 		return 0;
 	return 1;
 }
-MODULE_ALIAS("ext2");
 #else
 static inline void register_as_ext2(void) { }
 static inline void unregister_as_ext2(void) { }
@@ -5227,7 +5345,6 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
 		return 0;
 	return 1;
 }
-MODULE_ALIAS("ext3");
 #else
 static inline void register_as_ext3(void) { }
 static inline void unregister_as_ext3(void) { }
@@ -5241,6 +5358,7 @@ static struct file_system_type ext4_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("ext4");
 
 static int __init ext4_init_feat_adverts(void)
 {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a91ebc2b66..c081e34f717 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
 				    struct ext4_xattr_header *hdr)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	__u32 csum, old;
+	__u32 csum;
+	__le32 save_csum;
+	__le64 dsk_block_nr = cpu_to_le64(block_nr);
 
-	old = hdr->h_checksum;
+	save_csum = hdr->h_checksum;
 	hdr->h_checksum = 0;
-	block_nr = cpu_to_le64(block_nr);
-	csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr,
-			   sizeof(block_nr));
+	csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
+			   sizeof(dsk_block_nr));
 	csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
 			   EXT4_BLOCK_SIZE(inode->i_sb));
 
-	hdr->h_checksum = old;
+	hdr->h_checksum = save_csum;
 	return cpu_to_le32(csum);
 }
 
@@ -549,7 +550,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		error = ext4_handle_dirty_xattr_block(handle, inode, bh);
 		if (IS_SYNC(inode))
 			ext4_handle_sync(handle);
-		dquot_free_block(inode, 1);
+		dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1));
 		ea_bdebug(bh, "refcount now=%d; releasing",
 			  le32_to_cpu(BHDR(bh)->h_refcount));
 	}
@@ -832,7 +833,8 @@ inserted:
 			else {
 				/* The old block is released after updating
 				   the inode. */
-				error = dquot_alloc_block(inode, 1);
+				error = dquot_alloc_block(inode,
+						EXT4_C2B(EXT4_SB(sb), 1));
 				if (error)
 					goto cleanup;
 				error = ext4_journal_get_write_access(handle,
@@ -886,17 +888,18 @@ inserted:
 				  (unsigned long long)block);
 
 			new_bh = sb_getblk(sb, block);
-			if (!new_bh) {
+			if (unlikely(!new_bh)) {
+				error = -ENOMEM;
 getblk_failed:
 				ext4_free_blocks(handle, inode, NULL, block, 1,
 						 EXT4_FREE_BLOCKS_METADATA);
-				error = -EIO;
 				goto cleanup;
 			}
 			lock_buffer(new_bh);
 			error = ext4_journal_get_create_access(handle, new_bh);
 			if (error) {
 				unlock_buffer(new_bh);
+				error = -EIO;
 				goto getblk_failed;
 			}
 			memcpy(new_bh->b_data, s->base, new_bh->b_size);
@@ -928,7 +931,7 @@ cleanup:
 	return error;
 
 cleanup_dquot:
-	dquot_free_block(inode, 1);
+	dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1));
 	goto cleanup;
 
 bad_block:
@@ -1164,17 +1167,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
 {
 	handle_t *handle;
 	int error, retries = 0;
-	int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
+	int credits = ext4_jbd2_credits_xattr(inode);
 
 retry:
-	/*
-	 * In case of inline data, we may push out the data to a block,
-	 * So reserve the journal space first.
-	 */
-	if (ext4_has_inline_data(inode))
-		credits += ext4_writepage_trans_blocks(inode) + 1;
-
-	handle = ext4_journal_start(inode, credits);
+	handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
 	if (IS_ERR(handle)) {
 		error = PTR_ERR(handle);
 	} else {
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 69eda787a96..c767dbdd7fc 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -22,6 +22,7 @@
 #define	EXT4_XATTR_INDEX_LUSTRE			5
 #define EXT4_XATTR_INDEX_SECURITY	        6
 #define EXT4_XATTR_INDEX_SYSTEM			7
+#define EXT4_XATTR_INDEX_RICHACL		8
 
 struct ext4_xattr_header {
 	__le32	h_magic;	/* magic number for identification */
@@ -125,74 +126,6 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
 				       struct ext4_xattr_info *i,
 				       struct ext4_xattr_ibody_find *is);
 
-extern int ext4_has_inline_data(struct inode *inode);
-extern int ext4_get_inline_size(struct inode *inode);
-extern int ext4_get_max_inline_size(struct inode *inode);
-extern int ext4_find_inline_data_nolock(struct inode *inode);
-extern void ext4_write_inline_data(struct inode *inode,
-				   struct ext4_iloc *iloc,
-				   void *buffer, loff_t pos,
-				   unsigned int len);
-extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
-				    unsigned int len);
-extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
-				 unsigned int len);
-extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
-
-extern int ext4_readpage_inline(struct inode *inode, struct page *page);
-extern int ext4_try_to_write_inline_data(struct address_space *mapping,
-					 struct inode *inode,
-					 loff_t pos, unsigned len,
-					 unsigned flags,
-					 struct page **pagep);
-extern int ext4_write_inline_data_end(struct inode *inode,
-				      loff_t pos, unsigned len,
-				      unsigned copied,
-				      struct page *page);
-extern struct buffer_head *
-ext4_journalled_write_inline_data(struct inode *inode,
-				  unsigned len,
-				  struct page *page);
-extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
-					   struct inode *inode,
-					   loff_t pos, unsigned len,
-					   unsigned flags,
-					   struct page **pagep,
-					   void **fsdata);
-extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
-					 unsigned len, unsigned copied,
-					 struct page *page);
-extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
-				     struct inode *inode);
-extern int ext4_try_create_inline_dir(handle_t *handle,
-				      struct inode *parent,
-				      struct inode *inode);
-extern int ext4_read_inline_dir(struct file *filp,
-				void *dirent, filldir_t filldir,
-				int *has_inline_data);
-extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
-					const struct qstr *d_name,
-					struct ext4_dir_entry_2 **res_dir,
-					int *has_inline_data);
-extern int ext4_delete_inline_entry(handle_t *handle,
-				    struct inode *dir,
-				    struct ext4_dir_entry_2 *de_del,
-				    struct buffer_head *bh,
-				    int *has_inline_data);
-extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
-extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
-					struct ext4_dir_entry_2 **parent_de,
-					int *retval);
-extern int ext4_inline_data_fiemap(struct inode *inode,
-				   struct fiemap_extent_info *fieinfo,
-				   int *has_inline);
-extern int ext4_try_to_evict_inline_data(handle_t *handle,
-					 struct inode *inode,
-					 int needed);
-extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
-
-extern int ext4_convert_inline_data(struct inode *inode);
-
 #ifdef CONFIG_EXT4_FS_SECURITY
 extern int ext4_init_security(handle_t *handle, struct inode *inode,
 			      struct inode *dir, const struct qstr *qstr);
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index fd27e7e6326..e06e0995e00 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -51,3 +51,15 @@ config F2FS_FS_POSIX_ACL
 	  Linux website <http://acl.bestbits.at/>.
 
 	  If you don't know what Access Control Lists are, say N
+
+config F2FS_FS_SECURITY
+	bool "F2FS Security Labels"
+	depends on F2FS_FS_XATTR
+	help
+	  Security labels provide an access control facility to support Linux
+	  Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO
+	  Linux. This option enables an extended attribute handler for file
+	  security labels in the f2fs filesystem, so that it requires enabling
+	  the extended attribute support in advance.
+
+	  If you are not using a security module, say N.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 137af4255da..b7826ec1b47 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -250,7 +250,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		}
 	}
 
-	error = f2fs_setxattr(inode, name_index, "", value, size);
+	error = f2fs_setxattr(inode, name_index, "", value, size, NULL);
 
 	kfree(value);
 	if (!error)
@@ -299,7 +299,7 @@ int f2fs_acl_chmod(struct inode *inode)
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct posix_acl *acl;
 	int error;
-	mode_t mode = get_inode_mode(inode);
+	umode_t mode = get_inode_mode(inode);
 
 	if (!test_opt(sbi, POSIX_ACL))
 		return 0;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index ff3c8439af8..66a6b85a51d 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -20,6 +20,7 @@
 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
+#include <trace/events/f2fs.h>
 
 static struct kmem_cache *orphan_entry_slab;
 static struct kmem_cache *inode_entry_slab;
@@ -57,13 +58,19 @@ repeat:
 		cond_resched();
 		goto repeat;
 	}
-	if (f2fs_readpage(sbi, page, index, READ_SYNC)) {
+	if (PageUptodate(page))
+		goto out;
+
+	if (f2fs_readpage(sbi, page, index, READ_SYNC))
+		goto repeat;
+
+	lock_page(page);
+	if (page->mapping != mapping) {
 		f2fs_put_page(page, 1);
 		goto repeat;
 	}
+out:
 	mark_page_accessed(page);
-
-	/* We do not allow returning an errorneous page */
 	return page;
 }
 
@@ -72,22 +79,22 @@ static int f2fs_write_meta_page(struct page *page,
 {
 	struct inode *inode = page->mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	int err;
 
-	wait_on_page_writeback(page);
-
-	err = write_meta_page(sbi, page, wbc);
-	if (err) {
+	/* Should not write any meta pages, if any IO error was occurred */
+	if (wbc->for_reclaim ||
+			is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) {
+		dec_page_count(sbi, F2FS_DIRTY_META);
 		wbc->pages_skipped++;
 		set_page_dirty(page);
+		return AOP_WRITEPAGE_ACTIVATE;
 	}
 
-	dec_page_count(sbi, F2FS_DIRTY_META);
+	wait_on_page_writeback(page);
 
-	/* In this case, we should not unlock this page */
-	if (err != AOP_WRITEPAGE_ACTIVATE)
-		unlock_page(page);
-	return err;
+	write_meta_page(sbi, page);
+	dec_page_count(sbi, F2FS_DIRTY_META);
+	unlock_page(page);
+	return 0;
 }
 
 static int f2fs_write_meta_pages(struct address_space *mapping,
@@ -138,7 +145,10 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
 			BUG_ON(page->mapping != mapping);
 			BUG_ON(!PageDirty(page));
 			clear_page_dirty_for_io(page);
-			f2fs_write_meta_page(page, &wbc);
+			if (f2fs_write_meta_page(page, &wbc)) {
+				unlock_page(page);
+				break;
+			}
 			if (nwritten++ >= nr_to_write)
 				break;
 		}
@@ -161,7 +171,6 @@ static int f2fs_set_meta_page_dirty(struct page *page)
 	if (!PageDirty(page)) {
 		__set_page_dirty_nobuffers(page);
 		inc_page_count(sbi, F2FS_DIRTY_META);
-		F2FS_SET_SB_DIRT(sbi);
 		return 1;
 	}
 	return 0;
@@ -216,19 +225,11 @@ retry:
 	new->ino = ino;
 
 	/* add new_oentry into list which is sorted by inode number */
-	if (orphan) {
-		struct orphan_inode_entry *prev;
-
-		/* get previous entry */
-		prev = list_entry(orphan->list.prev, typeof(*prev), list);
-		if (&prev->list != head)
-			/* insert new orphan inode entry */
-			list_add(&new->list, &prev->list);
-		else
-			list_add(&new->list, head);
-	} else {
+	if (orphan)
+		list_add(&new->list, this->prev);
+	else
 		list_add_tail(&new->list, head);
-	}
+
 	sbi->n_orphans++;
 out:
 	mutex_unlock(&sbi->orphan_inode_mutex);
@@ -356,8 +357,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
 	unsigned long blk_size = sbi->blocksize;
 	struct f2fs_checkpoint *cp_block;
 	unsigned long long cur_version = 0, pre_version = 0;
-	unsigned int crc = 0;
 	size_t crc_offset;
+	__u32 crc = 0;
 
 	/* Read the 1st cp block in this CP pack */
 	cp_page_1 = get_meta_page(sbi, cp_addr);
@@ -368,7 +369,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
 	if (crc_offset >= blk_size)
 		goto invalid_cp1;
 
-	crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+	crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
 	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
 		goto invalid_cp1;
 
@@ -383,7 +384,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
 	if (crc_offset >= blk_size)
 		goto invalid_cp2;
 
-	crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+	crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
 	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
 		goto invalid_cp2;
 
@@ -449,13 +450,30 @@ fail_no_cp:
 	return -EINVAL;
 }
 
-void set_dirty_dir_page(struct inode *inode, struct page *page)
+static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct list_head *head = &sbi->dir_inode_list;
-	struct dir_inode_entry *new;
 	struct list_head *this;
 
+	list_for_each(this, head) {
+		struct dir_inode_entry *entry;
+		entry = list_entry(this, struct dir_inode_entry, list);
+		if (entry->inode == inode)
+			return -EEXIST;
+	}
+	list_add_tail(&new->list, head);
+#ifdef CONFIG_F2FS_STAT_FS
+	sbi->n_dirty_dirs++;
+#endif
+	return 0;
+}
+
+void set_dirty_dir_page(struct inode *inode, struct page *page)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct dir_inode_entry *new;
+
 	if (!S_ISDIR(inode->i_mode))
 		return;
 retry:
@@ -468,23 +486,31 @@ retry:
 	INIT_LIST_HEAD(&new->list);
 
 	spin_lock(&sbi->dir_inode_lock);
-	list_for_each(this, head) {
-		struct dir_inode_entry *entry;
-		entry = list_entry(this, struct dir_inode_entry, list);
-		if (entry->inode == inode) {
-			kmem_cache_free(inode_entry_slab, new);
-			goto out;
-		}
-	}
-	list_add_tail(&new->list, head);
-	sbi->n_dirty_dirs++;
+	if (__add_dirty_inode(inode, new))
+		kmem_cache_free(inode_entry_slab, new);
 
-	BUG_ON(!S_ISDIR(inode->i_mode));
-out:
 	inc_page_count(sbi, F2FS_DIRTY_DENTS);
 	inode_inc_dirty_dents(inode);
 	SetPagePrivate(page);
+	spin_unlock(&sbi->dir_inode_lock);
+}
 
+void add_dirty_dir_inode(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct dir_inode_entry *new;
+retry:
+	new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+	if (!new) {
+		cond_resched();
+		goto retry;
+	}
+	new->inode = inode;
+	INIT_LIST_HEAD(&new->list);
+
+	spin_lock(&sbi->dir_inode_lock);
+	if (__add_dirty_inode(inode, new))
+		kmem_cache_free(inode_entry_slab, new);
 	spin_unlock(&sbi->dir_inode_lock);
 }
 
@@ -498,8 +524,10 @@ void remove_dirty_dir_inode(struct inode *inode)
 		return;
 
 	spin_lock(&sbi->dir_inode_lock);
-	if (atomic_read(&F2FS_I(inode)->dirty_dents))
-		goto out;
+	if (atomic_read(&F2FS_I(inode)->dirty_dents)) {
+		spin_unlock(&sbi->dir_inode_lock);
+		return;
+	}
 
 	list_for_each(this, head) {
 		struct dir_inode_entry *entry;
@@ -507,12 +535,38 @@ void remove_dirty_dir_inode(struct inode *inode)
 		if (entry->inode == inode) {
 			list_del(&entry->list);
 			kmem_cache_free(inode_entry_slab, entry);
+#ifdef CONFIG_F2FS_STAT_FS
 			sbi->n_dirty_dirs--;
+#endif
+			break;
+		}
+	}
+	spin_unlock(&sbi->dir_inode_lock);
+
+	/* Only from the recovery routine */
+	if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
+		clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
+		iput(inode);
+	}
+}
+
+struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+	struct list_head *head = &sbi->dir_inode_list;
+	struct list_head *this;
+	struct inode *inode = NULL;
+
+	spin_lock(&sbi->dir_inode_lock);
+	list_for_each(this, head) {
+		struct dir_inode_entry *entry;
+		entry = list_entry(this, struct dir_inode_entry, list);
+		if (entry->inode->i_ino == ino) {
+			inode = entry->inode;
 			break;
 		}
 	}
-out:
 	spin_unlock(&sbi->dir_inode_lock);
+	return inode;
 }
 
 void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
@@ -545,56 +599,46 @@ retry:
 /*
  * Freeze all the FS-operations for checkpoint.
  */
-void block_operations(struct f2fs_sb_info *sbi)
+static void block_operations(struct f2fs_sb_info *sbi)
 {
-	int t;
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = LONG_MAX,
 		.for_reclaim = 0,
 	};
+	struct blk_plug plug;
 
-	/* Stop renaming operation */
-	mutex_lock_op(sbi, RENAME);
-	mutex_lock_op(sbi, DENTRY_OPS);
+	blk_start_plug(&plug);
 
-retry_dents:
-	/* write all the dirty dentry pages */
-	sync_dirty_dir_inodes(sbi);
+retry_flush_dents:
+	mutex_lock_all(sbi);
 
-	mutex_lock_op(sbi, DATA_WRITE);
+	/* write all the dirty dentry pages */
 	if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
-		mutex_unlock_op(sbi, DATA_WRITE);
-		goto retry_dents;
+		mutex_unlock_all(sbi);
+		sync_dirty_dir_inodes(sbi);
+		goto retry_flush_dents;
 	}
 
-	/* block all the operations */
-	for (t = DATA_NEW; t <= NODE_TRUNC; t++)
-		mutex_lock_op(sbi, t);
-
-	mutex_lock(&sbi->write_inode);
-
 	/*
 	 * POR: we should ensure that there is no dirty node pages
 	 * until finishing nat/sit flush.
 	 */
-retry:
-	sync_node_pages(sbi, 0, &wbc);
-
-	mutex_lock_op(sbi, NODE_WRITE);
+retry_flush_nodes:
+	mutex_lock(&sbi->node_write);
 
 	if (get_pages(sbi, F2FS_DIRTY_NODES)) {
-		mutex_unlock_op(sbi, NODE_WRITE);
-		goto retry;
+		mutex_unlock(&sbi->node_write);
+		sync_node_pages(sbi, 0, &wbc);
+		goto retry_flush_nodes;
 	}
-	mutex_unlock(&sbi->write_inode);
+	blk_finish_plug(&plug);
 }
 
 static void unblock_operations(struct f2fs_sb_info *sbi)
 {
-	int t;
-	for (t = NODE_WRITE; t >= RENAME; t--)
-		mutex_unlock_op(sbi, t);
+	mutex_unlock(&sbi->node_write);
+	mutex_unlock_all(sbi);
 }
 
 static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
@@ -604,7 +648,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	block_t start_blk;
 	struct page *cp_page;
 	unsigned int data_sum_blocks, orphan_blocks;
-	unsigned int crc32 = 0;
+	__u32 crc32 = 0;
 	void *kaddr;
 	int i;
 
@@ -673,8 +717,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
 
 	crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
-	*(__le32 *)((unsigned char *)ckpt +
-				le32_to_cpu(ckpt->checksum_offset))
+	*((__le32 *)((unsigned char *)ckpt +
+				le32_to_cpu(ckpt->checksum_offset)))
 				= cpu_to_le32(crc32);
 
 	start_blk = __start_cp_addr(sbi);
@@ -717,27 +761,28 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	sbi->alloc_valid_block_count = 0;
 
 	/* Here, we only have one bio having CP pack */
-	if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))
-		sbi->sb->s_flags |= MS_RDONLY;
-	else
-		sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
+	sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
 
-	clear_prefree_segments(sbi);
-	F2FS_RESET_SB_DIRT(sbi);
+	if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
+		clear_prefree_segments(sbi);
+		F2FS_RESET_SB_DIRT(sbi);
+	}
 }
 
 /*
  * We guarantee that this checkpoint procedure should not fail.
  */
-void write_checkpoint(struct f2fs_sb_info *sbi, bool blocked, bool is_umount)
+void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
 	unsigned long long ckpt_ver;
 
-	if (!blocked) {
-		mutex_lock(&sbi->cp_mutex);
-		block_operations(sbi);
-	}
+	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops");
+
+	mutex_lock(&sbi->cp_mutex);
+	block_operations(sbi);
+
+	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
 
 	f2fs_submit_bio(sbi, DATA, true);
 	f2fs_submit_bio(sbi, NODE, true);
@@ -755,13 +800,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool blocked, bool is_umount)
 	flush_nat_entries(sbi);
 	flush_sit_entries(sbi);
 
-	reset_victim_segmap(sbi);
-
 	/* unlock all the fs_lock[] in do_checkpoint() */
 	do_checkpoint(sbi, is_umount);
 
 	unblock_operations(sbi);
 	mutex_unlock(&sbi->cp_mutex);
+
+	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
 }
 
 void init_orphan_info(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7bd22a20112..035f9a345cd 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -12,6 +12,7 @@
 #include <linux/f2fs_fs.h>
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
+#include <linux/aio.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
@@ -21,6 +22,7 @@
 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
+#include <trace/events/f2fs.h>
 
 /*
  * Lock ordering for the change of data block address:
@@ -54,6 +56,8 @@ int reserve_new_block(struct dnode_of_data *dn)
 	if (!inc_valid_block_count(sbi, dn->inode, 1))
 		return -ENOSPC;
 
+	trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
+
 	__set_data_blkaddr(dn, NEW_ADDR);
 	dn->data_blkaddr = NEW_ADDR;
 	sync_inode_page(dn);
@@ -64,7 +68,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
 					struct buffer_head *bh_result)
 {
 	struct f2fs_inode_info *fi = F2FS_I(inode);
+#ifdef CONFIG_F2FS_STAT_FS
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+#endif
 	pgoff_t start_fofs, end_fofs;
 	block_t start_blkaddr;
 
@@ -74,7 +80,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
 		return 0;
 	}
 
+#ifdef CONFIG_F2FS_STAT_FS
 	sbi->total_hit_ext++;
+#endif
 	start_fofs = fi->ext.fofs;
 	end_fofs = fi->ext.fofs + fi->ext.len - 1;
 	start_blkaddr = fi->ext.blk_addr;
@@ -92,7 +100,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
 		else
 			bh_result->b_size = UINT_MAX;
 
+#ifdef CONFIG_F2FS_STAT_FS
 		sbi->read_hit_ext++;
+#endif
 		read_unlock(&fi->ext.ext_lock);
 		return 1;
 	}
@@ -133,7 +143,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
 		goto end_update;
 	}
 
-	/* Frone merge */
+	/* Front merge */
 	if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
 		fi->ext.fofs--;
 		fi->ext.blk_addr--;
@@ -169,7 +179,7 @@ end_update:
 	return;
 }
 
-struct page *find_data_page(struct inode *inode, pgoff_t index)
+struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
@@ -183,7 +193,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
 	f2fs_put_page(page, 0);
 
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
-	err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
 	if (err)
 		return ERR_PTR(err);
 	f2fs_put_dnode(&dn);
@@ -195,16 +205,24 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
 	if (dn.data_blkaddr == NEW_ADDR)
 		return ERR_PTR(-EINVAL);
 
-	page = grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
-	err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
-	if (err) {
-		f2fs_put_page(page, 1);
-		return ERR_PTR(err);
+	if (PageUptodate(page)) {
+		unlock_page(page);
+		return page;
+	}
+
+	err = f2fs_readpage(sbi, page, dn.data_blkaddr,
+					sync ? READ_SYNC : READA);
+	if (sync) {
+		wait_on_page_locked(page);
+		if (!PageUptodate(page)) {
+			f2fs_put_page(page, 0);
+			return ERR_PTR(-EIO);
+		}
 	}
-	unlock_page(page);
 	return page;
 }
 
@@ -221,18 +239,23 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
 	struct page *page;
 	int err;
 
+repeat:
+	page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
-	err = get_dnode_of_data(&dn, index, RDONLY_NODE);
-	if (err)
+	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+	if (err) {
+		f2fs_put_page(page, 1);
 		return ERR_PTR(err);
+	}
 	f2fs_put_dnode(&dn);
 
-	if (dn.data_blkaddr == NULL_ADDR)
+	if (dn.data_blkaddr == NULL_ADDR) {
+		f2fs_put_page(page, 1);
 		return ERR_PTR(-ENOENT);
-
-	page = grab_cache_page(mapping, index);
-	if (!page)
-		return ERR_PTR(-ENOMEM);
+	}
 
 	if (PageUptodate(page))
 		return page;
@@ -241,9 +264,17 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
 	BUG_ON(dn.data_blkaddr == NULL_ADDR);
 
 	err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
-	if (err) {
-		f2fs_put_page(page, 1);
+	if (err)
 		return ERR_PTR(err);
+
+	lock_page(page);
+	if (!PageUptodate(page)) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(-EIO);
+	}
+	if (page->mapping != mapping) {
+		f2fs_put_page(page, 1);
+		goto repeat;
 	}
 	return page;
 }
@@ -251,9 +282,13 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
 /*
  * Caller ensures that this data page is never allocated.
  * A new zero-filled data page is allocated in the page cache.
+ *
+ * Also, caller should grab and release a mutex by calling mutex_lock_op() and
+ * mutex_unlock_op().
+ * Note that, npage is set only by make_empty_dir.
  */
-struct page *get_new_data_page(struct inode *inode, pgoff_t index,
-						bool new_i_size)
+struct page *get_new_data_page(struct inode *inode,
+		struct page *npage, pgoff_t index, bool new_i_size)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
@@ -261,19 +296,21 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index,
 	struct dnode_of_data dn;
 	int err;
 
-	set_new_dnode(&dn, inode, NULL, NULL, 0);
-	err = get_dnode_of_data(&dn, index, 0);
+	set_new_dnode(&dn, inode, npage, npage, 0);
+	err = get_dnode_of_data(&dn, index, ALLOC_NODE);
 	if (err)
 		return ERR_PTR(err);
 
 	if (dn.data_blkaddr == NULL_ADDR) {
 		if (reserve_new_block(&dn)) {
-			f2fs_put_dnode(&dn);
+			if (!npage)
+				f2fs_put_dnode(&dn);
 			return ERR_PTR(-ENOSPC);
 		}
 	}
-	f2fs_put_dnode(&dn);
-
+	if (!npage)
+		f2fs_put_dnode(&dn);
+repeat:
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
@@ -283,18 +320,27 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index,
 
 	if (dn.data_blkaddr == NEW_ADDR) {
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		SetPageUptodate(page);
 	} else {
 		err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
-		if (err) {
-			f2fs_put_page(page, 1);
+		if (err)
 			return ERR_PTR(err);
+		lock_page(page);
+		if (!PageUptodate(page)) {
+			f2fs_put_page(page, 1);
+			return ERR_PTR(-EIO);
+		}
+		if (page->mapping != mapping) {
+			f2fs_put_page(page, 1);
+			goto repeat;
 		}
 	}
-	SetPageUptodate(page);
 
 	if (new_i_size &&
 		i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
 		i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
+		/* Only the directory inode sets new_i_size */
+		set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
 		mark_inode_dirty_sync(inode);
 	}
 	return page;
@@ -325,21 +371,15 @@ static void read_end_io(struct bio *bio, int err)
 
 /*
  * Fill the locked page with data located in the block address.
- * Read operation is synchronous, and caller must unlock the page.
+ * Return unlocked page.
  */
 int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
 					block_t blk_addr, int type)
 {
 	struct block_device *bdev = sbi->sb->s_bdev;
-	bool sync = (type == READ_SYNC);
 	struct bio *bio;
 
-	/* This page can be already read by other threads */
-	if (PageUptodate(page)) {
-		if (!sync)
-			unlock_page(page);
-		return 0;
-	}
+	trace_f2fs_readpage(page, blk_addr, type);
 
 	down_read(&sbi->bio_sem);
 
@@ -354,18 +394,12 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
 		kfree(bio->bi_private);
 		bio_put(bio);
 		up_read(&sbi->bio_sem);
+		f2fs_put_page(page, 1);
 		return -EFAULT;
 	}
 
 	submit_bio(type, bio);
 	up_read(&sbi->bio_sem);
-
-	/* wait for read completion if sync */
-	if (sync) {
-		lock_page(page);
-		if (PageError(page))
-			return -EIO;
-	}
 	return 0;
 }
 
@@ -387,14 +421,18 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock,
 	/* Get the page offset from the block offset(iblock) */
 	pgofs =	(pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
 
-	if (check_extent_cache(inode, pgofs, bh_result))
+	if (check_extent_cache(inode, pgofs, bh_result)) {
+		trace_f2fs_get_data_block(inode, iblock, bh_result, 0);
 		return 0;
+	}
 
 	/* When reading holes, we need its node page */
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
-	err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE);
-	if (err)
+	err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
+	if (err) {
+		trace_f2fs_get_data_block(inode, iblock, bh_result, err);
 		return (err == -ENOENT) ? 0 : err;
+	}
 
 	/* It does not support data allocation */
 	BUG_ON(create);
@@ -419,6 +457,7 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock,
 		bh_result->b_size = (i << blkbits);
 	}
 	f2fs_put_dnode(&dn);
+	trace_f2fs_get_data_block(inode, iblock, bh_result, 0);
 	return 0;
 }
 
@@ -437,13 +476,12 @@ static int f2fs_read_data_pages(struct file *file,
 int do_write_data_page(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	block_t old_blk_addr, new_blk_addr;
 	struct dnode_of_data dn;
 	int err = 0;
 
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
-	err = get_dnode_of_data(&dn, page->index, RDONLY_NODE);
+	err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
 	if (err)
 		return err;
 
@@ -459,16 +497,15 @@ int do_write_data_page(struct page *page)
 	 * If current allocation needs SSR,
 	 * it had better in-place writes for updated data.
 	 */
-	if (old_blk_addr != NEW_ADDR && !is_cold_data(page) &&
-				need_inplace_update(inode)) {
+	if (unlikely(old_blk_addr != NEW_ADDR &&
+			!is_cold_data(page) &&
+			need_inplace_update(inode))) {
 		rewrite_data_page(F2FS_SB(inode->i_sb), page,
 						old_blk_addr);
 	} else {
 		write_data_page(inode, page, &dn,
 				old_blk_addr, &new_blk_addr);
 		update_extent_cache(new_blk_addr, &dn);
-		F2FS_I(inode)->data_version =
-			le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
 	}
 out_writepage:
 	f2fs_put_dnode(&dn);
@@ -484,10 +521,11 @@ static int f2fs_write_data_page(struct page *page,
 	const pgoff_t end_index = ((unsigned long long) i_size)
 							>> PAGE_CACHE_SHIFT;
 	unsigned offset;
+	bool need_balance_fs = false;
 	int err = 0;
 
 	if (page->index < end_index)
-		goto out;
+		goto write;
 
 	/*
 	 * If the offset is out-of-range of file size,
@@ -499,50 +537,46 @@ static int f2fs_write_data_page(struct page *page,
 			dec_page_count(sbi, F2FS_DIRTY_DENTS);
 			inode_dec_dirty_dents(inode);
 		}
-		goto unlock_out;
+		goto out;
 	}
 
 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
-out:
-	if (sbi->por_doing)
-		goto redirty_out;
-
-	if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page))
+write:
+	if (sbi->por_doing) {
+		err = AOP_WRITEPAGE_ACTIVATE;
 		goto redirty_out;
+	}
 
-	mutex_lock_op(sbi, DATA_WRITE);
+	/* Dentry blocks are controlled by checkpoint */
 	if (S_ISDIR(inode->i_mode)) {
 		dec_page_count(sbi, F2FS_DIRTY_DENTS);
 		inode_dec_dirty_dents(inode);
+		err = do_write_data_page(page);
+	} else {
+		int ilock = mutex_lock_op(sbi);
+		err = do_write_data_page(page);
+		mutex_unlock_op(sbi, ilock);
+		need_balance_fs = true;
 	}
-	err = do_write_data_page(page);
-	if (err && err != -ENOENT) {
-		wbc->pages_skipped++;
-		set_page_dirty(page);
-	}
-	mutex_unlock_op(sbi, DATA_WRITE);
+	if (err == -ENOENT)
+		goto out;
+	else if (err)
+		goto redirty_out;
 
 	if (wbc->for_reclaim)
 		f2fs_submit_bio(sbi, DATA, true);
 
-	if (err == -ENOENT)
-		goto unlock_out;
-
 	clear_cold_data(page);
+out:
 	unlock_page(page);
-
-	if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode))
+	if (need_balance_fs)
 		f2fs_balance_fs(sbi);
 	return 0;
 
-unlock_out:
-	unlock_page(page);
-	return (err == -ENOENT) ? 0 : err;
-
 redirty_out:
 	wbc->pages_skipped++;
 	set_page_dirty(page);
-	return AOP_WRITEPAGE_ACTIVATE;
+	return err;
 }
 
 #define MAX_DESIRED_PAGES_WP	4096
@@ -561,19 +595,26 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	bool locked = false;
 	int ret;
 	long excess_nrtw = 0, desired_nrtw;
 
+	/* deal with chardevs and other special file */
+	if (!mapping->a_ops->writepage)
+		return 0;
+
 	if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) {
 		desired_nrtw = MAX_DESIRED_PAGES_WP;
 		excess_nrtw = desired_nrtw - wbc->nr_to_write;
 		wbc->nr_to_write = desired_nrtw;
 	}
 
-	if (!S_ISDIR(inode->i_mode))
+	if (!S_ISDIR(inode->i_mode)) {
 		mutex_lock(&sbi->writepages);
+		locked = true;
+	}
 	ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
-	if (!S_ISDIR(inode->i_mode))
+	if (locked)
 		mutex_unlock(&sbi->writepages);
 	f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL));
 
@@ -593,39 +634,33 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 	pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
 	struct dnode_of_data dn;
 	int err = 0;
+	int ilock;
 
 	/* for nobh_write_end */
 	*fsdata = NULL;
 
 	f2fs_balance_fs(sbi);
-
+repeat:
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
 
-	mutex_lock_op(sbi, DATA_NEW);
+	ilock = mutex_lock_op(sbi);
 
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
-	err = get_dnode_of_data(&dn, index, 0);
-	if (err) {
-		mutex_unlock_op(sbi, DATA_NEW);
-		f2fs_put_page(page, 1);
-		return err;
-	}
+	err = get_dnode_of_data(&dn, index, ALLOC_NODE);
+	if (err)
+		goto err;
 
-	if (dn.data_blkaddr == NULL_ADDR) {
+	if (dn.data_blkaddr == NULL_ADDR)
 		err = reserve_new_block(&dn);
-		if (err) {
-			f2fs_put_dnode(&dn);
-			mutex_unlock_op(sbi, DATA_NEW);
-			f2fs_put_page(page, 1);
-			return err;
-		}
-	}
+
 	f2fs_put_dnode(&dn);
+	if (err)
+		goto err;
 
-	mutex_unlock_op(sbi, DATA_NEW);
+	mutex_unlock_op(sbi, ilock);
 
 	if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
 		return 0;
@@ -636,21 +671,55 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 
 		/* Reading beyond i_size is simple: memset to zero */
 		zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
-		return 0;
+		goto out;
 	}
 
 	if (dn.data_blkaddr == NEW_ADDR) {
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
 	} else {
 		err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
-		if (err) {
-			f2fs_put_page(page, 1);
+		if (err)
 			return err;
+		lock_page(page);
+		if (!PageUptodate(page)) {
+			f2fs_put_page(page, 1);
+			return -EIO;
+		}
+		if (page->mapping != mapping) {
+			f2fs_put_page(page, 1);
+			goto repeat;
 		}
 	}
+out:
 	SetPageUptodate(page);
 	clear_cold_data(page);
 	return 0;
+
+err:
+	mutex_unlock_op(sbi, ilock);
+	f2fs_put_page(page, 1);
+	return err;
+}
+
+static int f2fs_write_end(struct file *file,
+			struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+
+	SetPageUptodate(page);
+	set_page_dirty(page);
+
+	if (pos + copied > i_size_read(inode)) {
+		i_size_write(inode, pos + copied);
+		mark_inode_dirty(inode);
+		update_inode_page(inode);
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+	return copied;
 }
 
 static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
@@ -667,7 +736,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
 						  get_data_block_ro);
 }
 
-static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
+				      unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -681,7 +751,7 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
 static int f2fs_release_data_page(struct page *page, gfp_t wait)
 {
 	ClearPagePrivate(page);
-	return 0;
+	return 1;
 }
 
 static int f2fs_set_data_page_dirty(struct page *page)
@@ -709,7 +779,7 @@ const struct address_space_operations f2fs_dblock_aops = {
 	.writepage	= f2fs_write_data_page,
 	.writepages	= f2fs_write_data_pages,
 	.write_begin	= f2fs_write_begin,
-	.write_end	= nobh_write_end,
+	.write_end	= f2fs_write_end,
 	.set_page_dirty	= f2fs_set_data_page_dirty,
 	.invalidatepage	= f2fs_invalidate_data_page,
 	.releasepage	= f2fs_release_data_page,
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index c8c37307b32..0d6c6aafb23 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -13,7 +13,6 @@
 
 #include <linux/fs.h>
 #include <linux/backing-dev.h>
-#include <linux/proc_fs.h>
 #include <linux/f2fs_fs.h>
 #include <linux/blkdev.h>
 #include <linux/debugfs.h>
@@ -106,7 +105,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 		}
 	}
 	mutex_unlock(&sit_i->sentry_lock);
-	dist = sbi->total_sections * hblks_per_sec * hblks_per_sec / 100;
+	dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
 	si->bimodal = bimodal / dist;
 	if (si->dirty_count)
 		si->avg_vblocks = total_vblocks / ndirty;
@@ -138,14 +137,13 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 	si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
 	si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi);
 	if (sbi->segs_per_sec > 1)
-		si->base_mem += sbi->total_sections *
-			sizeof(struct sec_entry);
+		si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry);
 	si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
 
 	/* build free segmap */
 	si->base_mem += sizeof(struct free_segmap_info);
 	si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
-	si->base_mem += f2fs_bitmap_size(sbi->total_sections);
+	si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
 
 	/* build curseg */
 	si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
@@ -154,7 +152,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 	/* build dirty segmap */
 	si->base_mem += sizeof(struct dirty_seglist_info);
 	si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
-	si->base_mem += 2 * f2fs_bitmap_size(TOTAL_SEGS(sbi));
+	si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
 
 	/* buld nm */
 	si->base_mem += sizeof(struct f2fs_nm_info);
@@ -177,16 +175,18 @@ get_cache:
 
 static int stat_show(struct seq_file *s, void *v)
 {
-	struct f2fs_stat_info *si, *next;
+	struct f2fs_stat_info *si;
 	int i = 0;
 	int j;
 
 	mutex_lock(&f2fs_stat_mutex);
-	list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) {
+	list_for_each_entry(si, &f2fs_stat_list, stat_list) {
+		char devname[BDEVNAME_SIZE];
 
 		update_general_status(si->sbi);
 
-		seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++);
+		seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n",
+			bdevname(si->sbi->sb->s_bdev, devname), i++);
 		seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
 			   si->sit_area_segs, si->nat_area_segs);
 		seq_printf(s, "[SSA: %d] [MAIN: %d",
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 989980e16d0..62f0d5977c6 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -13,6 +13,7 @@
 #include "f2fs.h"
 #include "node.h"
 #include "acl.h"
+#include "xattr.h"
 
 static unsigned long dir_blocks(struct inode *inode)
 {
@@ -60,7 +61,7 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
 
 static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
 {
-	mode_t mode = inode->i_mode;
+	umode_t mode = inode->i_mode;
 	de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
 }
 
@@ -148,7 +149,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
 
 	for (; bidx < end_block; bidx++) {
 		/* no need to allocate new dentry pages to all the indices */
-		dentry_page = find_data_page(dir, bidx);
+		dentry_page = find_data_page(dir, bidx, true);
 		if (IS_ERR(dentry_page)) {
 			room = true;
 			continue;
@@ -189,6 +190,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
 	unsigned int max_depth;
 	unsigned int level;
 
+	if (namelen > F2FS_NAME_LEN)
+		return NULL;
+
 	if (npages == 0)
 		return NULL;
 
@@ -212,9 +216,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
 
 struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
 {
-	struct page *page = NULL;
-	struct f2fs_dir_entry *de = NULL;
-	struct f2fs_dentry_block *dentry_blk = NULL;
+	struct page *page;
+	struct f2fs_dir_entry *de;
+	struct f2fs_dentry_block *dentry_blk;
 
 	page = get_lock_data_page(dir, 0);
 	if (IS_ERR(page))
@@ -246,9 +250,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
 void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
 		struct page *page, struct inode *inode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
-
-	mutex_lock_op(sbi, DENTRY_OPS);
 	lock_page(page);
 	wait_on_page_writeback(page);
 	de->ino = cpu_to_le32(inode->i_ino);
@@ -262,84 +263,128 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
 	F2FS_I(inode)->i_pino = dir->i_ino;
 
 	f2fs_put_page(page, 1);
-	mutex_unlock_op(sbi, DENTRY_OPS);
 }
 
-void init_dent_inode(struct dentry *dentry, struct page *ipage)
+static void init_dent_inode(const struct qstr *name, struct page *ipage)
 {
 	struct f2fs_node *rn;
 
-	if (IS_ERR(ipage))
-		return;
-
-	wait_on_page_writeback(ipage);
-
-	/* copy dentry info. to this inode page */
+	/* copy name info. to this inode page */
 	rn = (struct f2fs_node *)page_address(ipage);
-	rn->i.i_namelen = cpu_to_le32(dentry->d_name.len);
-	memcpy(rn->i.i_name, dentry->d_name.name, dentry->d_name.len);
+	rn->i.i_namelen = cpu_to_le32(name->len);
+	memcpy(rn->i.i_name, name->name, name->len);
 	set_page_dirty(ipage);
 }
 
-static int init_inode_metadata(struct inode *inode, struct dentry *dentry)
+static int make_empty_dir(struct inode *inode,
+		struct inode *parent, struct page *page)
 {
-	struct inode *dir = dentry->d_parent->d_inode;
+	struct page *dentry_page;
+	struct f2fs_dentry_block *dentry_blk;
+	struct f2fs_dir_entry *de;
+	void *kaddr;
+
+	dentry_page = get_new_data_page(inode, page, 0, true);
+	if (IS_ERR(dentry_page))
+		return PTR_ERR(dentry_page);
+
+	kaddr = kmap_atomic(dentry_page);
+	dentry_blk = (struct f2fs_dentry_block *)kaddr;
+
+	de = &dentry_blk->dentry[0];
+	de->name_len = cpu_to_le16(1);
+	de->hash_code = 0;
+	de->ino = cpu_to_le32(inode->i_ino);
+	memcpy(dentry_blk->filename[0], ".", 1);
+	set_de_type(de, inode);
+
+	de = &dentry_blk->dentry[1];
+	de->hash_code = 0;
+	de->name_len = cpu_to_le16(2);
+	de->ino = cpu_to_le32(parent->i_ino);
+	memcpy(dentry_blk->filename[1], "..", 2);
+	set_de_type(de, inode);
+
+	test_and_set_bit_le(0, &dentry_blk->dentry_bitmap);
+	test_and_set_bit_le(1, &dentry_blk->dentry_bitmap);
+	kunmap_atomic(kaddr);
+
+	set_page_dirty(dentry_page);
+	f2fs_put_page(dentry_page, 1);
+	return 0;
+}
+
+static struct page *init_inode_metadata(struct inode *inode,
+		struct inode *dir, const struct qstr *name)
+{
+	struct page *page;
+	int err;
 
 	if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
-		int err;
-		err = new_inode_page(inode, dentry);
-		if (err)
-			return err;
+		page = new_inode_page(inode, name);
+		if (IS_ERR(page))
+			return page;
 
 		if (S_ISDIR(inode->i_mode)) {
-			err = f2fs_make_empty(inode, dir);
-			if (err) {
-				remove_inode_page(inode);
-				return err;
-			}
+			err = make_empty_dir(inode, dir, page);
+			if (err)
+				goto error;
 		}
 
 		err = f2fs_init_acl(inode, dir);
-		if (err) {
-			remove_inode_page(inode);
-			return err;
-		}
+		if (err)
+			goto error;
+
+		err = f2fs_init_security(inode, dir, name, page);
+		if (err)
+			goto error;
+
+		wait_on_page_writeback(page);
 	} else {
-		struct page *ipage;
-		ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
-		if (IS_ERR(ipage))
-			return PTR_ERR(ipage);
-		set_cold_node(inode, ipage);
-		init_dent_inode(dentry, ipage);
-		f2fs_put_page(ipage, 1);
+		page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+		if (IS_ERR(page))
+			return page;
+
+		wait_on_page_writeback(page);
+		set_cold_node(inode, page);
 	}
+
+	init_dent_inode(name, page);
+
+	/*
+	 * This file should be checkpointed during fsync.
+	 * We lost i_pino from now on.
+	 */
 	if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
+		file_lost_pino(inode);
 		inc_nlink(inode);
-		f2fs_write_inode(inode, NULL);
 	}
-	return 0;
+	return page;
+
+error:
+	f2fs_put_page(page, 1);
+	remove_inode_page(inode);
+	return ERR_PTR(err);
 }
 
 static void update_parent_metadata(struct inode *dir, struct inode *inode,
 						unsigned int current_depth)
 {
-	bool need_dir_update = false;
-
 	if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
 		if (S_ISDIR(inode->i_mode)) {
 			inc_nlink(dir);
-			need_dir_update = true;
+			set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 		}
 		clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
 	}
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 	if (F2FS_I(dir)->i_current_depth != current_depth) {
 		F2FS_I(dir)->i_current_depth = current_depth;
-		need_dir_update = true;
+		set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 	}
 
-	if (need_dir_update)
-		f2fs_write_inode(dir, NULL);
+	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
+		update_inode_page(dir);
 	else
 		mark_inode_dirty(dir);
 
@@ -371,7 +416,11 @@ next:
 	goto next;
 }
 
-int f2fs_add_link(struct dentry *dentry, struct inode *inode)
+/*
+ * Caller should grab and release a mutex by calling mutex_lock_op() and
+ * mutex_unlock_op().
+ */
+int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode)
 {
 	unsigned int bit_pos;
 	unsigned int level;
@@ -380,17 +429,15 @@ int f2fs_add_link(struct dentry *dentry, struct inode *inode)
 	f2fs_hash_t dentry_hash;
 	struct f2fs_dir_entry *de;
 	unsigned int nbucket, nblock;
-	struct inode *dir = dentry->d_parent->d_inode;
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
-	const char *name = dentry->d_name.name;
-	size_t namelen = dentry->d_name.len;
+	size_t namelen = name->len;
 	struct page *dentry_page = NULL;
 	struct f2fs_dentry_block *dentry_blk = NULL;
 	int slots = GET_DENTRY_SLOTS(namelen);
+	struct page *page;
 	int err = 0;
 	int i;
 
-	dentry_hash = f2fs_dentry_hash(name, dentry->d_name.len);
+	dentry_hash = f2fs_dentry_hash(name->name, name->len);
 	level = 0;
 	current_depth = F2FS_I(dir)->i_current_depth;
 	if (F2FS_I(dir)->chash == dentry_hash) {
@@ -412,12 +459,9 @@ start:
 	bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
 
 	for (block = bidx; block <= (bidx + nblock - 1); block++) {
-		mutex_lock_op(sbi, DENTRY_OPS);
-		dentry_page = get_new_data_page(dir, block, true);
-		if (IS_ERR(dentry_page)) {
-			mutex_unlock_op(sbi, DENTRY_OPS);
+		dentry_page = get_new_data_page(dir, NULL, block, true);
+		if (IS_ERR(dentry_page))
 			return PTR_ERR(dentry_page);
-		}
 
 		dentry_blk = kmap(dentry_page);
 		bit_pos = room_for_filename(dentry_blk, slots);
@@ -426,37 +470,39 @@ start:
 
 		kunmap(dentry_page);
 		f2fs_put_page(dentry_page, 1);
-		mutex_unlock_op(sbi, DENTRY_OPS);
 	}
 
 	/* Move to next level to find the empty slot for new dentry */
 	++level;
 	goto start;
 add_dentry:
-	err = init_inode_metadata(inode, dentry);
-	if (err)
-		goto fail;
-
 	wait_on_page_writeback(dentry_page);
 
+	page = init_inode_metadata(inode, dir, name);
+	if (IS_ERR(page)) {
+		err = PTR_ERR(page);
+		goto fail;
+	}
 	de = &dentry_blk->dentry[bit_pos];
 	de->hash_code = dentry_hash;
 	de->name_len = cpu_to_le16(namelen);
-	memcpy(dentry_blk->filename[bit_pos], name, namelen);
+	memcpy(dentry_blk->filename[bit_pos], name->name, name->len);
 	de->ino = cpu_to_le32(inode->i_ino);
 	set_de_type(de, inode);
 	for (i = 0; i < slots; i++)
 		test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
 	set_page_dirty(dentry_page);
 
-	update_parent_metadata(dir, inode, current_depth);
-
-	/* update parent inode number before releasing dentry page */
+	/* we don't need to mark_inode_dirty now */
 	F2FS_I(inode)->i_pino = dir->i_ino;
+	update_inode(inode, page);
+	f2fs_put_page(page, 1);
+
+	update_parent_metadata(dir, inode, current_depth);
 fail:
+	clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 	kunmap(dentry_page);
 	f2fs_put_page(dentry_page, 1);
-	mutex_unlock_op(sbi, DENTRY_OPS);
 	return err;
 }
 
@@ -476,8 +522,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	void *kaddr = page_address(page);
 	int i;
 
-	mutex_lock_op(sbi, DENTRY_OPS);
-
 	lock_page(page);
 	wait_on_page_writeback(page);
 
@@ -497,7 +541,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 
 	if (inode && S_ISDIR(inode->i_mode)) {
 		drop_nlink(dir);
-		f2fs_write_inode(dir, NULL);
+		update_inode_page(dir);
 	} else {
 		mark_inode_dirty(dir);
 	}
@@ -509,7 +553,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 			drop_nlink(inode);
 			i_size_write(inode, 0);
 		}
-		f2fs_write_inode(inode, NULL);
+		update_inode_page(inode);
+
 		if (inode->i_nlink == 0)
 			add_orphan_inode(sbi, inode->i_ino);
 	}
@@ -522,45 +567,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 		inode_dec_dirty_dents(dir);
 	}
 	f2fs_put_page(page, 1);
-
-	mutex_unlock_op(sbi, DENTRY_OPS);
-}
-
-int f2fs_make_empty(struct inode *inode, struct inode *parent)
-{
-	struct page *dentry_page;
-	struct f2fs_dentry_block *dentry_blk;
-	struct f2fs_dir_entry *de;
-	void *kaddr;
-
-	dentry_page = get_new_data_page(inode, 0, true);
-	if (IS_ERR(dentry_page))
-		return PTR_ERR(dentry_page);
-
-	kaddr = kmap_atomic(dentry_page);
-	dentry_blk = (struct f2fs_dentry_block *)kaddr;
-
-	de = &dentry_blk->dentry[0];
-	de->name_len = cpu_to_le16(1);
-	de->hash_code = f2fs_dentry_hash(".", 1);
-	de->ino = cpu_to_le32(inode->i_ino);
-	memcpy(dentry_blk->filename[0], ".", 1);
-	set_de_type(de, inode);
-
-	de = &dentry_blk->dentry[1];
-	de->hash_code = f2fs_dentry_hash("..", 2);
-	de->name_len = cpu_to_le16(2);
-	de->ino = cpu_to_le32(parent->i_ino);
-	memcpy(dentry_blk->filename[1], "..", 2);
-	set_de_type(de, inode);
-
-	test_and_set_bit_le(0, &dentry_blk->dentry_bitmap);
-	test_and_set_bit_le(1, &dentry_blk->dentry_bitmap);
-	kunmap_atomic(kaddr);
-
-	set_page_dirty(dentry_page);
-	f2fs_put_page(dentry_page, 1);
-	return 0;
 }
 
 bool f2fs_empty_dir(struct inode *dir)
@@ -600,34 +606,26 @@ bool f2fs_empty_dir(struct inode *dir)
 	return true;
 }
 
-static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned long pos = file->f_pos;
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	unsigned long npages = dir_blocks(inode);
-	unsigned char *types = NULL;
-	unsigned int bit_pos = 0, start_bit_pos = 0;
-	int over = 0;
+	unsigned int bit_pos = 0;
 	struct f2fs_dentry_block *dentry_blk = NULL;
 	struct f2fs_dir_entry *de = NULL;
 	struct page *dentry_page = NULL;
-	unsigned int n = 0;
+	unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
 	unsigned char d_type = DT_UNKNOWN;
-	int slots;
 
-	types = f2fs_filetype_table;
-	bit_pos = (pos % NR_DENTRY_IN_BLOCK);
-	n = (pos / NR_DENTRY_IN_BLOCK);
+	bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
 
 	for ( ; n < npages; n++) {
 		dentry_page = get_lock_data_page(inode, n);
 		if (IS_ERR(dentry_page))
 			continue;
 
-		start_bit_pos = bit_pos;
 		dentry_blk = kmap(dentry_page);
 		while (bit_pos < NR_DENTRY_IN_BLOCK) {
-			d_type = DT_UNKNOWN;
 			bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
 							NR_DENTRY_IN_BLOCK,
 							bit_pos);
@@ -635,28 +633,26 @@ static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
 				break;
 
 			de = &dentry_blk->dentry[bit_pos];
-			if (types && de->file_type < F2FS_FT_MAX)
-				d_type = types[de->file_type];
-
-			over = filldir(dirent,
+			if (de->file_type < F2FS_FT_MAX)
+				d_type = f2fs_filetype_table[de->file_type];
+			else
+				d_type = DT_UNKNOWN;
+			if (!dir_emit(ctx,
 					dentry_blk->filename[bit_pos],
 					le16_to_cpu(de->name_len),
-					(n * NR_DENTRY_IN_BLOCK) + bit_pos,
-					le32_to_cpu(de->ino), d_type);
-			if (over) {
-				file->f_pos += bit_pos - start_bit_pos;
-				goto success;
-			}
-			slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
-			bit_pos += slots;
+					le32_to_cpu(de->ino), d_type))
+				goto stop;
+
+			bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+			ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos;
 		}
 		bit_pos = 0;
-		file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK;
+		ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
 		kunmap(dentry_page);
 		f2fs_put_page(dentry_page, 1);
 		dentry_page = NULL;
 	}
-success:
+stop:
 	if (dentry_page && !IS_ERR(dentry_page)) {
 		kunmap(dentry_page);
 		f2fs_put_page(dentry_page, 1);
@@ -668,7 +664,7 @@ success:
 const struct file_operations f2fs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= f2fs_readdir,
+	.iterate	= f2fs_readdir,
 	.fsync		= f2fs_sync_file,
 	.unlocked_ioctl	= f2fs_ioctl,
 };
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c8e2d751ef9..467d42d65c4 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -37,21 +37,35 @@
 		typecheck(unsigned long long, b) &&			\
 		((long long)((a) - (b)) > 0))
 
-typedef u64 block_t;
+typedef u32 block_t;	/*
+			 * should not change u32, since it is the on-disk block
+			 * address format, __le32.
+			 */
 typedef u32 nid_t;
 
 struct f2fs_mount_info {
 	unsigned int	opt;
 };
 
-static inline __u32 f2fs_crc32(void *buff, size_t len)
+#define CRCPOLY_LE 0xedb88320
+
+static inline __u32 f2fs_crc32(void *buf, size_t len)
 {
-	return crc32_le(F2FS_SUPER_MAGIC, buff, len);
+	unsigned char *p = (unsigned char *)buf;
+	__u32 crc = F2FS_SUPER_MAGIC;
+	int i;
+
+	while (len--) {
+		crc ^= *p++;
+		for (i = 0; i < 8; i++)
+			crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
+	}
+	return crc;
 }
 
-static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size)
+static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
 {
-	return f2fs_crc32(buff, buff_size) == blk_crc;
+	return f2fs_crc32(buf, buf_size) == blk_crc;
 }
 
 /*
@@ -104,6 +118,20 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
 }
 
 /*
+ * ioctl commands
+ */
+#define F2FS_IOC_GETFLAGS               FS_IOC_GETFLAGS
+#define F2FS_IOC_SETFLAGS               FS_IOC_SETFLAGS
+
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+/*
+ * ioctl commands in 32 bit emulation
+ */
+#define F2FS_IOC32_GETFLAGS             FS_IOC32_GETFLAGS
+#define F2FS_IOC32_SETFLAGS             FS_IOC32_SETFLAGS
+#endif
+
+/*
  * For INODE and NODE manager
  */
 #define XATTR_NODE_OFFSET	(-1)	/*
@@ -111,11 +139,15 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
 					 * file keeping -1 as its node offset to
 					 * distinguish from index node blocks.
 					 */
-#define RDONLY_NODE		1	/*
-					 * specify a read-only mode when getting
-					 * a node block. 0 is read-write mode.
-					 * used by get_dnode_of_data().
+enum {
+	ALLOC_NODE,			/* allocate a new node page if needed */
+	LOOKUP_NODE,			/* look up a node without readahead */
+	LOOKUP_NODE_RA,			/*
+					 * look up a node with readahead called
+					 * by get_datablock_ro.
 					 */
+};
+
 #define F2FS_LINK_MAX		32000	/* maximum link count per file */
 
 /* for in-memory extent cache entry */
@@ -123,13 +155,14 @@ struct extent_info {
 	rwlock_t ext_lock;	/* rwlock for consistency */
 	unsigned int fofs;	/* start offset in a file */
 	u32 blk_addr;		/* start block address of the extent */
-	unsigned int len;	/* lenth of the extent */
+	unsigned int len;	/* length of the extent */
 };
 
 /*
  * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
  */
 #define FADVISE_COLD_BIT	0x01
+#define FADVISE_LOST_PINO_BIT	0x02
 
 struct f2fs_inode_info {
 	struct inode vfs_inode;		/* serve a vfs inode */
@@ -141,7 +174,6 @@ struct f2fs_inode_info {
 
 	/* Use below internally in f2fs*/
 	unsigned long flags;		/* use to pass per-file flags */
-	unsigned long long data_version;/* lastes version of data for fsync */
 	atomic_t dirty_dents;		/* # of dirty dentry pages */
 	f2fs_hash_t chash;		/* hash value of given file name */
 	unsigned int clevel;		/* maximum level of given file name */
@@ -172,7 +204,6 @@ static inline void set_raw_extent(struct extent_info *ext,
 struct f2fs_nm_info {
 	block_t nat_blkaddr;		/* base disk address of NAT */
 	nid_t max_nid;			/* maximum possible node ids */
-	nid_t init_scan_nid;		/* the first nid to be scanned */
 	nid_t next_scan_nid;		/* the next nid to be scanned */
 
 	/* NAT cache management */
@@ -291,23 +322,12 @@ enum count_type {
 };
 
 /*
- * FS_LOCK nesting subclasses for the lock validator:
- *
- * The locking order between these classes is
- * RENAME -> DENTRY_OPS -> DATA_WRITE -> DATA_NEW
- *    -> DATA_TRUNC -> NODE_WRITE -> NODE_NEW -> NODE_TRUNC
+ * Uses as sbi->fs_lock[NR_GLOBAL_LOCKS].
+ * The checkpoint procedure blocks all the locks in this fs_lock array.
+ * Some FS operations grab free locks, and if there is no free lock,
+ * then wait to grab a lock in a round-robin manner.
  */
-enum lock_type {
-	RENAME,		/* for renaming operations */
-	DENTRY_OPS,	/* for directory operations */
-	DATA_WRITE,	/* for data write */
-	DATA_NEW,	/* for data allocation */
-	DATA_TRUNC,	/* for data truncate */
-	NODE_NEW,	/* for node allocation */
-	NODE_TRUNC,	/* for node truncate */
-	NODE_WRITE,	/* for node write */
-	NR_LOCK_TYPE,
-};
+#define NR_GLOBAL_LOCKS	8
 
 /*
  * The below are the page types of bios used in submti_bio().
@@ -347,11 +367,13 @@ struct f2fs_sb_info {
 	/* for checkpoint */
 	struct f2fs_checkpoint *ckpt;		/* raw checkpoint pointer */
 	struct inode *meta_inode;		/* cache meta blocks */
-	struct mutex cp_mutex;			/* for checkpoint procedure */
-	struct mutex fs_lock[NR_LOCK_TYPE];	/* for blocking FS operations */
-	struct mutex write_inode;		/* mutex for write inode */
+	struct mutex cp_mutex;			/* checkpoint procedure lock */
+	struct mutex fs_lock[NR_GLOBAL_LOCKS];	/* blocking FS operations */
+	struct mutex node_write;		/* locking node writes */
 	struct mutex writepages;		/* mutex for writepages() */
+	unsigned char next_lock_num;		/* round-robin global locks */
 	int por_doing;				/* recovery is doing or not */
+	int on_build_free_nids;			/* build_free_nids is doing */
 
 	/* for orphan inode management */
 	struct list_head orphan_inode_list;	/* orphan inode list */
@@ -361,7 +383,6 @@ struct f2fs_sb_info {
 	/* for directory inode management */
 	struct list_head dir_inode_list;	/* dir inode list */
 	spinlock_t dir_inode_lock;		/* for dir inode list lock */
-	unsigned int n_dirty_dirs;		/* # of dir inodes */
 
 	/* basic file system units */
 	unsigned int log_sectors_per_block;	/* log2 sectors per block */
@@ -392,17 +413,21 @@ struct f2fs_sb_info {
 	/* for cleaning operations */
 	struct mutex gc_mutex;			/* mutex for GC */
 	struct f2fs_gc_kthread	*gc_thread;	/* GC thread */
+	unsigned int cur_victim_sec;		/* current victim section num */
 
 	/*
 	 * for stat information.
 	 * one is for the LFS mode, and the other is for the SSR mode.
 	 */
+#ifdef CONFIG_F2FS_STAT_FS
 	struct f2fs_stat_info *stat_info;	/* FS status information */
 	unsigned int segment_count[2];		/* # of allocated segments */
 	unsigned int block_count[2];		/* # of allocated blocks */
-	unsigned int last_victim[2];		/* last victim segment # */
 	int total_hit_ext, read_hit_ext;	/* extent cache hit ratio */
 	int bg_gc;				/* background gc calls */
+	unsigned int n_dirty_dirs;		/* # of dir inodes */
+#endif
+	unsigned int last_victim[2];		/* last victim segment # */
 	spinlock_t stat_lock;			/* lock for stat operations */
 };
 
@@ -484,22 +509,59 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
 	cp->ckpt_flags = cpu_to_le32(ckpt_flags);
 }
 
-static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t)
+static inline void mutex_lock_all(struct f2fs_sb_info *sbi)
 {
-	mutex_lock_nested(&sbi->fs_lock[t], t);
+	int i;
+
+	for (i = 0; i < NR_GLOBAL_LOCKS; i++) {
+		/*
+		 * This is the only time we take multiple fs_lock[]
+		 * instances; the order is immaterial since we
+		 * always hold cp_mutex, which serializes multiple
+		 * such operations.
+		 */
+		mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex);
+	}
+}
+
+static inline void mutex_unlock_all(struct f2fs_sb_info *sbi)
+{
+	int i = 0;
+	for (; i < NR_GLOBAL_LOCKS; i++)
+		mutex_unlock(&sbi->fs_lock[i]);
+}
+
+static inline int mutex_lock_op(struct f2fs_sb_info *sbi)
+{
+	unsigned char next_lock = sbi->next_lock_num % NR_GLOBAL_LOCKS;
+	int i = 0;
+
+	for (; i < NR_GLOBAL_LOCKS; i++)
+		if (mutex_trylock(&sbi->fs_lock[i]))
+			return i;
+
+	mutex_lock(&sbi->fs_lock[next_lock]);
+	sbi->next_lock_num++;
+	return next_lock;
 }
 
-static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, enum lock_type t)
+static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, int ilock)
 {
-	mutex_unlock(&sbi->fs_lock[t]);
+	if (ilock < 0)
+		return;
+	BUG_ON(ilock >= NR_GLOBAL_LOCKS);
+	mutex_unlock(&sbi->fs_lock[ilock]);
 }
 
 /*
  * Check whether the given nid is within node id range.
  */
-static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
+static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
 {
-	BUG_ON((nid >= NM_I(sbi)->max_nid));
+	WARN_ON((nid >= NM_I(sbi)->max_nid));
+	if (nid >= NM_I(sbi)->max_nid)
+		return -EINVAL;
+	return 0;
 }
 
 #define F2FS_DEFAULT_ALLOCATED_BLOCKS	1
@@ -573,6 +635,14 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
 	return atomic_read(&sbi->nr_pages[count_type]);
 }
 
+static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
+{
+	unsigned int pages_per_sec = sbi->segs_per_sec *
+					(1 << sbi->log_blocks_per_seg);
+	return ((get_pages(sbi, block_type) + pages_per_sec - 1)
+			>> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+}
+
 static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
 {
 	block_t ret;
@@ -797,10 +867,12 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr)
 /* used for f2fs_inode_info->flags */
 enum {
 	FI_NEW_INODE,		/* indicate newly allocated inode */
-	FI_NEED_CP,		/* need to do checkpoint during fsync */
+	FI_DIRTY_INODE,		/* indicate inode is dirty or not */
 	FI_INC_LINK,		/* need to increment i_nlink */
 	FI_ACL_MODE,		/* indicate acl mode */
 	FI_NO_ALLOC,		/* should not allocate any blocks */
+	FI_UPDATE_DIR,		/* should update inode block for consistency */
+	FI_DELAY_IPUT,		/* used for the recovery */
 };
 
 static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -833,23 +905,31 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
 	return 0;
 }
 
+static inline int f2fs_readonly(struct super_block *sb)
+{
+	return sb->s_flags & MS_RDONLY;
+}
+
 /*
  * file.c
  */
 int f2fs_sync_file(struct file *, loff_t, loff_t, int);
 void truncate_data_blocks(struct dnode_of_data *);
 void f2fs_truncate(struct inode *);
+int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 int f2fs_setattr(struct dentry *, struct iattr *);
 int truncate_hole(struct inode *, pgoff_t, pgoff_t);
+int truncate_data_blocks_range(struct dnode_of_data *, int);
 long f2fs_ioctl(struct file *, unsigned int, unsigned long);
+long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /*
  * inode.c
  */
 void f2fs_set_inode_flags(struct inode *);
-struct inode *f2fs_iget_nowait(struct super_block *, unsigned long);
 struct inode *f2fs_iget(struct super_block *, unsigned long);
 void update_inode(struct inode *, struct page *);
+int update_inode_page(struct inode *);
 int f2fs_write_inode(struct inode *, struct writeback_control *);
 void f2fs_evict_inode(struct inode *);
 
@@ -867,12 +947,17 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
 ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
 void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
 				struct page *, struct inode *);
-void init_dent_inode(struct dentry *, struct page *);
-int f2fs_add_link(struct dentry *, struct inode *);
+int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
 void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
 int f2fs_make_empty(struct inode *, struct inode *);
 bool f2fs_empty_dir(struct inode *);
 
+static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
+{
+	return __f2fs_add_link(dentry->d_parent->d_inode, &dentry->d_name,
+				inode);
+}
+
 /*
  * super.c
  */
@@ -896,8 +981,8 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
 int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
 int truncate_inode_blocks(struct inode *, pgoff_t);
 int remove_inode_page(struct inode *);
-int new_inode_page(struct inode *, struct dentry *);
-struct page *new_node_page(struct dnode_of_data *, unsigned int);
+struct page *new_inode_page(struct inode *, const struct qstr *);
+struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
 void ra_node_page(struct f2fs_sb_info *, nid_t);
 struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
 struct page *get_node_page_ra(struct page *, int);
@@ -922,15 +1007,13 @@ void destroy_node_manager_caches(void);
  */
 void f2fs_balance_fs(struct f2fs_sb_info *);
 void invalidate_blocks(struct f2fs_sb_info *, block_t);
-void locate_dirty_segment(struct f2fs_sb_info *, unsigned int);
 void clear_prefree_segments(struct f2fs_sb_info *);
 int npages_for_summary_flush(struct f2fs_sb_info *);
 void allocate_new_segments(struct f2fs_sb_info *);
 struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
 struct bio *f2fs_bio_alloc(struct block_device *, int);
 void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync);
-int write_meta_page(struct f2fs_sb_info *, struct page *,
-					struct writeback_control *);
+void write_meta_page(struct f2fs_sb_info *, struct page *);
 void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int,
 					block_t, block_t *);
 void write_data_page(struct inode *, struct page *, struct dnode_of_data*,
@@ -946,7 +1029,6 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *,
 					int, unsigned int, int);
 void flush_sit_entries(struct f2fs_sb_info *);
 int build_segment_manager(struct f2fs_sb_info *);
-void reset_victim_segmap(struct f2fs_sb_info *);
 void destroy_segment_manager(struct f2fs_sb_info *);
 
 /*
@@ -961,10 +1043,11 @@ void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
 int recover_orphan_inodes(struct f2fs_sb_info *);
 int get_valid_checkpoint(struct f2fs_sb_info *);
 void set_dirty_dir_page(struct inode *, struct page *);
+void add_dirty_dir_inode(struct inode *);
 void remove_dirty_dir_inode(struct inode *);
+struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t);
 void sync_dirty_dir_inodes(struct f2fs_sb_info *);
-void block_operations(struct f2fs_sb_info *);
-void write_checkpoint(struct f2fs_sb_info *, bool, bool);
+void write_checkpoint(struct f2fs_sb_info *, bool);
 void init_orphan_info(struct f2fs_sb_info *);
 int __init create_checkpoint_caches(void);
 void destroy_checkpoint_caches(void);
@@ -974,9 +1057,9 @@ void destroy_checkpoint_caches(void);
  */
 int reserve_new_block(struct dnode_of_data *);
 void update_extent_cache(block_t, struct dnode_of_data *);
-struct page *find_data_page(struct inode *, pgoff_t);
+struct page *find_data_page(struct inode *, pgoff_t, bool);
 struct page *get_lock_data_page(struct inode *, pgoff_t);
-struct page *get_new_data_page(struct inode *, pgoff_t, bool);
+struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
 int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
 int do_write_data_page(struct page *);
 
@@ -994,7 +1077,7 @@ void destroy_gc_caches(void);
 /*
  * recovery.c
  */
-void recover_fsync_data(struct f2fs_sb_info *);
+int recover_fsync_data(struct f2fs_sb_info *);
 bool space_for_roll_forward(struct f2fs_sb_info *);
 
 /*
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 3191b52aafb..d2d2b7dbdcc 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -13,8 +13,10 @@
 #include <linux/stat.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
 #include <linux/falloc.h>
 #include <linux/types.h>
+#include <linux/compat.h>
 #include <linux/uaccess.h>
 #include <linux/mount.h>
 
@@ -23,28 +25,28 @@
 #include "segment.h"
 #include "xattr.h"
 #include "acl.h"
+#include <trace/events/f2fs.h>
 
 static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 						struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(vma->vm_file);
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	block_t old_blk_addr;
 	struct dnode_of_data dn;
-	int err;
+	int err, ilock;
 
 	f2fs_balance_fs(sbi);
 
 	sb_start_pagefault(inode->i_sb);
 
-	mutex_lock_op(sbi, DATA_NEW);
-
 	/* block allocation */
+	ilock = mutex_lock_op(sbi);
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
-	err = get_dnode_of_data(&dn, page->index, 0);
+	err = get_dnode_of_data(&dn, page->index, ALLOC_NODE);
 	if (err) {
-		mutex_unlock_op(sbi, DATA_NEW);
+		mutex_unlock_op(sbi, ilock);
 		goto out;
 	}
 
@@ -54,17 +56,17 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 		err = reserve_new_block(&dn);
 		if (err) {
 			f2fs_put_dnode(&dn);
-			mutex_unlock_op(sbi, DATA_NEW);
+			mutex_unlock_op(sbi, ilock);
 			goto out;
 		}
 	}
 	f2fs_put_dnode(&dn);
+	mutex_unlock_op(sbi, ilock);
 
-	mutex_unlock_op(sbi, DATA_NEW);
-
+	file_update_time(vma->vm_file);
 	lock_page(page);
 	if (page->mapping != inode->i_mapping ||
-			page_offset(page) >= i_size_read(inode) ||
+			page_offset(page) > i_size_read(inode) ||
 			!PageUptodate(page)) {
 		unlock_page(page);
 		err = -EFAULT;
@@ -75,10 +77,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 	 * check to see if the page is mapped already (no holes)
 	 */
 	if (PageMappedToDisk(page))
-		goto out;
-
-	/* fill the page */
-	wait_on_page_writeback(page);
+		goto mapped;
 
 	/* page is wholly or partially inside EOF */
 	if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
@@ -89,7 +88,9 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 	set_page_dirty(page);
 	SetPageUptodate(page);
 
-	file_update_time(vma->vm_file);
+mapped:
+	/* fill the page */
+	wait_on_page_writeback(page);
 out:
 	sb_end_pagefault(inode->i_sb);
 	return block_page_mkwrite_return(err);
@@ -101,28 +102,28 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
 	.remap_pages	= generic_file_remap_pages,
 };
 
-static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode)
+static int get_parent_ino(struct inode *inode, nid_t *pino)
 {
 	struct dentry *dentry;
-	nid_t pino;
 
 	inode = igrab(inode);
 	dentry = d_find_any_alias(inode);
-	if (!dentry) {
-		iput(inode);
+	iput(inode);
+	if (!dentry)
 		return 0;
-	}
-	pino = dentry->d_parent->d_inode->i_ino;
+
+	inode = igrab(dentry->d_parent->d_inode);
 	dput(dentry);
+
+	*pino = inode->i_ino;
 	iput(inode);
-	return !is_checkpointed_node(sbi, pino);
+	return 1;
 }
 
 int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	unsigned long long cur_version;
 	int ret = 0;
 	bool need_cp = false;
 	struct writeback_control wbc = {
@@ -131,12 +132,15 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		.for_reclaim = 0,
 	};
 
-	if (inode->i_sb->s_flags & MS_RDONLY)
+	if (f2fs_readonly(inode->i_sb))
 		return 0;
 
+	trace_f2fs_sync_file_enter(inode);
 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (ret)
+	if (ret) {
+		trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
 		return ret;
+	}
 
 	/* guarantee free sections for fsync */
 	f2fs_balance_fs(sbi);
@@ -146,40 +150,44 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
 		goto out;
 
-	mutex_lock(&sbi->cp_mutex);
-	cur_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
-	mutex_unlock(&sbi->cp_mutex);
-
-	if (F2FS_I(inode)->data_version != cur_version &&
-					!(inode->i_state & I_DIRTY))
-		goto out;
-	F2FS_I(inode)->data_version--;
-
 	if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
 		need_cp = true;
-	if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP))
+	else if (file_wrong_pino(inode))
 		need_cp = true;
-	if (!space_for_roll_forward(sbi))
+	else if (!space_for_roll_forward(sbi))
 		need_cp = true;
-	if (need_to_sync_dir(sbi, inode))
+	else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
 		need_cp = true;
 
 	if (need_cp) {
+		nid_t pino;
+
 		/* all the dirty node pages should be flushed for POR */
 		ret = f2fs_sync_fs(inode->i_sb, 1);
-		clear_inode_flag(F2FS_I(inode), FI_NEED_CP);
+		if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
+					get_parent_ino(inode, &pino)) {
+			F2FS_I(inode)->i_pino = pino;
+			file_got_pino(inode);
+			mark_inode_dirty_sync(inode);
+			ret = f2fs_write_inode(inode, NULL);
+			if (ret)
+				goto out;
+		}
 	} else {
 		/* if there is no written node page, write its inode page */
 		while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
+			mark_inode_dirty_sync(inode);
 			ret = f2fs_write_inode(inode, NULL);
 			if (ret)
 				goto out;
 		}
 		filemap_fdatawait_range(sbi->node_inode->i_mapping,
 							0, LONG_MAX);
+		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 	}
 out:
 	mutex_unlock(&inode->i_mutex);
+	trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
 	return ret;
 }
 
@@ -190,7 +198,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
+int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 {
 	int nr_free = 0, ofs = dn->ofs_in_node;
 	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
@@ -207,14 +215,17 @@ static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 
 		update_extent_cache(NULL_ADDR, dn);
 		invalidate_blocks(sbi, blkaddr);
-		dec_valid_block_count(sbi, dn->inode, 1);
 		nr_free++;
 	}
 	if (nr_free) {
+		dec_valid_block_count(sbi, dn->inode, nr_free);
 		set_page_dirty(dn->node_page);
 		sync_inode_page(dn);
 	}
 	dn->ofs_in_node = ofs;
+
+	trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid,
+					 dn->ofs_in_node, nr_free);
 	return nr_free;
 }
 
@@ -231,11 +242,15 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
 	if (!offset)
 		return;
 
-	page = find_data_page(inode, from >> PAGE_CACHE_SHIFT);
+	page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false);
 	if (IS_ERR(page))
 		return;
 
 	lock_page(page);
+	if (page->mapping != inode->i_mapping) {
+		f2fs_put_page(page, 1);
+		return;
+	}
 	wait_on_page_writeback(page);
 	zero_user(page, offset, PAGE_CACHE_SIZE - offset);
 	set_page_dirty(page);
@@ -248,20 +263,22 @@ static int truncate_blocks(struct inode *inode, u64 from)
 	unsigned int blocksize = inode->i_sb->s_blocksize;
 	struct dnode_of_data dn;
 	pgoff_t free_from;
-	int count = 0;
+	int count = 0, ilock = -1;
 	int err;
 
+	trace_f2fs_truncate_blocks_enter(inode, from);
+
 	free_from = (pgoff_t)
 			((from + blocksize - 1) >> (sbi->log_blocksize));
 
-	mutex_lock_op(sbi, DATA_TRUNC);
-
+	ilock = mutex_lock_op(sbi);
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
-	err = get_dnode_of_data(&dn, free_from, RDONLY_NODE);
+	err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
 	if (err) {
 		if (err == -ENOENT)
 			goto free_next;
-		mutex_unlock_op(sbi, DATA_TRUNC);
+		mutex_unlock_op(sbi, ilock);
+		trace_f2fs_truncate_blocks_exit(inode, err);
 		return err;
 	}
 
@@ -272,6 +289,7 @@ static int truncate_blocks(struct inode *inode, u64 from)
 
 	count -= dn.ofs_in_node;
 	BUG_ON(count < 0);
+
 	if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
 		truncate_data_blocks_range(&dn, count);
 		free_from += count;
@@ -280,11 +298,12 @@ static int truncate_blocks(struct inode *inode, u64 from)
 	f2fs_put_dnode(&dn);
 free_next:
 	err = truncate_inode_blocks(inode, free_from);
-	mutex_unlock_op(sbi, DATA_TRUNC);
+	mutex_unlock_op(sbi, ilock);
 
 	/* lastly zero out the first data page */
 	truncate_partial_data_page(inode, from);
 
+	trace_f2fs_truncate_blocks_exit(inode, err);
 	return err;
 }
 
@@ -294,15 +313,15 @@ void f2fs_truncate(struct inode *inode)
 				S_ISLNK(inode->i_mode)))
 		return;
 
+	trace_f2fs_truncate(inode);
+
 	if (!truncate_blocks(inode, i_size_read(inode))) {
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		mark_inode_dirty(inode);
 	}
-
-	f2fs_balance_fs(F2FS_SB(inode->i_sb));
 }
 
-static int f2fs_getattr(struct vfsmount *mnt,
+int f2fs_getattr(struct vfsmount *mnt,
 			 struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
@@ -356,6 +375,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 			attr->ia_size != i_size_read(inode)) {
 		truncate_setsize(inode, attr->ia_size);
 		f2fs_truncate(inode);
+		f2fs_balance_fs(F2FS_SB(inode->i_sb));
 	}
 
 	__setattr_copy(inode, attr);
@@ -387,12 +407,18 @@ const struct inode_operations f2fs_file_inode_operations = {
 static void fill_zero(struct inode *inode, pgoff_t index,
 					loff_t start, loff_t len)
 {
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct page *page;
+	int ilock;
 
 	if (!len)
 		return;
 
-	page = get_new_data_page(inode, index, false);
+	f2fs_balance_fs(sbi);
+
+	ilock = mutex_lock_op(sbi);
+	page = get_new_data_page(inode, NULL, index, false);
+	mutex_unlock_op(sbi, ilock);
 
 	if (!IS_ERR(page)) {
 		wait_on_page_writeback(page);
@@ -409,15 +435,10 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
 
 	for (index = pg_start; index < pg_end; index++) {
 		struct dnode_of_data dn;
-		struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-
-		f2fs_balance_fs(sbi);
 
-		mutex_lock_op(sbi, DATA_TRUNC);
 		set_new_dnode(&dn, inode, NULL, NULL, 0);
-		err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+		err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
 		if (err) {
-			mutex_unlock_op(sbi, DATA_TRUNC);
 			if (err == -ENOENT)
 				continue;
 			return err;
@@ -426,7 +447,6 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
 		if (dn.data_blkaddr != NULL_ADDR)
 			truncate_data_blocks_range(&dn, 1);
 		f2fs_put_dnode(&dn);
-		mutex_unlock_op(sbi, DATA_TRUNC);
 	}
 	return 0;
 }
@@ -456,12 +476,19 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
 		if (pg_start < pg_end) {
 			struct address_space *mapping = inode->i_mapping;
 			loff_t blk_start, blk_end;
+			struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+			int ilock;
+
+			f2fs_balance_fs(sbi);
 
 			blk_start = pg_start << PAGE_CACHE_SHIFT;
 			blk_end = pg_end << PAGE_CACHE_SHIFT;
 			truncate_inode_pages_range(mapping, blk_start,
 					blk_end - 1);
+
+			ilock = mutex_lock_op(sbi);
 			ret = truncate_hole(inode, pg_start, pg_end);
+			mutex_unlock_op(sbi, ilock);
 		}
 	}
 
@@ -495,13 +522,13 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
 
 	for (index = pg_start; index <= pg_end; index++) {
 		struct dnode_of_data dn;
+		int ilock;
 
-		mutex_lock_op(sbi, DATA_NEW);
-
+		ilock = mutex_lock_op(sbi);
 		set_new_dnode(&dn, inode, NULL, NULL, 0);
-		ret = get_dnode_of_data(&dn, index, 0);
+		ret = get_dnode_of_data(&dn, index, ALLOC_NODE);
 		if (ret) {
-			mutex_unlock_op(sbi, DATA_NEW);
+			mutex_unlock_op(sbi, ilock);
 			break;
 		}
 
@@ -509,13 +536,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
 			ret = reserve_new_block(&dn);
 			if (ret) {
 				f2fs_put_dnode(&dn);
-				mutex_unlock_op(sbi, DATA_NEW);
+				mutex_unlock_op(sbi, ilock);
 				break;
 			}
 		}
 		f2fs_put_dnode(&dn);
-
-		mutex_unlock_op(sbi, DATA_NEW);
+		mutex_unlock_op(sbi, ilock);
 
 		if (pg_start == pg_end)
 			new_size = offset + len;
@@ -539,7 +565,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
 static long f2fs_fallocate(struct file *file, int mode,
 				loff_t offset, loff_t len)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	long ret;
 
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -554,6 +580,7 @@ static long f2fs_fallocate(struct file *file, int mode,
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		mark_inode_dirty(inode);
 	}
+	trace_f2fs_fallocate(inode, mode, offset, len, ret);
 	return ret;
 }
 
@@ -572,20 +599,20 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
 
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	unsigned int flags;
 	int ret;
 
 	switch (cmd) {
-	case FS_IOC_GETFLAGS:
+	case F2FS_IOC_GETFLAGS:
 		flags = fi->i_flags & FS_FL_USER_VISIBLE;
 		return put_user(flags, (int __user *) arg);
-	case FS_IOC_SETFLAGS:
+	case F2FS_IOC_SETFLAGS:
 	{
 		unsigned int oldflags;
 
-		ret = mnt_want_write(filp->f_path.mnt);
+		ret = mnt_want_write_file(filp);
 		if (ret)
 			return ret;
 
@@ -622,7 +649,7 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		inode->i_ctime = CURRENT_TIME;
 		mark_inode_dirty(inode);
 out:
-		mnt_drop_write(filp->f_path.mnt);
+		mnt_drop_write_file(filp);
 		return ret;
 	}
 	default:
@@ -630,6 +657,23 @@ out:
 	}
 }
 
+#ifdef CONFIG_COMPAT
+long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case F2FS_IOC32_GETFLAGS:
+		cmd = F2FS_IOC_GETFLAGS;
+		break;
+	case F2FS_IOC32_SETFLAGS:
+		cmd = F2FS_IOC_SETFLAGS;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return f2fs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
+
 const struct file_operations f2fs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -641,6 +685,9 @@ const struct file_operations f2fs_file_operations = {
 	.fsync		= f2fs_sync_file,
 	.fallocate	= f2fs_fallocate,
 	.unlocked_ioctl	= f2fs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= f2fs_compat_ioctl,
+#endif
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 };
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index c386910dacc..35f9b1a196a 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -11,7 +11,6 @@
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/backing-dev.h>
-#include <linux/proc_fs.h>
 #include <linux/init.h>
 #include <linux/f2fs_fs.h>
 #include <linux/kthread.h>
@@ -23,6 +22,7 @@
 #include "node.h"
 #include "segment.h"
 #include "gc.h"
+#include <trace/events/f2fs.h>
 
 static struct kmem_cache *winode_slab;
 
@@ -44,10 +44,10 @@ static int gc_thread_func(void *data)
 		if (kthread_should_stop())
 			break;
 
-		f2fs_balance_fs(sbi);
-
-		if (!test_opt(sbi, BG_GC))
+		if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
+			wait_ms = GC_THREAD_MAX_SLEEP_TIME;
 			continue;
+		}
 
 		/*
 		 * [GC triggering condition]
@@ -76,13 +76,13 @@ static int gc_thread_func(void *data)
 		else
 			wait_ms = increase_sleep_time(wait_ms);
 
+#ifdef CONFIG_F2FS_STAT_FS
 		sbi->bg_gc++;
+#endif
 
-		if (f2fs_gc(sbi) == GC_NONE)
+		/* if return value is not zero, no victim was selected */
+		if (f2fs_gc(sbi))
 			wait_ms = GC_THREAD_NOGC_SLEEP_TIME;
-		else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME)
-			wait_ms = GC_THREAD_MAX_SLEEP_TIME;
-
 	} while (!kthread_should_stop());
 	return 0;
 }
@@ -90,20 +90,29 @@ static int gc_thread_func(void *data)
 int start_gc_thread(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_gc_kthread *gc_th;
+	dev_t dev = sbi->sb->s_bdev->bd_dev;
+	int err = 0;
 
+	if (!test_opt(sbi, BG_GC))
+		goto out;
 	gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
-	if (!gc_th)
-		return -ENOMEM;
+	if (!gc_th) {
+		err = -ENOMEM;
+		goto out;
+	}
 
 	sbi->gc_thread = gc_th;
 	init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
 	sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
-				GC_THREAD_NAME);
+			"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
 	if (IS_ERR(gc_th->f2fs_gc_task)) {
+		err = PTR_ERR(gc_th->f2fs_gc_task);
 		kfree(gc_th);
-		return -ENOMEM;
+		sbi->gc_thread = NULL;
 	}
-	return 0;
+
+out:
+	return err;
 }
 
 void stop_gc_thread(struct f2fs_sb_info *sbi)
@@ -126,7 +135,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 
-	if (p->alloc_mode) {
+	if (p->alloc_mode == SSR) {
 		p->gc_mode = GC_GREEDY;
 		p->dirty_segmap = dirty_i->dirty_segmap[type];
 		p->ofs_unit = 1;
@@ -141,6 +150,9 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
 static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
 				struct victim_sel_policy *p)
 {
+	/* SSR allocates in a segment unit */
+	if (p->alloc_mode == SSR)
+		return 1 << sbi->log_blocks_per_seg;
 	if (p->gc_mode == GC_GREEDY)
 		return (1 << sbi->log_blocks_per_seg) * p->ofs_unit;
 	else if (p->gc_mode == GC_CB)
@@ -152,18 +164,21 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
 static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-	unsigned int segno;
+	unsigned int hint = 0;
+	unsigned int secno;
 
 	/*
 	 * If the gc_type is FG_GC, we can select victim segments
 	 * selected by background GC before.
 	 * Those segments guarantee they have small valid blocks.
 	 */
-	segno = find_next_bit(dirty_i->victim_segmap[BG_GC],
-						TOTAL_SEGS(sbi), 0);
-	if (segno < TOTAL_SEGS(sbi)) {
-		clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
-		return segno;
+next:
+	secno = find_next_bit(dirty_i->victim_secmap, TOTAL_SECS(sbi), hint++);
+	if (secno < TOTAL_SECS(sbi)) {
+		if (sec_usage_check(sbi, secno))
+			goto next;
+		clear_bit(secno, dirty_i->victim_secmap);
+		return secno * sbi->segs_per_sec;
 	}
 	return NULL_SEGNO;
 }
@@ -214,7 +229,7 @@ static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno,
 }
 
 /*
- * This function is called from two pathes.
+ * This function is called from two paths.
  * One is garbage collection and the other is SSR segment selection.
  * When it is called during GC, it just gets a victim segment
  * and it does not remove it from dirty seglist.
@@ -226,14 +241,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 	struct victim_sel_policy p;
-	unsigned int segno;
+	unsigned int secno, max_cost;
 	int nsearched = 0;
 
 	p.alloc_mode = alloc_mode;
 	select_policy(sbi, gc_type, type, &p);
 
 	p.min_segno = NULL_SEGNO;
-	p.min_cost = get_max_cost(sbi, &p);
+	p.min_cost = max_cost = get_max_cost(sbi, &p);
 
 	mutex_lock(&dirty_i->seglist_lock);
 
@@ -245,6 +260,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 
 	while (1) {
 		unsigned long cost;
+		unsigned int segno;
 
 		segno = find_next_bit(p.dirty_segmap,
 						TOTAL_SEGS(sbi), p.offset);
@@ -257,13 +273,11 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 			break;
 		}
 		p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit;
+		secno = GET_SECNO(sbi, segno);
 
-		if (test_bit(segno, dirty_i->victim_segmap[FG_GC]))
-			continue;
-		if (gc_type == BG_GC &&
-				test_bit(segno, dirty_i->victim_segmap[BG_GC]))
+		if (sec_usage_check(sbi, secno))
 			continue;
-		if (IS_CURSEC(sbi, GET_SECNO(sbi, segno)))
+		if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
 			continue;
 
 		cost = get_gc_cost(sbi, segno, &p);
@@ -273,7 +287,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 			p.min_cost = cost;
 		}
 
-		if (cost == get_max_cost(sbi, &p))
+		if (cost == max_cost)
 			continue;
 
 		if (nsearched++ >= MAX_VICTIM_SEARCH) {
@@ -281,15 +295,20 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
 			break;
 		}
 	}
-got_it:
 	if (p.min_segno != NULL_SEGNO) {
-		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
+got_it:
 		if (p.alloc_mode == LFS) {
-			int i;
-			for (i = 0; i < p.ofs_unit; i++)
-				set_bit(*result + i,
-					dirty_i->victim_segmap[gc_type]);
+			secno = GET_SECNO(sbi, p.min_segno);
+			if (gc_type == FG_GC)
+				sbi->cur_victim_sec = secno;
+			else
+				set_bit(secno, dirty_i->victim_secmap);
 		}
+		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
+
+		trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
+				sbi->cur_victim_sec,
+				prefree_segments(sbi), free_segments(sbi));
 	}
 	mutex_unlock(&dirty_i->seglist_lock);
 
@@ -302,28 +321,21 @@ static const struct victim_selection default_v_ops = {
 
 static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
 {
-	struct list_head *this;
 	struct inode_entry *ie;
 
-	list_for_each(this, ilist) {
-		ie = list_entry(this, struct inode_entry, list);
+	list_for_each_entry(ie, ilist, list)
 		if (ie->inode->i_ino == ino)
 			return ie->inode;
-	}
 	return NULL;
 }
 
 static void add_gc_inode(struct inode *inode, struct list_head *ilist)
 {
-	struct list_head *this;
-	struct inode_entry *new_ie, *ie;
+	struct inode_entry *new_ie;
 
-	list_for_each(this, ilist) {
-		ie = list_entry(this, struct inode_entry, list);
-		if (ie->inode == inode) {
-			iput(inode);
-			return;
-		}
+	if (inode == find_gc_inode(inode->i_ino, ilist)) {
+		iput(inode);
+		return;
 	}
 repeat:
 	new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);
@@ -356,7 +368,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
 	sentry = get_seg_entry(sbi, segno);
 	ret = f2fs_test_bit(offset, sentry->cur_valid_map);
 	mutex_unlock(&sit_i->sentry_lock);
-	return ret ? GC_OK : GC_NEXT;
+	return ret;
 }
 
 /*
@@ -364,7 +376,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
  * On validity, copy that node with cold status, otherwise (invalid node)
  * ignore that.
  */
-static int gc_node_segment(struct f2fs_sb_info *sbi,
+static void gc_node_segment(struct f2fs_sb_info *sbi,
 		struct f2fs_summary *sum, unsigned int segno, int gc_type)
 {
 	bool initial = true;
@@ -373,24 +385,16 @@ static int gc_node_segment(struct f2fs_sb_info *sbi,
 
 next_step:
 	entry = sum;
+
 	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
 		nid_t nid = le32_to_cpu(entry->nid);
 		struct page *node_page;
-		int err;
 
-		/*
-		 * It makes sure that free segments are able to write
-		 * all the dirty node pages before CP after this CP.
-		 * So let's check the space of dirty node pages.
-		 */
-		if (should_do_checkpoint(sbi)) {
-			mutex_lock(&sbi->cp_mutex);
-			block_operations(sbi);
-			return GC_BLOCKED;
-		}
+		/* stop BG_GC if there is not enough free sections. */
+		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
+			return;
 
-		err = check_valid_map(sbi, segno, off);
-		if (err == GC_NEXT)
+		if (check_valid_map(sbi, segno, off) == 0)
 			continue;
 
 		if (initial) {
@@ -402,11 +406,18 @@ next_step:
 			continue;
 
 		/* set page dirty and write it */
-		if (!PageWriteback(node_page))
+		if (gc_type == FG_GC) {
+			f2fs_submit_bio(sbi, NODE, true);
+			wait_on_page_writeback(node_page);
 			set_page_dirty(node_page);
+		} else {
+			if (!PageWriteback(node_page))
+				set_page_dirty(node_page);
+		}
 		f2fs_put_page(node_page, 1);
 		stat_inc_node_blk_count(sbi, 1);
 	}
+
 	if (initial) {
 		initial = false;
 		goto next_step;
@@ -419,8 +430,14 @@ next_step:
 			.for_reclaim = 0,
 		};
 		sync_node_pages(sbi, 0, &wbc);
+
+		/*
+		 * In the case of FG_GC, it'd be better to reclaim this victim
+		 * completely.
+		 */
+		if (get_valid_blocks(sbi, segno, 1) != 0)
+			goto next_step;
 	}
-	return GC_DONE;
 }
 
 /*
@@ -463,13 +480,13 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 
 	node_page = get_node_page(sbi, nid);
 	if (IS_ERR(node_page))
-		return GC_NEXT;
+		return 0;
 
 	get_node_info(sbi, nid, dni);
 
 	if (sum->version != dni->version) {
 		f2fs_put_page(node_page, 1);
-		return GC_NEXT;
+		return 0;
 	}
 
 	*nofs = ofs_of_node(node_page);
@@ -477,27 +494,25 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 	f2fs_put_page(node_page, 1);
 
 	if (source_blkaddr != blkaddr)
-		return GC_NEXT;
-	return GC_OK;
+		return 0;
+	return 1;
 }
 
 static void move_data_page(struct inode *inode, struct page *page, int gc_type)
 {
-	if (page->mapping != inode->i_mapping)
-		goto out;
-
-	if (inode != page->mapping->host)
-		goto out;
-
-	if (PageWriteback(page))
-		goto out;
-
 	if (gc_type == BG_GC) {
+		if (PageWriteback(page))
+			goto out;
 		set_page_dirty(page);
 		set_cold_data(page);
 	} else {
 		struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-		mutex_lock_op(sbi, DATA_WRITE);
+
+		if (PageWriteback(page)) {
+			f2fs_submit_bio(sbi, DATA, true);
+			wait_on_page_writeback(page);
+		}
+
 		if (clear_page_dirty_for_io(page) &&
 			S_ISDIR(inode->i_mode)) {
 			dec_page_count(sbi, F2FS_DIRTY_DENTS);
@@ -505,7 +520,6 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
 		}
 		set_cold_data(page);
 		do_write_data_page(page);
-		mutex_unlock_op(sbi, DATA_WRITE);
 		clear_cold_data(page);
 	}
 out:
@@ -519,19 +533,20 @@ out:
  * If the parent node is not valid or the data block address is different,
  * the victim data block is ignored.
  */
-static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 		struct list_head *ilist, unsigned int segno, int gc_type)
 {
 	struct super_block *sb = sbi->sb;
 	struct f2fs_summary *entry;
 	block_t start_addr;
-	int err, off;
+	int off;
 	int phase = 0;
 
 	start_addr = START_BLOCK(sbi, segno);
 
 next_step:
 	entry = sum;
+
 	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
 		struct page *data_page;
 		struct inode *inode;
@@ -539,20 +554,11 @@ next_step:
 		unsigned int ofs_in_node, nofs;
 		block_t start_bidx;
 
-		/*
-		 * It makes sure that free segments are able to write
-		 * all the dirty node pages before CP after this CP.
-		 * So let's check the space of dirty node pages.
-		 */
-		if (should_do_checkpoint(sbi)) {
-			mutex_lock(&sbi->cp_mutex);
-			block_operations(sbi);
-			err = GC_BLOCKED;
-			goto stop;
-		}
+		/* stop BG_GC if there is not enough free sections. */
+		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
+			return;
 
-		err = check_valid_map(sbi, segno, off);
-		if (err == GC_NEXT)
+		if (check_valid_map(sbi, segno, off) == 0)
 			continue;
 
 		if (phase == 0) {
@@ -561,8 +567,7 @@ next_step:
 		}
 
 		/* Get an inode by ino with checking validity */
-		err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs);
-		if (err == GC_NEXT)
+		if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0)
 			continue;
 
 		if (phase == 1) {
@@ -574,12 +579,12 @@ next_step:
 		ofs_in_node = le16_to_cpu(entry->ofs_in_node);
 
 		if (phase == 2) {
-			inode = f2fs_iget_nowait(sb, dni.ino);
+			inode = f2fs_iget(sb, dni.ino);
 			if (IS_ERR(inode))
 				continue;
 
 			data_page = find_data_page(inode,
-					start_bidx + ofs_in_node);
+					start_bidx + ofs_in_node, false);
 			if (IS_ERR(data_page))
 				goto next_iput;
 
@@ -600,13 +605,22 @@ next_step:
 next_iput:
 		iput(inode);
 	}
+
 	if (++phase < 4)
 		goto next_step;
-	err = GC_DONE;
-stop:
-	if (gc_type == FG_GC)
+
+	if (gc_type == FG_GC) {
 		f2fs_submit_bio(sbi, DATA, true);
-	return err;
+
+		/*
+		 * In the case of FG_GC, it'd be better to reclaim this victim
+		 * completely.
+		 */
+		if (get_valid_blocks(sbi, segno, 1) != 0) {
+			phase = 2;
+			goto next_step;
+		}
+	}
 }
 
 static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
@@ -620,39 +634,36 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
 	return ret;
 }
 
-static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
+static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
 				struct list_head *ilist, int gc_type)
 {
 	struct page *sum_page;
 	struct f2fs_summary_block *sum;
-	int ret = GC_DONE;
+	struct blk_plug plug;
 
 	/* read segment summary of victim */
 	sum_page = get_sum_page(sbi, segno);
 	if (IS_ERR(sum_page))
-		return GC_ERROR;
+		return;
+
+	blk_start_plug(&plug);
 
-	/*
-	 * CP needs to lock sum_page. In this time, we don't need
-	 * to lock this page, because this summary page is not gone anywhere.
-	 * Also, this page is not gonna be updated before GC is done.
-	 */
-	unlock_page(sum_page);
 	sum = page_address(sum_page);
 
 	switch (GET_SUM_TYPE((&sum->footer))) {
 	case SUM_TYPE_NODE:
-		ret = gc_node_segment(sbi, sum->entries, segno, gc_type);
+		gc_node_segment(sbi, sum->entries, segno, gc_type);
 		break;
 	case SUM_TYPE_DATA:
-		ret = gc_data_segment(sbi, sum->entries, ilist, segno, gc_type);
+		gc_data_segment(sbi, sum->entries, ilist, segno, gc_type);
 		break;
 	}
+	blk_finish_plug(&plug);
+
 	stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)));
 	stat_inc_call_count(sbi->stat_info);
 
-	f2fs_put_page(sum_page, 0);
-	return ret;
+	f2fs_put_page(sum_page, 1);
 }
 
 int f2fs_gc(struct f2fs_sb_info *sbi)
@@ -660,40 +671,42 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
 	struct list_head ilist;
 	unsigned int segno, i;
 	int gc_type = BG_GC;
-	int gc_status = GC_NONE;
+	int nfree = 0;
+	int ret = -1;
 
 	INIT_LIST_HEAD(&ilist);
 gc_more:
 	if (!(sbi->sb->s_flags & MS_ACTIVE))
 		goto stop;
 
-	if (has_not_enough_free_secs(sbi))
+	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
 		gc_type = FG_GC;
+		write_checkpoint(sbi, false);
+	}
 
 	if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
 		goto stop;
+	ret = 0;
 
-	for (i = 0; i < sbi->segs_per_sec; i++) {
-		/*
-		 * do_garbage_collect will give us three gc_status:
-		 * GC_ERROR, GC_DONE, and GC_BLOCKED.
-		 * If GC is finished uncleanly, we have to return
-		 * the victim to dirty segment list.
-		 */
-		gc_status = do_garbage_collect(sbi, segno + i, &ilist, gc_type);
-		if (gc_status != GC_DONE)
-			break;
-	}
-	if (has_not_enough_free_secs(sbi)) {
-		write_checkpoint(sbi, (gc_status == GC_BLOCKED), false);
-		if (has_not_enough_free_secs(sbi))
-			goto gc_more;
+	for (i = 0; i < sbi->segs_per_sec; i++)
+		do_garbage_collect(sbi, segno + i, &ilist, gc_type);
+
+	if (gc_type == FG_GC) {
+		sbi->cur_victim_sec = NULL_SEGNO;
+		nfree++;
+		WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec));
 	}
+
+	if (has_not_enough_free_secs(sbi, nfree))
+		goto gc_more;
+
+	if (gc_type == FG_GC)
+		write_checkpoint(sbi, false);
 stop:
 	mutex_unlock(&sbi->gc_mutex);
 
 	put_gc_inode(&ilist);
-	return gc_status;
+	return ret;
 }
 
 void build_gc_manager(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index b026d9354cc..2c6a6bd0832 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -8,30 +8,20 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#define GC_THREAD_NAME	"f2fs_gc_task"
 #define GC_THREAD_MIN_WB_PAGES		1	/*
 						 * a threshold to determine
 						 * whether IO subsystem is idle
 						 * or not
 						 */
-#define GC_THREAD_MIN_SLEEP_TIME	10000 /* milliseconds */
-#define GC_THREAD_MAX_SLEEP_TIME	30000
-#define GC_THREAD_NOGC_SLEEP_TIME	10000
+#define GC_THREAD_MIN_SLEEP_TIME	30000	/* milliseconds */
+#define GC_THREAD_MAX_SLEEP_TIME	60000
+#define GC_THREAD_NOGC_SLEEP_TIME	300000	/* wait 5 min */
 #define LIMIT_INVALID_BLOCK	40 /* percentage over total user space */
 #define LIMIT_FREE_BLOCK	40 /* percentage over invalid + free space */
 
 /* Search max. number of dirty segments to select a victim segment */
 #define MAX_VICTIM_SEARCH	20
 
-enum {
-	GC_NONE = 0,
-	GC_ERROR,
-	GC_OK,
-	GC_NEXT,
-	GC_BLOCKED,
-	GC_DONE,
-};
-
 struct f2fs_gc_kthread {
 	struct task_struct *f2fs_gc_task;
 	wait_queue_head_t gc_wait_queue_head;
@@ -68,6 +58,9 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
 
 static inline long increase_sleep_time(long wait)
 {
+	if (wait == GC_THREAD_NOGC_SLEEP_TIME)
+		return wait;
+
 	wait += GC_THREAD_MIN_SLEEP_TIME;
 	if (wait > GC_THREAD_MAX_SLEEP_TIME)
 		wait = GC_THREAD_MAX_SLEEP_TIME;
@@ -76,6 +69,9 @@ static inline long increase_sleep_time(long wait)
 
 static inline long decrease_sleep_time(long wait)
 {
+	if (wait == GC_THREAD_NOGC_SLEEP_TIME)
+		wait = GC_THREAD_MAX_SLEEP_TIME;
+
 	wait -= GC_THREAD_MIN_SLEEP_TIME;
 	if (wait <= GC_THREAD_MIN_SLEEP_TIME)
 		wait = GC_THREAD_MIN_SLEEP_TIME;
@@ -104,14 +100,3 @@ static inline int is_idle(struct f2fs_sb_info *sbi)
 	struct request_list *rl = &q->root_rl;
 	return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]);
 }
-
-static inline bool should_do_checkpoint(struct f2fs_sb_info *sbi)
-{
-	unsigned int pages_per_sec = sbi->segs_per_sec *
-					(1 << sbi->log_blocks_per_seg);
-	int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1)
-			>> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
-	int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1)
-			>> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
-	return free_sections(sbi) <= (node_secs + 2 * dent_secs + 2);
-}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 79424177732..2b2d45d19e3 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -16,10 +16,7 @@
 #include "f2fs.h"
 #include "node.h"
 
-struct f2fs_iget_args {
-	u64 ino;
-	int on_free;
-};
+#include <trace/events/f2fs.h>
 
 void f2fs_set_inode_flags(struct inode *inode)
 {
@@ -40,34 +37,6 @@ void f2fs_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_DIRSYNC;
 }
 
-static int f2fs_iget_test(struct inode *inode, void *data)
-{
-	struct f2fs_iget_args *args = data;
-
-	if (inode->i_ino != args->ino)
-		return 0;
-	if (inode->i_state & (I_FREEING | I_WILL_FREE)) {
-		args->on_free = 1;
-		return 0;
-	}
-	return 1;
-}
-
-struct inode *f2fs_iget_nowait(struct super_block *sb, unsigned long ino)
-{
-	struct f2fs_iget_args args = {
-		.ino = ino,
-		.on_free = 0
-	};
-	struct inode *inode = ilookup5(sb, ino, f2fs_iget_test, &args);
-
-	if (inode)
-		return inode;
-	if (!args.on_free)
-		return f2fs_iget(sb, ino);
-	return ERR_PTR(-ENOENT);
-}
-
 static int do_read_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -77,7 +46,11 @@ static int do_read_inode(struct inode *inode)
 	struct f2fs_inode *ri;
 
 	/* Check if ino is within scope */
-	check_nid_range(sbi, inode->i_ino);
+	if (check_nid_range(sbi, inode->i_ino)) {
+		f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu",
+			 (unsigned long) inode->i_ino);
+		return -EINVAL;
+	}
 
 	node_page = get_node_page(sbi, inode->i_ino);
 	if (IS_ERR(node_page))
@@ -100,12 +73,15 @@ static int do_read_inode(struct inode *inode)
 	inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
 	inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
 	inode->i_generation = le32_to_cpu(ri->i_generation);
+	if (ri->i_addr[0])
+		inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0]));
+	else
+		inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1]));
 
 	fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
 	fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
 	fi->i_flags = le32_to_cpu(ri->i_flags);
 	fi->flags = 0;
-	fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1;
 	fi->i_advise = ri->i_advise;
 	fi->i_pino = le32_to_cpu(ri->i_pino);
 	get_extent_info(&fi->ext, ri->i_ext);
@@ -117,25 +93,22 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 	struct inode *inode;
-	int ret;
+	int ret = 0;
 
 	inode = iget_locked(sb, ino);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
-	if (!(inode->i_state & I_NEW))
+
+	if (!(inode->i_state & I_NEW)) {
+		trace_f2fs_iget(inode);
 		return inode;
+	}
 	if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
 		goto make_now;
 
 	ret = do_read_inode(inode);
 	if (ret)
 		goto bad_inode;
-
-	if (!sbi->por_doing && inode->i_nlink == 0) {
-		ret = -ENOENT;
-		goto bad_inode;
-	}
-
 make_now:
 	if (ino == F2FS_NODE_INO(sbi)) {
 		inode->i_mapping->a_ops = &f2fs_node_aops;
@@ -151,8 +124,7 @@ make_now:
 		inode->i_op = &f2fs_dir_inode_operations;
 		inode->i_fop = &f2fs_dir_operations;
 		inode->i_mapping->a_ops = &f2fs_dblock_aops;
-		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE |
-				__GFP_ZERO);
+		mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
 	} else if (S_ISLNK(inode->i_mode)) {
 		inode->i_op = &f2fs_symlink_inode_operations;
 		inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -165,11 +137,12 @@ make_now:
 		goto bad_inode;
 	}
 	unlock_new_inode(inode);
-
+	trace_f2fs_iget(inode);
 	return inode;
 
 bad_inode:
 	iget_failed(inode);
+	trace_f2fs_iget_exit(inode, ret);
 	return ERR_PTR(ret);
 }
 
@@ -203,51 +176,73 @@ void update_inode(struct inode *inode, struct page *node_page)
 	ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
 	ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
 	ri->i_generation = cpu_to_le32(inode->i_generation);
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (old_valid_dev(inode->i_rdev)) {
+			ri->i_addr[0] =
+				cpu_to_le32(old_encode_dev(inode->i_rdev));
+			ri->i_addr[1] = 0;
+		} else {
+			ri->i_addr[0] = 0;
+			ri->i_addr[1] =
+				cpu_to_le32(new_encode_dev(inode->i_rdev));
+			ri->i_addr[2] = 0;
+		}
+	}
+
 	set_cold_node(inode, node_page);
 	set_page_dirty(node_page);
+	clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
 }
 
-int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
+int update_inode_page(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct page *node_page;
-	bool need_lock = false;
-
-	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
-			inode->i_ino == F2FS_META_INO(sbi))
-		return 0;
-
-	if (wbc)
-		f2fs_balance_fs(sbi);
 
 	node_page = get_node_page(sbi, inode->i_ino);
 	if (IS_ERR(node_page))
 		return PTR_ERR(node_page);
 
-	if (!PageDirty(node_page)) {
-		need_lock = true;
-		f2fs_put_page(node_page, 1);
-		mutex_lock(&sbi->write_inode);
-		node_page = get_node_page(sbi, inode->i_ino);
-		if (IS_ERR(node_page)) {
-			mutex_unlock(&sbi->write_inode);
-			return PTR_ERR(node_page);
-		}
-	}
 	update_inode(inode, node_page);
 	f2fs_put_page(node_page, 1);
-	if (need_lock)
-		mutex_unlock(&sbi->write_inode);
 	return 0;
 }
 
+int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	int ret, ilock;
+
+	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
+			inode->i_ino == F2FS_META_INO(sbi))
+		return 0;
+
+	if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
+		return 0;
+
+	if (wbc)
+		f2fs_balance_fs(sbi);
+
+	/*
+	 * We need to lock here to prevent from producing dirty node pages
+	 * during the urgent cleaning time when runing out of free sections.
+	 */
+	ilock = mutex_lock_op(sbi);
+	ret = update_inode_page(inode);
+	mutex_unlock_op(sbi, ilock);
+	return ret;
+}
+
 /*
  * Called at the last iput() if i_nlink is zero
  */
 void f2fs_evict_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	int ilock;
 
+	trace_f2fs_evict_inode(inode);
 	truncate_inode_pages(&inode->i_data, 0);
 
 	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
@@ -260,13 +255,18 @@ void f2fs_evict_inode(struct inode *inode)
 	if (inode->i_nlink || is_bad_inode(inode))
 		goto no_delete;
 
+	sb_start_intwrite(inode->i_sb);
 	set_inode_flag(F2FS_I(inode), FI_NO_ALLOC);
 	i_size_write(inode, 0);
 
 	if (F2FS_HAS_BLOCKS(inode))
 		f2fs_truncate(inode);
 
+	ilock = mutex_lock_op(sbi);
 	remove_inode_page(inode);
+	mutex_unlock_op(sbi, ilock);
+
+	sb_end_intwrite(inode->i_sb);
 no_delete:
 	clear_inode(inode);
 }
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 1a49b881bac..64c07169df0 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -15,8 +15,10 @@
 #include <linux/ctype.h>
 
 #include "f2fs.h"
+#include "node.h"
 #include "xattr.h"
 #include "acl.h"
+#include <trace/events/f2fs.h>
 
 static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 {
@@ -25,19 +27,19 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 	nid_t ino;
 	struct inode *inode;
 	bool nid_free = false;
-	int err;
+	int err, ilock;
 
 	inode = new_inode(sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
-	mutex_lock_op(sbi, NODE_NEW);
+	ilock = mutex_lock_op(sbi);
 	if (!alloc_nid(sbi, &ino)) {
-		mutex_unlock_op(sbi, NODE_NEW);
+		mutex_unlock_op(sbi, ilock);
 		err = -ENOSPC;
 		goto fail;
 	}
-	mutex_unlock_op(sbi, NODE_NEW);
+	mutex_unlock_op(sbi, ilock);
 
 	inode->i_uid = current_fsuid();
 
@@ -61,7 +63,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 		nid_free = true;
 		goto out;
 	}
-
+	trace_f2fs_new_inode(inode, 0);
 	mark_inode_dirty(inode);
 	return inode;
 
@@ -69,6 +71,8 @@ out:
 	clear_nlink(inode);
 	unlock_new_inode(inode);
 fail:
+	trace_f2fs_new_inode(inode, err);
+	make_bad_inode(inode);
 	iput(inode);
 	if (nid_free)
 		alloc_nid_failed(sbi, ino);
@@ -82,7 +86,7 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
 	int ret;
 
 	if (sublen > slen)
-		return 1;
+		return 0;
 
 	ret = memcmp(s + slen - sublen, sub, sublen);
 	if (ret) {	/* compare upper case */
@@ -90,16 +94,16 @@ static int is_multimedia_file(const unsigned char *s, const char *sub)
 		char upper_sub[8];
 		for (i = 0; i < sublen && i < sizeof(upper_sub); i++)
 			upper_sub[i] = toupper(sub[i]);
-		return memcmp(s + slen - sublen, upper_sub, sublen);
+		return !memcmp(s + slen - sublen, upper_sub, sublen);
 	}
 
-	return ret;
+	return !ret;
 }
 
 /*
  * Set multimedia files as cold files for hot/cold data separation
  */
-static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode,
+static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
 		const unsigned char *name)
 {
 	int i;
@@ -107,8 +111,8 @@ static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode,
 
 	int count = le32_to_cpu(sbi->raw_super->extension_count);
 	for (i = 0; i < count; i++) {
-		if (!is_multimedia_file(name, extlist[i])) {
-			F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT;
+		if (is_multimedia_file(name, extlist[i])) {
+			file_set_cold(inode);
 			break;
 		}
 	}
@@ -121,7 +125,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 	struct inode *inode;
 	nid_t ino = 0;
-	int err;
+	int err, ilock;
 
 	f2fs_balance_fs(sbi);
 
@@ -130,26 +134,28 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		return PTR_ERR(inode);
 
 	if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
-		set_cold_file(sbi, inode, dentry->d_name.name);
+		set_cold_files(sbi, inode, dentry->d_name.name);
 
 	inode->i_op = &f2fs_file_inode_operations;
 	inode->i_fop = &f2fs_file_operations;
 	inode->i_mapping->a_ops = &f2fs_dblock_aops;
 	ino = inode->i_ino;
 
+	ilock = mutex_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
+	mutex_unlock_op(sbi, ilock);
 	if (err)
 		goto out;
 
 	alloc_nid_done(sbi, ino);
 
-	if (!sbi->por_doing)
-		d_instantiate(dentry, inode);
+	d_instantiate(dentry, inode);
 	unlock_new_inode(inode);
 	return 0;
 out:
 	clear_nlink(inode);
 	unlock_new_inode(inode);
+	make_bad_inode(inode);
 	iput(inode);
 	alloc_nid_failed(sbi, ino);
 	return err;
@@ -161,15 +167,17 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
 	struct inode *inode = old_dentry->d_inode;
 	struct super_block *sb = dir->i_sb;
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
-	int err;
+	int err, ilock;
 
 	f2fs_balance_fs(sbi);
 
 	inode->i_ctime = CURRENT_TIME;
-	atomic_inc(&inode->i_count);
+	ihold(inode);
 
 	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+	ilock = mutex_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
+	mutex_unlock_op(sbi, ilock);
 	if (err)
 		goto out;
 
@@ -197,7 +205,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 	struct f2fs_dir_entry *de;
 	struct page *page;
 
-	if (dentry->d_name.len > F2FS_MAX_NAME_LEN)
+	if (dentry->d_name.len > F2FS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
 	de = f2fs_find_entry(dir, &dentry->d_name, &page);
@@ -222,7 +230,9 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
 	struct f2fs_dir_entry *de;
 	struct page *page;
 	int err = -ENOENT;
+	int ilock;
 
+	trace_f2fs_unlink_enter(dir, dentry);
 	f2fs_balance_fs(sbi);
 
 	de = f2fs_find_entry(dir, &dentry->d_name, &page);
@@ -236,11 +246,14 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
 		goto fail;
 	}
 
+	ilock = mutex_lock_op(sbi);
 	f2fs_delete_entry(de, page, inode);
+	mutex_unlock_op(sbi, ilock);
 
 	/* In order to evict this inode,  we set it dirty */
 	mark_inode_dirty(inode);
 fail:
+	trace_f2fs_unlink_exit(inode, err);
 	return err;
 }
 
@@ -251,7 +264,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 	struct inode *inode;
 	size_t symlen = strlen(symname) + 1;
-	int err;
+	int err, ilock;
 
 	f2fs_balance_fs(sbi);
 
@@ -262,7 +275,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_op = &f2fs_symlink_inode_operations;
 	inode->i_mapping->a_ops = &f2fs_dblock_aops;
 
+	ilock = mutex_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
+	mutex_unlock_op(sbi, ilock);
 	if (err)
 		goto out;
 
@@ -275,6 +290,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 out:
 	clear_nlink(inode);
 	unlock_new_inode(inode);
+	make_bad_inode(inode);
 	iput(inode);
 	alloc_nid_failed(sbi, inode->i_ino);
 	return err;
@@ -284,7 +300,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
 	struct inode *inode;
-	int err;
+	int err, ilock;
 
 	f2fs_balance_fs(sbi);
 
@@ -298,7 +314,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
 
 	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+	ilock = mutex_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
+	mutex_unlock_op(sbi, ilock);
 	if (err)
 		goto out_fail;
 
@@ -313,6 +331,7 @@ out_fail:
 	clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
 	clear_nlink(inode);
 	unlock_new_inode(inode);
+	make_bad_inode(inode);
 	iput(inode);
 	alloc_nid_failed(sbi, inode->i_ino);
 	return err;
@@ -333,6 +352,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 	struct inode *inode;
 	int err = 0;
+	int ilock;
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
@@ -346,7 +366,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
 	init_special_inode(inode, inode->i_mode, rdev);
 	inode->i_op = &f2fs_special_inode_operations;
 
+	ilock = mutex_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
+	mutex_unlock_op(sbi, ilock);
 	if (err)
 		goto out;
 
@@ -357,6 +379,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
 out:
 	clear_nlink(inode);
 	unlock_new_inode(inode);
+	make_bad_inode(inode);
 	iput(inode);
 	alloc_nid_failed(sbi, inode->i_ino);
 	return err;
@@ -374,7 +397,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct f2fs_dir_entry *old_dir_entry = NULL;
 	struct f2fs_dir_entry *old_entry;
 	struct f2fs_dir_entry *new_entry;
-	int err = -ENOENT;
+	int err = -ENOENT, ilock = -1;
 
 	f2fs_balance_fs(sbi);
 
@@ -389,7 +412,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto out_old;
 	}
 
-	mutex_lock_op(sbi, RENAME);
+	ilock = mutex_lock_op(sbi);
 
 	if (new_inode) {
 		struct page *new_page;
@@ -412,7 +435,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		drop_nlink(new_inode);
 		if (!new_inode->i_nlink)
 			add_orphan_inode(sbi, new_inode->i_ino);
-		f2fs_write_inode(new_inode, NULL);
+		update_inode_page(new_inode);
 	} else {
 		err = f2fs_add_link(new_dentry, old_inode);
 		if (err)
@@ -420,12 +443,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 		if (old_dir_entry) {
 			inc_nlink(new_dir);
-			f2fs_write_inode(new_dir, NULL);
+			update_inode_page(new_dir);
 		}
 	}
 
 	old_inode->i_ctime = CURRENT_TIME;
-	set_inode_flag(F2FS_I(old_inode), FI_NEED_CP);
 	mark_inode_dirty(old_inode);
 
 	f2fs_delete_entry(old_entry, old_page, NULL);
@@ -439,10 +461,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			f2fs_put_page(old_dir_page, 0);
 		}
 		drop_nlink(old_dir);
-		f2fs_write_inode(old_dir, NULL);
+		update_inode_page(old_dir);
 	}
 
-	mutex_unlock_op(sbi, RENAME);
+	mutex_unlock_op(sbi, ilock);
 	return 0;
 
 out_dir:
@@ -450,7 +472,7 @@ out_dir:
 		kunmap(old_dir_page);
 		f2fs_put_page(old_dir_page, 0);
 	}
-	mutex_unlock_op(sbi, RENAME);
+	mutex_unlock_op(sbi, ilock);
 out_old:
 	kunmap(old_page);
 	f2fs_put_page(old_page, 0);
@@ -468,6 +490,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
 	.rmdir		= f2fs_rmdir,
 	.mknod		= f2fs_mknod,
 	.rename		= f2fs_rename,
+	.getattr	= f2fs_getattr,
 	.setattr	= f2fs_setattr,
 	.get_acl	= f2fs_get_acl,
 #ifdef CONFIG_F2FS_FS_XATTR
@@ -482,6 +505,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
 	.readlink       = generic_readlink,
 	.follow_link    = page_follow_link_light,
 	.put_link       = page_put_link,
+	.getattr	= f2fs_getattr,
 	.setattr	= f2fs_setattr,
 #ifdef CONFIG_F2FS_FS_XATTR
 	.setxattr	= generic_setxattr,
@@ -492,6 +516,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
 };
 
 const struct inode_operations f2fs_special_inode_operations = {
+	.getattr	= f2fs_getattr,
 	.setattr        = f2fs_setattr,
 	.get_acl	= f2fs_get_acl,
 #ifdef CONFIG_F2FS_FS_XATTR
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 9bda63c9c16..b418aee0957 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -19,6 +19,7 @@
 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
+#include <trace/events/f2fs.h>
 
 static struct kmem_cache *nat_entry_slab;
 static struct kmem_cache *free_nid_slab;
@@ -88,10 +89,13 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
 {
 	struct address_space *mapping = sbi->meta_inode->i_mapping;
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct blk_plug plug;
 	struct page *page;
 	pgoff_t index;
 	int i;
 
+	blk_start_plug(&plug);
+
 	for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
 		if (nid >= nm_i->max_nid)
 			nid = 0;
@@ -100,12 +104,16 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
 		page = grab_cache_page(mapping, index);
 		if (!page)
 			continue;
-		if (f2fs_readpage(sbi, page, index, READ)) {
+		if (PageUptodate(page)) {
 			f2fs_put_page(page, 1);
 			continue;
 		}
-		page_cache_release(page);
+		if (f2fs_readpage(sbi, page, index, READ))
+			continue;
+
+		f2fs_put_page(page, 0);
 	}
+	blk_finish_plug(&plug);
 }
 
 static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
@@ -236,7 +244,7 @@ static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
 {
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 
-	if (nm_i->nat_cnt < 2 * NM_WOUT_THRESHOLD)
+	if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD)
 		return 0;
 
 	write_lock(&nm_i->nat_tree_lock);
@@ -320,15 +328,14 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
 	noffset[0] = 0;
 
 	if (block < direct_index) {
-		offset[n++] = block;
-		level = 0;
+		offset[n] = block;
 		goto got;
 	}
 	block -= direct_index;
 	if (block < direct_blks) {
 		offset[n++] = NODE_DIR1_BLOCK;
 		noffset[n] = 1;
-		offset[n++] = block;
+		offset[n] = block;
 		level = 1;
 		goto got;
 	}
@@ -336,7 +343,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
 	if (block < direct_blks) {
 		offset[n++] = NODE_DIR2_BLOCK;
 		noffset[n] = 2;
-		offset[n++] = block;
+		offset[n] = block;
 		level = 1;
 		goto got;
 	}
@@ -346,7 +353,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
 		noffset[n] = 3;
 		offset[n++] = block / direct_blks;
 		noffset[n] = 4 + offset[n - 1];
-		offset[n++] = block % direct_blks;
+		offset[n] = block % direct_blks;
 		level = 2;
 		goto got;
 	}
@@ -356,7 +363,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
 		noffset[n] = 4 + dptrs_per_blk;
 		offset[n++] = block / direct_blks;
 		noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
-		offset[n++] = block % direct_blks;
+		offset[n] = block % direct_blks;
 		level = 2;
 		goto got;
 	}
@@ -371,7 +378,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4])
 		noffset[n] = 7 + (dptrs_per_blk * 2) +
 			      offset[n - 2] * (dptrs_per_blk + 1) +
 			      offset[n - 1];
-		offset[n++] = block % direct_blks;
+		offset[n] = block % direct_blks;
 		level = 3;
 		goto got;
 	} else {
@@ -383,8 +390,11 @@ got:
 
 /*
  * Caller should call f2fs_put_dnode(dn).
+ * Also, it should grab and release a mutex by calling mutex_lock_op() and
+ * mutex_unlock_op() only if ro is not set RDONLY_NODE.
+ * In the case of RDONLY_NODE, we don't need to care about mutex.
  */
-int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
+int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
 	struct page *npage[4];
@@ -398,12 +408,16 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
 	level = get_node_path(index, offset, noffset);
 
 	nids[0] = dn->inode->i_ino;
-	npage[0] = get_node_page(sbi, nids[0]);
-	if (IS_ERR(npage[0]))
-		return PTR_ERR(npage[0]);
+	npage[0] = dn->inode_page;
 
+	if (!npage[0]) {
+		npage[0] = get_node_page(sbi, nids[0]);
+		if (IS_ERR(npage[0]))
+			return PTR_ERR(npage[0]);
+	}
 	parent = npage[0];
-	nids[1] = get_nid(parent, offset[0], true);
+	if (level != 0)
+		nids[1] = get_nid(parent, offset[0], true);
 	dn->inode_page = npage[0];
 	dn->inode_page_locked = true;
 
@@ -411,30 +425,25 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
 	for (i = 1; i <= level; i++) {
 		bool done = false;
 
-		if (!nids[i] && !ro) {
-			mutex_lock_op(sbi, NODE_NEW);
-
+		if (!nids[i] && mode == ALLOC_NODE) {
 			/* alloc new node */
 			if (!alloc_nid(sbi, &(nids[i]))) {
-				mutex_unlock_op(sbi, NODE_NEW);
 				err = -ENOSPC;
 				goto release_pages;
 			}
 
 			dn->nid = nids[i];
-			npage[i] = new_node_page(dn, noffset[i]);
+			npage[i] = new_node_page(dn, noffset[i], NULL);
 			if (IS_ERR(npage[i])) {
 				alloc_nid_failed(sbi, nids[i]);
-				mutex_unlock_op(sbi, NODE_NEW);
 				err = PTR_ERR(npage[i]);
 				goto release_pages;
 			}
 
 			set_nid(parent, offset[i - 1], nids[i], i == 1);
 			alloc_nid_done(sbi, nids[i]);
-			mutex_unlock_op(sbi, NODE_NEW);
 			done = true;
-		} else if (ro && i == level && level > 1) {
+		} else if (mode == LOOKUP_NODE_RA && i == level && level > 1) {
 			npage[i] = get_node_page_ra(parent, offset[i - 1]);
 			if (IS_ERR(npage[i])) {
 				err = PTR_ERR(npage[i]);
@@ -507,6 +516,7 @@ invalidate:
 
 	f2fs_put_page(dn->node_page, 1);
 	dn->node_page = NULL;
+	trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
 }
 
 static int truncate_dnode(struct dnode_of_data *dn)
@@ -547,9 +557,13 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
 	if (dn->nid == 0)
 		return NIDS_PER_BLOCK + 1;
 
+	trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
+
 	page = get_node_page(sbi, dn->nid);
-	if (IS_ERR(page))
+	if (IS_ERR(page)) {
+		trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
 		return PTR_ERR(page);
+	}
 
 	rn = (struct f2fs_node *)page_address(page);
 	if (depth < 3) {
@@ -591,10 +605,12 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
 	} else {
 		f2fs_put_page(page, 1);
 	}
+	trace_f2fs_truncate_nodes_exit(dn->inode, freed);
 	return freed;
 
 out_err:
 	f2fs_put_page(page, 1);
+	trace_f2fs_truncate_nodes_exit(dn->inode, ret);
 	return ret;
 }
 
@@ -649,6 +665,9 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
 fail:
 	for (i = depth - 3; i >= 0; i--)
 		f2fs_put_page(pages[i], 1);
+
+	trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
+
 	return err;
 }
 
@@ -658,18 +677,23 @@ fail:
 int truncate_inode_blocks(struct inode *inode, pgoff_t from)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct address_space *node_mapping = sbi->node_inode->i_mapping;
 	int err = 0, cont = 1;
 	int level, offset[4], noffset[4];
-	unsigned int nofs;
+	unsigned int nofs = 0;
 	struct f2fs_node *rn;
 	struct dnode_of_data dn;
 	struct page *page;
 
-	level = get_node_path(from, offset, noffset);
+	trace_f2fs_truncate_inode_blocks_enter(inode, from);
 
+	level = get_node_path(from, offset, noffset);
+restart:
 	page = get_node_page(sbi, inode->i_ino);
-	if (IS_ERR(page))
+	if (IS_ERR(page)) {
+		trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
 		return PTR_ERR(page);
+	}
 
 	set_new_dnode(&dn, inode, page, NULL, 0);
 	unlock_page(page);
@@ -728,6 +752,10 @@ skip_partial:
 		if (offset[1] == 0 &&
 				rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) {
 			lock_page(page);
+			if (page->mapping != node_mapping) {
+				f2fs_put_page(page, 1);
+				goto restart;
+			}
 			wait_on_page_writeback(page);
 			rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
 			set_page_dirty(page);
@@ -739,9 +767,14 @@ skip_partial:
 	}
 fail:
 	f2fs_put_page(page, 0);
+	trace_f2fs_truncate_inode_blocks_exit(inode, err);
 	return err > 0 ? 0 : err;
 }
 
+/*
+ * Caller should grab and release a mutex by calling mutex_lock_op() and
+ * mutex_unlock_op().
+ */
 int remove_inode_page(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -749,21 +782,16 @@ int remove_inode_page(struct inode *inode)
 	nid_t ino = inode->i_ino;
 	struct dnode_of_data dn;
 
-	mutex_lock_op(sbi, NODE_TRUNC);
 	page = get_node_page(sbi, ino);
-	if (IS_ERR(page)) {
-		mutex_unlock_op(sbi, NODE_TRUNC);
+	if (IS_ERR(page))
 		return PTR_ERR(page);
-	}
 
 	if (F2FS_I(inode)->i_xattr_nid) {
 		nid_t nid = F2FS_I(inode)->i_xattr_nid;
 		struct page *npage = get_node_page(sbi, nid);
 
-		if (IS_ERR(npage)) {
-			mutex_unlock_op(sbi, NODE_TRUNC);
+		if (IS_ERR(npage))
 			return PTR_ERR(npage);
-		}
 
 		F2FS_I(inode)->i_xattr_nid = 0;
 		set_new_dnode(&dn, inode, page, npage, nid);
@@ -775,30 +803,22 @@ int remove_inode_page(struct inode *inode)
 	BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1);
 	set_new_dnode(&dn, inode, page, page, ino);
 	truncate_node(&dn);
-
-	mutex_unlock_op(sbi, NODE_TRUNC);
 	return 0;
 }
 
-int new_inode_page(struct inode *inode, struct dentry *dentry)
+struct page *new_inode_page(struct inode *inode, const struct qstr *name)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	struct page *page;
 	struct dnode_of_data dn;
 
 	/* allocate inode page for new inode */
 	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
-	mutex_lock_op(sbi, NODE_NEW);
-	page = new_node_page(&dn, 0);
-	init_dent_inode(dentry, page);
-	mutex_unlock_op(sbi, NODE_NEW);
-	if (IS_ERR(page))
-		return PTR_ERR(page);
-	f2fs_put_page(page, 1);
-	return 0;
+
+	/* caller should f2fs_put_page(page, 1); */
+	return new_node_page(&dn, 0, NULL);
 }
 
-struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
+struct page *new_node_page(struct dnode_of_data *dn,
+				unsigned int ofs, struct page *ipage)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
 	struct address_space *mapping = sbi->node_inode->i_mapping;
@@ -831,7 +851,10 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
 	set_cold_node(dn->inode, page);
 
 	dn->node_page = page;
-	sync_inode_page(dn);
+	if (ipage)
+		update_inode(dn->inode, ipage);
+	else
+		sync_inode_page(dn);
 	set_page_dirty(page);
 	if (ofs == 0)
 		inc_valid_inode_count(sbi);
@@ -844,6 +867,12 @@ fail:
 	return ERR_PTR(err);
 }
 
+/*
+ * Caller should do after getting the following values.
+ * 0: f2fs_put_page(page, 0)
+ * LOCKED_PAGE: f2fs_put_page(page, 1)
+ * error: nothing
+ */
 static int read_node_page(struct page *page, int type)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
@@ -851,8 +880,14 @@ static int read_node_page(struct page *page, int type)
 
 	get_node_info(sbi, page->index, &ni);
 
-	if (ni.blk_addr == NULL_ADDR)
+	if (ni.blk_addr == NULL_ADDR) {
+		f2fs_put_page(page, 1);
 		return -ENOENT;
+	}
+
+	if (PageUptodate(page))
+		return LOCKED_PAGE;
+
 	return f2fs_readpage(sbi, page, ni.blk_addr, type);
 }
 
@@ -863,44 +898,53 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
 {
 	struct address_space *mapping = sbi->node_inode->i_mapping;
 	struct page *apage;
+	int err;
 
 	apage = find_get_page(mapping, nid);
-	if (apage && PageUptodate(apage))
-		goto release_out;
+	if (apage && PageUptodate(apage)) {
+		f2fs_put_page(apage, 0);
+		return;
+	}
 	f2fs_put_page(apage, 0);
 
 	apage = grab_cache_page(mapping, nid);
 	if (!apage)
 		return;
 
-	if (read_node_page(apage, READA))
-		goto unlock_out;
-
-	page_cache_release(apage);
+	err = read_node_page(apage, READA);
+	if (err == 0)
+		f2fs_put_page(apage, 0);
+	else if (err == LOCKED_PAGE)
+		f2fs_put_page(apage, 1);
 	return;
-
-unlock_out:
-	unlock_page(apage);
-release_out:
-	page_cache_release(apage);
 }
 
 struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
 {
-	int err;
-	struct page *page;
 	struct address_space *mapping = sbi->node_inode->i_mapping;
-
+	struct page *page;
+	int err;
+repeat:
 	page = grab_cache_page(mapping, nid);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
 	err = read_node_page(page, READ_SYNC);
-	if (err) {
-		f2fs_put_page(page, 1);
+	if (err < 0)
 		return ERR_PTR(err);
-	}
+	else if (err == LOCKED_PAGE)
+		goto got_it;
 
+	lock_page(page);
+	if (!PageUptodate(page)) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(-EIO);
+	}
+	if (page->mapping != mapping) {
+		f2fs_put_page(page, 1);
+		goto repeat;
+	}
+got_it:
 	BUG_ON(nid != nid_of_node(page));
 	mark_page_accessed(page);
 	return page;
@@ -914,31 +958,27 @@ struct page *get_node_page_ra(struct page *parent, int start)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
 	struct address_space *mapping = sbi->node_inode->i_mapping;
-	int i, end;
-	int err = 0;
-	nid_t nid;
+	struct blk_plug plug;
 	struct page *page;
+	int err, i, end;
+	nid_t nid;
 
 	/* First, try getting the desired direct node. */
 	nid = get_nid(parent, start, false);
 	if (!nid)
 		return ERR_PTR(-ENOENT);
-
-	page = find_get_page(mapping, nid);
-	if (page && PageUptodate(page))
-		goto page_hit;
-	f2fs_put_page(page, 0);
-
 repeat:
 	page = grab_cache_page(mapping, nid);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
-	err = read_node_page(page, READA);
-	if (err) {
-		f2fs_put_page(page, 1);
+	err = read_node_page(page, READ_SYNC);
+	if (err < 0)
 		return ERR_PTR(err);
-	}
+	else if (err == LOCKED_PAGE)
+		goto page_hit;
+
+	blk_start_plug(&plug);
 
 	/* Then, try readahead for siblings of the desired node */
 	end = start + MAX_RA_NODE;
@@ -950,18 +990,19 @@ repeat:
 		ra_node_page(sbi, nid);
 	}
 
-page_hit:
-	lock_page(page);
-	if (PageError(page)) {
-		f2fs_put_page(page, 1);
-		return ERR_PTR(-EIO);
-	}
+	blk_finish_plug(&plug);
 
-	/* Has the page been truncated? */
+	lock_page(page);
 	if (page->mapping != mapping) {
 		f2fs_put_page(page, 1);
 		goto repeat;
 	}
+page_hit:
+	if (!PageUptodate(page)) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(-EIO);
+	}
+	mark_page_accessed(page);
 	return page;
 }
 
@@ -976,7 +1017,7 @@ void sync_inode_page(struct dnode_of_data *dn)
 		if (!dn->inode_page_locked)
 			unlock_page(dn->inode_page);
 	} else {
-		f2fs_write_inode(dn->inode, NULL);
+		update_inode_page(dn->inode);
 	}
 }
 
@@ -1091,17 +1132,8 @@ static int f2fs_write_node_page(struct page *page,
 	block_t new_addr;
 	struct node_info ni;
 
-	if (wbc->for_reclaim) {
-		dec_page_count(sbi, F2FS_DIRTY_NODES);
-		wbc->pages_skipped++;
-		set_page_dirty(page);
-		return AOP_WRITEPAGE_ACTIVATE;
-	}
-
 	wait_on_page_writeback(page);
 
-	mutex_lock_op(sbi, NODE_WRITE);
-
 	/* get old block addr of this node page */
 	nid = nid_of_node(page);
 	BUG_ON(page->index != nid);
@@ -1109,17 +1141,25 @@ static int f2fs_write_node_page(struct page *page,
 	get_node_info(sbi, nid, &ni);
 
 	/* This page is already truncated */
-	if (ni.blk_addr == NULL_ADDR)
+	if (ni.blk_addr == NULL_ADDR) {
+		dec_page_count(sbi, F2FS_DIRTY_NODES);
+		unlock_page(page);
 		return 0;
+	}
 
-	set_page_writeback(page);
+	if (wbc->for_reclaim) {
+		dec_page_count(sbi, F2FS_DIRTY_NODES);
+		wbc->pages_skipped++;
+		set_page_dirty(page);
+		return AOP_WRITEPAGE_ACTIVATE;
+	}
 
-	/* insert node offset */
+	mutex_lock(&sbi->node_write);
+	set_page_writeback(page);
 	write_node_page(sbi, page, nid, ni.blk_addr, &new_addr);
 	set_node_addr(sbi, &ni, new_addr);
 	dec_page_count(sbi, F2FS_DIRTY_NODES);
-
-	mutex_unlock_op(sbi, NODE_WRITE);
+	mutex_unlock(&sbi->node_write);
 	unlock_page(page);
 	return 0;
 }
@@ -1134,12 +1174,11 @@ static int f2fs_write_node_pages(struct address_space *mapping,
 			    struct writeback_control *wbc)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
-	struct block_device *bdev = sbi->sb->s_bdev;
 	long nr_to_write = wbc->nr_to_write;
 
 	/* First check balancing cached NAT entries */
 	if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) {
-		write_checkpoint(sbi, false, false);
+		f2fs_sync_fs(sbi->sb, true);
 		return 0;
 	}
 
@@ -1148,10 +1187,9 @@ static int f2fs_write_node_pages(struct address_space *mapping,
 		return 0;
 
 	/* if mounting is failed, skip writing node pages */
-	wbc->nr_to_write = bio_get_nr_vecs(bdev);
+	wbc->nr_to_write = max_hw_blocks(sbi);
 	sync_node_pages(sbi, 0, wbc);
-	wbc->nr_to_write = nr_to_write -
-		(bio_get_nr_vecs(bdev) - wbc->nr_to_write);
+	wbc->nr_to_write = nr_to_write - (max_hw_blocks(sbi) - wbc->nr_to_write);
 	return 0;
 }
 
@@ -1170,7 +1208,8 @@ static int f2fs_set_node_page_dirty(struct page *page)
 	return 0;
 }
 
-static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
+static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
+				      unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -1182,7 +1221,7 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
 static int f2fs_release_node_page(struct page *page, gfp_t wait)
 {
 	ClearPagePrivate(page);
-	return 0;
+	return 1;
 }
 
 /*
@@ -1199,14 +1238,13 @@ const struct address_space_operations f2fs_node_aops = {
 static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head)
 {
 	struct list_head *this;
-	struct free_nid *i = NULL;
+	struct free_nid *i;
 	list_for_each(this, head) {
 		i = list_entry(this, struct free_nid, list);
 		if (i->nid == n)
-			break;
-		i = NULL;
+			return i;
 	}
-	return i;
+	return NULL;
 }
 
 static void __del_from_free_nid_list(struct free_nid *i)
@@ -1215,11 +1253,29 @@ static void __del_from_free_nid_list(struct free_nid *i)
 	kmem_cache_free(free_nid_slab, i);
 }
 
-static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
+static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
 {
 	struct free_nid *i;
+	struct nat_entry *ne;
+	bool allocated = false;
 
 	if (nm_i->fcnt > 2 * MAX_FREE_NIDS)
+		return -1;
+
+	/* 0 nid should not be used */
+	if (nid == 0)
+		return 0;
+
+	if (!build)
+		goto retry;
+
+	/* do not add allocated nids */
+	read_lock(&nm_i->nat_tree_lock);
+	ne = __lookup_nat_cache(nm_i, nid);
+	if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
+		allocated = true;
+	read_unlock(&nm_i->nat_tree_lock);
+	if (allocated)
 		return 0;
 retry:
 	i = kmem_cache_alloc(free_nid_slab, GFP_NOFS);
@@ -1254,63 +1310,59 @@ static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
 	spin_unlock(&nm_i->free_nid_list_lock);
 }
 
-static int scan_nat_page(struct f2fs_nm_info *nm_i,
+static void scan_nat_page(struct f2fs_nm_info *nm_i,
 			struct page *nat_page, nid_t start_nid)
 {
 	struct f2fs_nat_block *nat_blk = page_address(nat_page);
 	block_t blk_addr;
-	int fcnt = 0;
 	int i;
 
-	/* 0 nid should not be used */
-	if (start_nid == 0)
-		++start_nid;
-
 	i = start_nid % NAT_ENTRY_PER_BLOCK;
 
 	for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
-		blk_addr  = le32_to_cpu(nat_blk->entries[i].block_addr);
+
+		if (start_nid >= nm_i->max_nid)
+			break;
+
+		blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
 		BUG_ON(blk_addr == NEW_ADDR);
-		if (blk_addr == NULL_ADDR)
-			fcnt += add_free_nid(nm_i, start_nid);
+		if (blk_addr == NULL_ADDR) {
+			if (add_free_nid(nm_i, start_nid, true) < 0)
+				break;
+		}
 	}
-	return fcnt;
 }
 
 static void build_free_nids(struct f2fs_sb_info *sbi)
 {
-	struct free_nid *fnid, *next_fnid;
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
 	struct f2fs_summary_block *sum = curseg->sum_blk;
-	nid_t nid = 0;
-	bool is_cycled = false;
-	int fcnt = 0;
-	int i;
+	int i = 0;
+	nid_t nid = nm_i->next_scan_nid;
 
-	nid = nm_i->next_scan_nid;
-	nm_i->init_scan_nid = nid;
+	/* Enough entries */
+	if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK)
+		return;
 
+	/* readahead nat pages to be scanned */
 	ra_nat_pages(sbi, nid);
 
 	while (1) {
 		struct page *page = get_current_nat_page(sbi, nid);
 
-		fcnt += scan_nat_page(nm_i, page, nid);
+		scan_nat_page(nm_i, page, nid);
 		f2fs_put_page(page, 1);
 
 		nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
-
-		if (nid >= nm_i->max_nid) {
+		if (nid >= nm_i->max_nid)
 			nid = 0;
-			is_cycled = true;
-		}
-		if (fcnt > MAX_FREE_NIDS)
-			break;
-		if (is_cycled && nm_i->init_scan_nid <= nid)
+
+		if (i++ == FREE_NID_PAGES)
 			break;
 	}
 
+	/* go to the next free nat pages to find free nids abundantly */
 	nm_i->next_scan_nid = nid;
 
 	/* find free nids from current sum_pages */
@@ -1319,22 +1371,11 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
 		block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
 		nid = le32_to_cpu(nid_in_journal(sum, i));
 		if (addr == NULL_ADDR)
-			add_free_nid(nm_i, nid);
+			add_free_nid(nm_i, nid, true);
 		else
 			remove_free_nid(nm_i, nid);
 	}
 	mutex_unlock(&curseg->curseg_mutex);
-
-	/* remove the free nids from current allocated nids */
-	list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) {
-		struct nat_entry *ne;
-
-		read_lock(&nm_i->nat_tree_lock);
-		ne = __lookup_nat_cache(nm_i, fnid->nid);
-		if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
-			remove_free_nid(nm_i, fnid->nid);
-		read_unlock(&nm_i->nat_tree_lock);
-	}
 }
 
 /*
@@ -1348,41 +1389,36 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
 	struct free_nid *i = NULL;
 	struct list_head *this;
 retry:
-	mutex_lock(&nm_i->build_lock);
-	if (!nm_i->fcnt) {
-		/* scan NAT in order to build free nid list */
-		build_free_nids(sbi);
-		if (!nm_i->fcnt) {
-			mutex_unlock(&nm_i->build_lock);
-			return false;
-		}
-	}
-	mutex_unlock(&nm_i->build_lock);
+	if (sbi->total_valid_node_count + 1 >= nm_i->max_nid)
+		return false;
 
-	/*
-	 * We check fcnt again since previous check is racy as
-	 * we didn't hold free_nid_list_lock. So other thread
-	 * could consume all of free nids.
-	 */
 	spin_lock(&nm_i->free_nid_list_lock);
-	if (!nm_i->fcnt) {
-		spin_unlock(&nm_i->free_nid_list_lock);
-		goto retry;
-	}
 
-	BUG_ON(list_empty(&nm_i->free_nid_list));
-	list_for_each(this, &nm_i->free_nid_list) {
-		i = list_entry(this, struct free_nid, list);
-		if (i->state == NID_NEW)
-			break;
-	}
+	/* We should not use stale free nids created by build_free_nids */
+	if (nm_i->fcnt && !sbi->on_build_free_nids) {
+		BUG_ON(list_empty(&nm_i->free_nid_list));
+		list_for_each(this, &nm_i->free_nid_list) {
+			i = list_entry(this, struct free_nid, list);
+			if (i->state == NID_NEW)
+				break;
+		}
 
-	BUG_ON(i->state != NID_NEW);
-	*nid = i->nid;
-	i->state = NID_ALLOC;
-	nm_i->fcnt--;
+		BUG_ON(i->state != NID_NEW);
+		*nid = i->nid;
+		i->state = NID_ALLOC;
+		nm_i->fcnt--;
+		spin_unlock(&nm_i->free_nid_list_lock);
+		return true;
+	}
 	spin_unlock(&nm_i->free_nid_list_lock);
-	return true;
+
+	/* Let's scan nat pages and its caches to get free nids */
+	mutex_lock(&nm_i->build_lock);
+	sbi->on_build_free_nids = 1;
+	build_free_nids(sbi);
+	sbi->on_build_free_nids = 0;
+	mutex_unlock(&nm_i->build_lock);
+	goto retry;
 }
 
 /*
@@ -1395,10 +1431,8 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
 
 	spin_lock(&nm_i->free_nid_list_lock);
 	i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
-	if (i) {
-		BUG_ON(i->state != NID_ALLOC);
-		__del_from_free_nid_list(i);
-	}
+	BUG_ON(!i || i->state != NID_ALLOC);
+	__del_from_free_nid_list(i);
 	spin_unlock(&nm_i->free_nid_list_lock);
 }
 
@@ -1407,8 +1441,19 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
  */
 void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
 {
-	alloc_nid_done(sbi, nid);
-	add_free_nid(NM_I(sbi), nid);
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	struct free_nid *i;
+
+	spin_lock(&nm_i->free_nid_list_lock);
+	i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+	BUG_ON(!i || i->state != NID_ALLOC);
+	if (nm_i->fcnt > 2 * MAX_FREE_NIDS) {
+		__del_from_free_nid_list(i);
+	} else {
+		i->state = NID_NEW;
+		nm_i->fcnt++;
+	}
+	spin_unlock(&nm_i->free_nid_list_lock);
 }
 
 void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
@@ -1451,9 +1496,10 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
 	new_ni = old_ni;
 	new_ni.ino = ino;
 
+	if (!inc_valid_node_count(sbi, NULL, 1))
+		WARN_ON(1);
 	set_node_addr(sbi, &new_ni, NEW_ADDR);
 	inc_valid_inode_count(sbi);
-
 	f2fs_put_page(ipage, 1);
 	return 0;
 }
@@ -1479,23 +1525,24 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
 	sum_entry = &sum->entries[0];
 
 	for (i = 0; i < last_offset; i++, sum_entry++) {
+		/*
+		 * In order to read next node page,
+		 * we must clear PageUptodate flag.
+		 */
+		ClearPageUptodate(page);
+
 		if (f2fs_readpage(sbi, page, addr, READ_SYNC))
 			goto out;
 
+		lock_page(page);
 		rn = (struct f2fs_node *)page_address(page);
 		sum_entry->nid = rn->footer.nid;
 		sum_entry->version = 0;
 		sum_entry->ofs_in_node = 0;
 		addr++;
-
-		/*
-		 * In order to read next node page,
-		 * we must clear PageUptodate flag.
-		 */
-		ClearPageUptodate(page);
 	}
-out:
 	unlock_page(page);
+out:
 	__free_pages(page, 0);
 	return 0;
 }
@@ -1618,13 +1665,11 @@ flush_now:
 			nid_in_journal(sum, offset) = cpu_to_le32(nid);
 		}
 
-		if (nat_get_blkaddr(ne) == NULL_ADDR) {
+		if (nat_get_blkaddr(ne) == NULL_ADDR &&
+				add_free_nid(NM_I(sbi), nid, false) <= 0) {
 			write_lock(&nm_i->nat_tree_lock);
 			__del_from_nat_cache(nm_i, ne);
 			write_unlock(&nm_i->nat_tree_lock);
-
-			/* We can reuse this freed nid at this point */
-			add_free_nid(NM_I(sbi), nid);
 		} else {
 			write_lock(&nm_i->nat_tree_lock);
 			__clear_nat_cache_dirty(nm_i, ne);
@@ -1665,19 +1710,16 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
 	spin_lock_init(&nm_i->free_nid_list_lock);
 	rwlock_init(&nm_i->nat_tree_lock);
 
-	nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
-	nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
 	nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
-
-	nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL);
-	if (!nm_i->nat_bitmap)
-		return -ENOMEM;
+	nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
 	version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
 	if (!version_bitmap)
 		return -EFAULT;
 
-	/* copy version bitmap */
-	memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size);
+	nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
+					GFP_KERNEL);
+	if (!nm_i->nat_bitmap)
+		return -ENOMEM;
 	return 0;
 }
 
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index afdb130f782..c65fb4f4230 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -29,6 +29,9 @@
 /* vector size for gang look-up from nat cache that consists of radix tree */
 #define NATVEC_SIZE	64
 
+/* return value for read_node_page */
+#define LOCKED_PAGE	1
+
 /*
  * For node information
  */
@@ -239,7 +242,7 @@ static inline bool IS_DNODE(struct page *node_page)
 		return false;
 	if (ofs >= 6 + 2 * NIDS_PER_BLOCK) {
 		ofs -= 6 + 2 * NIDS_PER_BLOCK;
-		if ((long int)ofs % (NIDS_PER_BLOCK + 1))
+		if (!((long int)ofs % (NIDS_PER_BLOCK + 1)))
 			return false;
 	}
 	return true;
@@ -272,11 +275,28 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
  *  - Mark cold node blocks in their node footer
  *  - Mark cold data pages in page cache
  */
-static inline int is_cold_file(struct inode *inode)
+static inline int is_file(struct inode *inode, int type)
+{
+	return F2FS_I(inode)->i_advise & type;
+}
+
+static inline void set_file(struct inode *inode, int type)
 {
-	return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT;
+	F2FS_I(inode)->i_advise |= type;
 }
 
+static inline void clear_file(struct inode *inode, int type)
+{
+	F2FS_I(inode)->i_advise &= ~type;
+}
+
+#define file_is_cold(inode)	is_file(inode, FADVISE_COLD_BIT)
+#define file_wrong_pino(inode)	is_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_set_cold(inode)	set_file(inode, FADVISE_COLD_BIT)
+#define file_lost_pino(inode)	set_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_clear_cold(inode)	clear_file(inode, FADVISE_COLD_BIT)
+#define file_got_pino(inode)	clear_file(inode, FADVISE_LOST_PINO_BIT)
+
 static inline int is_cold_data(struct page *page)
 {
 	return PageChecked(page);
@@ -292,29 +312,16 @@ static inline void clear_cold_data(struct page *page)
 	ClearPageChecked(page);
 }
 
-static inline int is_cold_node(struct page *page)
+static inline int is_node(struct page *page, int type)
 {
 	void *kaddr = page_address(page);
 	struct f2fs_node *rn = (struct f2fs_node *)kaddr;
-	unsigned int flag = le32_to_cpu(rn->footer.flag);
-	return flag & (0x1 << COLD_BIT_SHIFT);
-}
-
-static inline unsigned char is_fsync_dnode(struct page *page)
-{
-	void *kaddr = page_address(page);
-	struct f2fs_node *rn = (struct f2fs_node *)kaddr;
-	unsigned int flag = le32_to_cpu(rn->footer.flag);
-	return flag & (0x1 << FSYNC_BIT_SHIFT);
+	return le32_to_cpu(rn->footer.flag) & (1 << type);
 }
 
-static inline unsigned char is_dent_dnode(struct page *page)
-{
-	void *kaddr = page_address(page);
-	struct f2fs_node *rn = (struct f2fs_node *)kaddr;
-	unsigned int flag = le32_to_cpu(rn->footer.flag);
-	return flag & (0x1 << DENT_BIT_SHIFT);
-}
+#define is_cold_node(page)	is_node(page, COLD_BIT_SHIFT)
+#define is_fsync_dnode(page)	is_node(page, FSYNC_BIT_SHIFT)
+#define is_dent_dnode(page)	is_node(page, DENT_BIT_SHIFT)
 
 static inline void set_cold_node(struct inode *inode, struct page *page)
 {
@@ -328,26 +335,15 @@ static inline void set_cold_node(struct inode *inode, struct page *page)
 	rn->footer.flag = cpu_to_le32(flag);
 }
 
-static inline void set_fsync_mark(struct page *page, int mark)
-{
-	void *kaddr = page_address(page);
-	struct f2fs_node *rn = (struct f2fs_node *)kaddr;
-	unsigned int flag = le32_to_cpu(rn->footer.flag);
-	if (mark)
-		flag |= (0x1 << FSYNC_BIT_SHIFT);
-	else
-		flag &= ~(0x1 << FSYNC_BIT_SHIFT);
-	rn->footer.flag = cpu_to_le32(flag);
-}
-
-static inline void set_dentry_mark(struct page *page, int mark)
+static inline void set_mark(struct page *page, int mark, int type)
 {
-	void *kaddr = page_address(page);
-	struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+	struct f2fs_node *rn = (struct f2fs_node *)page_address(page);
 	unsigned int flag = le32_to_cpu(rn->footer.flag);
 	if (mark)
-		flag |= (0x1 << DENT_BIT_SHIFT);
+		flag |= (0x1 << type);
 	else
-		flag &= ~(0x1 << DENT_BIT_SHIFT);
+		flag &= ~(0x1 << type);
 	rn->footer.flag = cpu_to_le32(flag);
 }
+#define set_dentry_mark(page, mark)	set_mark(page, mark, DENT_BIT_SHIFT)
+#define set_fsync_mark(page, mark)	set_mark(page, mark, FSYNC_BIT_SHIFT)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index f42e4060b39..d56d951c225 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -40,38 +40,54 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
 
 static int recover_dentry(struct page *ipage, struct inode *inode)
 {
-	struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage);
+	void *kaddr = page_address(ipage);
+	struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
 	struct f2fs_inode *raw_inode = &(raw_node->i);
-	struct dentry dent, parent;
+	nid_t pino = le32_to_cpu(raw_inode->i_pino);
 	struct f2fs_dir_entry *de;
+	struct qstr name;
 	struct page *page;
-	struct inode *dir;
+	struct inode *dir, *einode;
 	int err = 0;
 
-	if (!is_dent_dnode(ipage))
-		goto out;
-
-	dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino));
-	if (IS_ERR(dir)) {
-		err = -EINVAL;
-		goto out;
+	dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino);
+	if (!dir) {
+		dir = f2fs_iget(inode->i_sb, pino);
+		if (IS_ERR(dir)) {
+			err = PTR_ERR(dir);
+			goto out;
+		}
+		set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
+		add_dirty_dir_inode(dir);
 	}
 
-	parent.d_inode = dir;
-	dent.d_parent = &parent;
-	dent.d_name.len = le32_to_cpu(raw_inode->i_namelen);
-	dent.d_name.name = raw_inode->i_name;
-
-	de = f2fs_find_entry(dir, &dent.d_name, &page);
-	if (de) {
+	name.len = le32_to_cpu(raw_inode->i_namelen);
+	name.name = raw_inode->i_name;
+retry:
+	de = f2fs_find_entry(dir, &name, &page);
+	if (de && inode->i_ino == le32_to_cpu(de->ino)) {
 		kunmap(page);
 		f2fs_put_page(page, 0);
-	} else {
-		err = f2fs_add_link(&dent, inode);
+		goto out;
 	}
-	iput(dir);
+	if (de) {
+		einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
+		if (IS_ERR(einode)) {
+			WARN_ON(1);
+			if (PTR_ERR(einode) == -ENOENT)
+				err = -EEXIST;
+			goto out;
+		}
+		f2fs_delete_entry(de, page, einode);
+		iput(einode);
+		goto retry;
+	}
+	err = __f2fs_add_link(dir, &name, inode);
 out:
-	kunmap(ipage);
+	f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: "
+			"ino = %x, name = %s, dir = %lx, err = %d",
+			ino_of_node(ipage), raw_inode->i_name,
+			IS_ERR(dir) ? 0 : dir->i_ino, err);
 	return err;
 }
 
@@ -81,6 +97,9 @@ static int recover_inode(struct inode *inode, struct page *node_page)
 	struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
 	struct f2fs_inode *raw_inode = &(raw_node->i);
 
+	if (!IS_INODE(node_page))
+		return 0;
+
 	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
 	i_size_write(inode, le64_to_cpu(raw_inode->i_size));
 	inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
@@ -90,7 +109,12 @@ static int recover_inode(struct inode *inode, struct page *node_page)
 	inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
 	inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
 
-	return recover_dentry(node_page, inode);
+	if (is_dent_dnode(node_page))
+		return recover_dentry(node_page, inode);
+
+	f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
+			ino_of_node(node_page), raw_inode->i_name);
+	return 0;
 }
 
 static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
@@ -114,64 +138,61 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
 	while (1) {
 		struct fsync_inode_entry *entry;
 
-		if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
+		err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC);
+		if (err)
 			goto out;
 
+		lock_page(page);
+
 		if (cp_ver != cpver_of_node(page))
-			goto out;
+			break;
 
 		if (!is_fsync_dnode(page))
 			goto next;
 
 		entry = get_fsync_inode(head, ino_of_node(page));
 		if (entry) {
-			entry->blkaddr = blkaddr;
 			if (IS_INODE(page) && is_dent_dnode(page))
 				set_inode_flag(F2FS_I(entry->inode),
 							FI_INC_LINK);
 		} else {
 			if (IS_INODE(page) && is_dent_dnode(page)) {
-				if (recover_inode_page(sbi, page)) {
-					err = -ENOMEM;
-					goto out;
-				}
+				err = recover_inode_page(sbi, page);
+				if (err)
+					break;
 			}
 
 			/* add this fsync inode to the list */
 			entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
 			if (!entry) {
 				err = -ENOMEM;
-				goto out;
+				break;
 			}
 
 			entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
 			if (IS_ERR(entry->inode)) {
 				err = PTR_ERR(entry->inode);
 				kmem_cache_free(fsync_entry_slab, entry);
-				goto out;
+				break;
 			}
-
 			list_add_tail(&entry->list, head);
-			entry->blkaddr = blkaddr;
-		}
-		if (IS_INODE(page)) {
-			err = recover_inode(entry->inode, page);
-			if (err)
-				goto out;
 		}
+		entry->blkaddr = blkaddr;
+
+		err = recover_inode(entry->inode, page);
+		if (err && err != -ENOENT)
+			break;
 next:
 		/* check next segment */
 		blkaddr = next_blkaddr_of_node(page);
-		ClearPageUptodate(page);
 	}
-out:
 	unlock_page(page);
+out:
 	__free_pages(page, 0);
 	return err;
 }
 
-static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
-					struct list_head *head)
+static void destroy_fsync_dnodes(struct list_head *head)
 {
 	struct fsync_inode_entry *entry, *tmp;
 
@@ -182,15 +203,15 @@ static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
 	}
 }
 
-static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
-						block_t blkaddr)
+static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
+			block_t blkaddr, struct dnode_of_data *dn)
 {
 	struct seg_entry *sentry;
 	unsigned int segno = GET_SEGNO(sbi, blkaddr);
 	unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
 					(sbi->blocks_per_seg - 1);
 	struct f2fs_summary sum;
-	nid_t ino;
+	nid_t ino, nid;
 	void *kaddr;
 	struct inode *inode;
 	struct page *node_page;
@@ -199,7 +220,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
 
 	sentry = get_seg_entry(sbi, segno);
 	if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
-		return;
+		return 0;
 
 	/* Get the previous summary */
 	for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
@@ -218,29 +239,50 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
 		f2fs_put_page(sum_page, 1);
 	}
 
+	/* Use the locked dnode page and inode */
+	nid = le32_to_cpu(sum.nid);
+	if (dn->inode->i_ino == nid) {
+		struct dnode_of_data tdn = *dn;
+		tdn.nid = nid;
+		tdn.node_page = dn->inode_page;
+		tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+		truncate_data_blocks_range(&tdn, 1);
+		return 0;
+	} else if (dn->nid == nid) {
+		struct dnode_of_data tdn = *dn;
+		tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+		truncate_data_blocks_range(&tdn, 1);
+		return 0;
+	}
+
 	/* Get the node page */
-	node_page = get_node_page(sbi, le32_to_cpu(sum.nid));
+	node_page = get_node_page(sbi, nid);
+	if (IS_ERR(node_page))
+		return PTR_ERR(node_page);
 	bidx = start_bidx_of_node(ofs_of_node(node_page)) +
-				le16_to_cpu(sum.ofs_in_node);
+					le16_to_cpu(sum.ofs_in_node);
 	ino = ino_of_node(node_page);
 	f2fs_put_page(node_page, 1);
 
 	/* Deallocate previous index in the node page */
-	inode = f2fs_iget_nowait(sbi->sb, ino);
+	inode = f2fs_iget(sbi->sb, ino);
 	if (IS_ERR(inode))
-		return;
+		return PTR_ERR(inode);
 
 	truncate_hole(inode, bidx, bidx + 1);
 	iput(inode);
+	return 0;
 }
 
-static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
+static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 					struct page *page, block_t blkaddr)
 {
 	unsigned int start, end;
 	struct dnode_of_data dn;
 	struct f2fs_summary sum;
 	struct node_info ni;
+	int err = 0, recovered = 0;
+	int ilock;
 
 	start = start_bidx_of_node(ofs_of_node(page));
 	if (IS_INODE(page))
@@ -248,9 +290,14 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 	else
 		end = start + ADDRS_PER_BLOCK;
 
+	ilock = mutex_lock_op(sbi);
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
-	if (get_dnode_of_data(&dn, start, 0))
-		return;
+
+	err = get_dnode_of_data(&dn, start, ALLOC_NODE);
+	if (err) {
+		mutex_unlock_op(sbi, ilock);
+		return err;
+	}
 
 	wait_on_page_writeback(dn.node_page);
 
@@ -272,13 +319,16 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 			}
 
 			/* Check the previous node page having this index */
-			check_index_in_prev_nodes(sbi, dest);
+			err = check_index_in_prev_nodes(sbi, dest, &dn);
+			if (err)
+				goto err;
 
 			set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
 
 			/* write dummy data page */
 			recover_data_page(sbi, NULL, &sum, src, dest);
 			update_extent_cache(dest, &dn);
+			recovered++;
 		}
 		dn.ofs_in_node++;
 	}
@@ -294,15 +344,23 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 	set_page_dirty(dn.node_page);
 
 	recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
+err:
 	f2fs_put_dnode(&dn);
+	mutex_unlock_op(sbi, ilock);
+
+	f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, "
+			"recovered_data = %d blocks, err = %d",
+			inode->i_ino, recovered, err);
+	return err;
 }
 
-static void recover_data(struct f2fs_sb_info *sbi,
+static int recover_data(struct f2fs_sb_info *sbi,
 				struct list_head *head, int type)
 {
 	unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
 	struct curseg_info *curseg;
 	struct page *page;
+	int err = 0;
 	block_t blkaddr;
 
 	/* get node pages in the current segment */
@@ -312,23 +370,29 @@ static void recover_data(struct f2fs_sb_info *sbi,
 	/* read node page */
 	page = alloc_page(GFP_NOFS | __GFP_ZERO);
 	if (IS_ERR(page))
-		return;
+		return -ENOMEM;
+
 	lock_page(page);
 
 	while (1) {
 		struct fsync_inode_entry *entry;
 
-		if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
+		err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC);
+		if (err)
 			goto out;
 
+		lock_page(page);
+
 		if (cp_ver != cpver_of_node(page))
-			goto out;
+			break;
 
 		entry = get_fsync_inode(head, ino_of_node(page));
 		if (!entry)
 			goto next;
 
-		do_recover_data(sbi, entry->inode, page, blkaddr);
+		err = do_recover_data(sbi, entry->inode, page, blkaddr);
+		if (err)
+			break;
 
 		if (entry->blkaddr == blkaddr) {
 			iput(entry->inode);
@@ -338,40 +402,45 @@ static void recover_data(struct f2fs_sb_info *sbi,
 next:
 		/* check next segment */
 		blkaddr = next_blkaddr_of_node(page);
-		ClearPageUptodate(page);
 	}
-out:
 	unlock_page(page);
+out:
 	__free_pages(page, 0);
 
-	allocate_new_segments(sbi);
+	if (!err)
+		allocate_new_segments(sbi);
+	return err;
 }
 
-void recover_fsync_data(struct f2fs_sb_info *sbi)
+int recover_fsync_data(struct f2fs_sb_info *sbi)
 {
 	struct list_head inode_list;
+	int err;
 
 	fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
 			sizeof(struct fsync_inode_entry), NULL);
 	if (unlikely(!fsync_entry_slab))
-		return;
+		return -ENOMEM;
 
 	INIT_LIST_HEAD(&inode_list);
 
 	/* step #1: find fsynced inode numbers */
-	if (find_fsync_dnodes(sbi, &inode_list))
+	sbi->por_doing = 1;
+	err = find_fsync_dnodes(sbi, &inode_list);
+	if (err)
 		goto out;
 
 	if (list_empty(&inode_list))
 		goto out;
 
 	/* step #2: recover data */
-	sbi->por_doing = 1;
-	recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
-	sbi->por_doing = 0;
+	err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
 	BUG_ON(!list_empty(&inode_list));
 out:
-	destroy_fsync_dnodes(sbi, &inode_list);
+	destroy_fsync_dnodes(&inode_list);
 	kmem_cache_destroy(fsync_entry_slab);
-	write_checkpoint(sbi, false, false);
+	sbi->por_doing = 0;
+	if (!err)
+		write_checkpoint(sbi, false);
+	return err;
 }
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 4b009906658..a86d125a988 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -18,6 +18,7 @@
 #include "f2fs.h"
 #include "segment.h"
 #include "node.h"
+#include <trace/events/f2fs.h>
 
 /*
  * This function balances dirty node and dentry pages.
@@ -29,7 +30,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi)
 	 * We should do GC or end up with checkpoint, if there are so many dirty
 	 * dir/node pages without enough free segments.
 	 */
-	if (has_not_enough_free_secs(sbi)) {
+	if (has_not_enough_free_secs(sbi, 0)) {
 		mutex_lock(&sbi->gc_mutex);
 		f2fs_gc(sbi);
 	}
@@ -49,9 +50,20 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
 
 	if (dirty_type == DIRTY) {
 		struct seg_entry *sentry = get_seg_entry(sbi, segno);
+		enum dirty_type t = DIRTY_HOT_DATA;
+
 		dirty_type = sentry->type;
+
 		if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
 			dirty_i->nr_dirty[dirty_type]++;
+
+		/* Only one bitmap should be set */
+		for (; t <= DIRTY_COLD_NODE; t++) {
+			if (t == dirty_type)
+				continue;
+			if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
+				dirty_i->nr_dirty[t]--;
+		}
 	}
 }
 
@@ -64,13 +76,16 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
 		dirty_i->nr_dirty[dirty_type]--;
 
 	if (dirty_type == DIRTY) {
-		struct seg_entry *sentry = get_seg_entry(sbi, segno);
-		dirty_type = sentry->type;
-		if (test_and_clear_bit(segno,
-					dirty_i->dirty_segmap[dirty_type]))
-			dirty_i->nr_dirty[dirty_type]--;
-		clear_bit(segno, dirty_i->victim_segmap[FG_GC]);
-		clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
+		enum dirty_type t = DIRTY_HOT_DATA;
+
+		/* clear all the bitmaps */
+		for (; t <= DIRTY_COLD_NODE; t++)
+			if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
+				dirty_i->nr_dirty[t]--;
+
+		if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
+			clear_bit(GET_SECNO(sbi, segno),
+						dirty_i->victim_secmap);
 	}
 }
 
@@ -79,7 +94,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
  * Adding dirty entry into seglist is not critical operation.
  * If a given segment is one of current working segments, it won't be added.
  */
-void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
+static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 	unsigned short valid_blocks;
@@ -111,17 +126,16 @@ void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
 static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-	unsigned int segno, offset = 0;
+	unsigned int segno = -1;
 	unsigned int total_segs = TOTAL_SEGS(sbi);
 
 	mutex_lock(&dirty_i->seglist_lock);
 	while (1) {
 		segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
-				offset);
+				segno + 1);
 		if (segno >= total_segs)
 			break;
 		__set_test_and_free(sbi, segno);
-		offset = segno + 1;
 	}
 	mutex_unlock(&dirty_i->seglist_lock);
 }
@@ -129,17 +143,16 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
 void clear_prefree_segments(struct f2fs_sb_info *sbi)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-	unsigned int segno, offset = 0;
+	unsigned int segno = -1;
 	unsigned int total_segs = TOTAL_SEGS(sbi);
 
 	mutex_lock(&dirty_i->seglist_lock);
 	while (1) {
 		segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
-				offset);
+				segno + 1);
 		if (segno >= total_segs)
 			break;
 
-		offset = segno + 1;
 		if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
 			dirty_i->nr_dirty[PRE]--;
 
@@ -242,11 +255,11 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
  * This function should be resided under the curseg_mutex lock
  */
 static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
-		struct f2fs_summary *sum, unsigned short offset)
+					struct f2fs_summary *sum)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
 	void *addr = curseg->sum_blk;
-	addr += offset * sizeof(struct f2fs_summary);
+	addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
 	memcpy(addr, sum, sizeof(struct f2fs_summary));
 	return;
 }
@@ -296,48 +309,15 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
 	f2fs_put_page(page, 1);
 }
 
-static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi,
-					int ofs_unit, int type)
+static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
 {
-	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-	unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
-	unsigned int segno, next_segno, i;
-	int ofs = 0;
-
-	/*
-	 * If there is not enough reserved sections,
-	 * we should not reuse prefree segments.
-	 */
-	if (has_not_enough_free_secs(sbi))
-		return NULL_SEGNO;
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	unsigned int segno = curseg->segno + 1;
+	struct free_segmap_info *free_i = FREE_I(sbi);
 
-	/*
-	 * NODE page should not reuse prefree segment,
-	 * since those information is used for SPOR.
-	 */
-	if (IS_NODESEG(type))
-		return NULL_SEGNO;
-next:
-	segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs++);
-	ofs = ((segno / ofs_unit) * ofs_unit) + ofs_unit;
-	if (segno < TOTAL_SEGS(sbi)) {
-		/* skip intermediate segments in a section */
-		if (segno % ofs_unit)
-			goto next;
-
-		/* skip if whole section is not prefree */
-		next_segno = find_next_zero_bit(prefree_segmap,
-						TOTAL_SEGS(sbi), segno + 1);
-		if (next_segno - segno < ofs_unit)
-			goto next;
-
-		/* skip if whole section was not free at the last checkpoint */
-		for (i = 0; i < ofs_unit; i++)
-			if (get_seg_entry(sbi, segno)->ckpt_valid_blocks)
-				goto next;
-		return segno;
-	}
-	return NULL_SEGNO;
+	if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec)
+		return !test_bit(segno, free_i->free_segmap);
+	return 0;
 }
 
 /*
@@ -348,9 +328,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
 			unsigned int *newseg, bool new_sec, int dir)
 {
 	struct free_segmap_info *free_i = FREE_I(sbi);
-	unsigned int total_secs = sbi->total_sections;
 	unsigned int segno, secno, zoneno;
-	unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone;
+	unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone;
 	unsigned int hint = *newseg / sbi->segs_per_sec;
 	unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
 	unsigned int left_start = hint;
@@ -363,16 +342,17 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
 	if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
 		segno = find_next_zero_bit(free_i->free_segmap,
 					TOTAL_SEGS(sbi), *newseg + 1);
-		if (segno < TOTAL_SEGS(sbi))
+		if (segno - *newseg < sbi->segs_per_sec -
+					(*newseg % sbi->segs_per_sec))
 			goto got_it;
 	}
 find_other_zone:
-	secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint);
-	if (secno >= total_secs) {
+	secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint);
+	if (secno >= TOTAL_SECS(sbi)) {
 		if (dir == ALLOC_RIGHT) {
 			secno = find_next_zero_bit(free_i->free_secmap,
-						total_secs, 0);
-			BUG_ON(secno >= total_secs);
+							TOTAL_SECS(sbi), 0);
+			BUG_ON(secno >= TOTAL_SECS(sbi));
 		} else {
 			go_left = 1;
 			left_start = hint - 1;
@@ -387,8 +367,8 @@ find_other_zone:
 			continue;
 		}
 		left_start = find_next_zero_bit(free_i->free_secmap,
-						total_secs, 0);
-		BUG_ON(left_start >= total_secs);
+							TOTAL_SECS(sbi), 0);
+		BUG_ON(left_start >= TOTAL_SECS(sbi));
 		break;
 	}
 	secno = left_start;
@@ -463,7 +443,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
 	int dir = ALLOC_LEFT;
 
 	write_sum_page(sbi, curseg->sum_blk,
-				GET_SUM_BLOCK(sbi, curseg->segno));
+				GET_SUM_BLOCK(sbi, segno));
 	if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
 		dir = ALLOC_RIGHT;
 
@@ -536,6 +516,23 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
 	}
 }
 
+static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
+{
+	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops;
+
+	if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0))
+		return v_ops->get_victim(sbi,
+				&(curseg)->next_segno, BG_GC, type, SSR);
+
+	/* For data segments, let's do SSR more intensively */
+	for (; type >= CURSEG_HOT_DATA; type--)
+		if (v_ops->get_victim(sbi, &(curseg)->next_segno,
+						BG_GC, type, SSR))
+			return 1;
+	return 0;
+}
+
 /*
  * flush out current segment and replace it with new segment
  * This function should be returned with success, otherwise BUG
@@ -544,26 +541,25 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
 						int type, bool force)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
-	unsigned int ofs_unit;
 
 	if (force) {
 		new_curseg(sbi, type, true);
 		goto out;
 	}
 
-	ofs_unit = need_SSR(sbi) ? 1 : sbi->segs_per_sec;
-	curseg->next_segno = check_prefree_segments(sbi, ofs_unit, type);
-
-	if (curseg->next_segno != NULL_SEGNO)
-		change_curseg(sbi, type, false);
-	else if (type == CURSEG_WARM_NODE)
+	if (type == CURSEG_WARM_NODE)
+		new_curseg(sbi, type, false);
+	else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
 		new_curseg(sbi, type, false);
 	else if (need_SSR(sbi) && get_ssr_segment(sbi, type))
 		change_curseg(sbi, type, true);
 	else
 		new_curseg(sbi, type, false);
 out:
+#ifdef CONFIG_F2FS_STAT_FS
 	sbi->segment_count[curseg->alloc_type]++;
+#endif
+	return;
 }
 
 void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -600,6 +596,7 @@ static void f2fs_end_io_write(struct bio *bio, int err)
 			if (page->mapping)
 				set_bit(AS_EIO, &page->mapping->flags);
 			set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG);
+			p->sbi->sb->s_flags |= MS_RDONLY;
 		}
 		end_page_writeback(page);
 		dec_page_count(p->sbi, F2FS_WRITEBACK);
@@ -638,10 +635,16 @@ static void do_submit_bio(struct f2fs_sb_info *sbi,
 	if (type >= META_FLUSH)
 		rw = WRITE_FLUSH_FUA;
 
+	if (btype == META)
+		rw |= REQ_META;
+
 	if (sbi->bio[btype]) {
 		struct bio_private *p = sbi->bio[btype]->bi_private;
 		p->sbi = sbi;
 		sbi->bio[btype]->bi_end_io = f2fs_end_io_write;
+
+		trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]);
+
 		if (type == META_FLUSH) {
 			DECLARE_COMPLETION_ONSTACK(wait);
 			p->is_sync = true;
@@ -678,7 +681,7 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
 		do_submit_bio(sbi, type, false);
 alloc_new:
 	if (sbi->bio[type] == NULL) {
-		sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev));
+		sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi));
 		sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
 		/*
 		 * The end_io will be assigned at the sumbission phase.
@@ -696,6 +699,7 @@ alloc_new:
 	sbi->last_block_in_bio[type] = blk_addr;
 
 	up_write(&sbi->bio_sem);
+	trace_f2fs_submit_write_page(page, blk_addr, type);
 }
 
 static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
@@ -738,7 +742,7 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
 
 		if (S_ISDIR(inode->i_mode))
 			return CURSEG_HOT_DATA;
-		else if (is_cold_data(page) || is_cold_file(inode))
+		else if (is_cold_data(page) || file_is_cold(inode))
 			return CURSEG_COLD_DATA;
 		else
 			return CURSEG_WARM_DATA;
@@ -787,11 +791,13 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
 	 * because, this function updates a summary entry in the
 	 * current summary block.
 	 */
-	__add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+	__add_sum_entry(sbi, type, sum);
 
 	mutex_lock(&sit_i->sentry_lock);
 	__refresh_next_blkoff(sbi, curseg);
+#ifdef CONFIG_F2FS_STAT_FS
 	sbi->block_count[curseg->alloc_type]++;
+#endif
 
 	/*
 	 * SIT information should be updated before segment allocation,
@@ -815,15 +821,10 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
 	mutex_unlock(&curseg->curseg_mutex);
 }
 
-int write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
-			struct writeback_control *wbc)
+void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
 {
-	if (wbc->for_reclaim)
-		return AOP_WRITEPAGE_ACTIVATE;
-
 	set_page_writeback(page);
 	submit_write_page(sbi, page, page->index, META);
-	return 0;
 }
 
 void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
@@ -891,7 +892,7 @@ void recover_data_page(struct f2fs_sb_info *sbi,
 
 	curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
 					(sbi->blocks_per_seg - 1);
-	__add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+	__add_sum_entry(sbi, type, sum);
 
 	refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
 
@@ -928,7 +929,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
 	}
 	curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
 					(sbi->blocks_per_seg - 1);
-	__add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+	__add_sum_entry(sbi, type, sum);
 
 	/* change the current log to the next block addr in advance */
 	if (next_segno != segno) {
@@ -1377,7 +1378,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
 	}
 
 	if (sbi->segs_per_sec > 1) {
-		sit_i->sec_entries = vzalloc(sbi->total_sections *
+		sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) *
 					sizeof(struct sec_entry));
 		if (!sit_i->sec_entries)
 			return -ENOMEM;
@@ -1390,10 +1391,9 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
 	bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
 	src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
 
-	dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+	dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL);
 	if (!dst_bitmap)
 		return -ENOMEM;
-	memcpy(dst_bitmap, src_bitmap, bitmap_size);
 
 	/* init SIT information */
 	sit_i->s_ops = &default_salloc_ops;
@@ -1429,7 +1429,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
 	if (!free_i->free_segmap)
 		return -ENOMEM;
 
-	sec_bitmap_size = f2fs_bitmap_size(sbi->total_sections);
+	sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi));
 	free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
 	if (!free_i->free_secmap)
 		return -ENOMEM;
@@ -1528,13 +1528,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 	struct free_segmap_info *free_i = FREE_I(sbi);
-	unsigned int segno = 0, offset = 0;
+	unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi);
 	unsigned short valid_blocks;
 
-	while (segno < TOTAL_SEGS(sbi)) {
+	while (1) {
 		/* find dirty segment based on free segmap */
-		segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset);
-		if (segno >= TOTAL_SEGS(sbi))
+		segno = find_next_inuse(free_i, total_segs, offset);
+		if (segno >= total_segs)
 			break;
 		offset = segno + 1;
 		valid_blocks = get_valid_blocks(sbi, segno, 0);
@@ -1546,14 +1546,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
 	}
 }
 
-static int init_victim_segmap(struct f2fs_sb_info *sbi)
+static int init_victim_secmap(struct f2fs_sb_info *sbi)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-	unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+	unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi));
 
-	dirty_i->victim_segmap[FG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
-	dirty_i->victim_segmap[BG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
-	if (!dirty_i->victim_segmap[FG_GC] || !dirty_i->victim_segmap[BG_GC])
+	dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
+	if (!dirty_i->victim_secmap)
 		return -ENOMEM;
 	return 0;
 }
@@ -1580,7 +1579,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
 	}
 
 	init_dirty_segmap(sbi);
-	return init_victim_segmap(sbi);
+	return init_victim_secmap(sbi);
 }
 
 /*
@@ -1667,18 +1666,10 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
 	mutex_unlock(&dirty_i->seglist_lock);
 }
 
-void reset_victim_segmap(struct f2fs_sb_info *sbi)
-{
-	unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
-	memset(DIRTY_I(sbi)->victim_segmap[FG_GC], 0, bitmap_size);
-}
-
-static void destroy_victim_segmap(struct f2fs_sb_info *sbi)
+static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
-
-	kfree(dirty_i->victim_segmap[FG_GC]);
-	kfree(dirty_i->victim_segmap[BG_GC]);
+	kfree(dirty_i->victim_secmap);
 }
 
 static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
@@ -1693,7 +1684,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
 	for (i = 0; i < NR_DIRTY_TYPE; i++)
 		discard_dirty_segmap(sbi, i);
 
-	destroy_victim_segmap(sbi);
+	destroy_victim_secmap(sbi);
 	SM_I(sbi)->dirty_info = NULL;
 	kfree(dirty_i);
 }
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 66a288a52fd..062424a0e4c 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -8,10 +8,13 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+#include <linux/blkdev.h>
+
 /* constant macro */
 #define NULL_SEGNO			((unsigned int)(~0))
+#define NULL_SECNO			((unsigned int)(~0))
 
-/* V: Logical segment # in volume, R: Relative segment # in main area */
+/* L: Logical segment # in volume, R: Relative segment # in main area */
 #define GET_L2R_SEGNO(free_i, segno)	(segno - free_i->start_segno)
 #define GET_R2L_SEGNO(free_i, segno)	(segno + free_i->start_segno)
 
@@ -23,13 +26,13 @@
 	((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) ||		\
 	(t == CURSEG_WARM_NODE))
 
-#define IS_CURSEG(sbi, segno)						\
-	((segno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) ||	\
-	 (segno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) ||	\
-	 (segno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) ||	\
-	 (segno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) ||	\
-	 (segno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) ||	\
-	 (segno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
+#define IS_CURSEG(sbi, seg)						\
+	((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) ||	\
+	 (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) ||	\
+	 (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) ||	\
+	 (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) ||	\
+	 (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) ||	\
+	 (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
 
 #define IS_CURSEC(sbi, secno)						\
 	((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno /		\
@@ -81,9 +84,12 @@
 #define f2fs_bitmap_size(nr)			\
 	(BITS_TO_LONGS(nr) * sizeof(unsigned long))
 #define TOTAL_SEGS(sbi)	(SM_I(sbi)->main_segments)
+#define TOTAL_SECS(sbi)	(sbi->total_sections)
 
 #define SECTOR_FROM_BLOCK(sbi, blk_addr)				\
 	(blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
+#define SECTOR_TO_BLOCK(sbi, sectors)					\
+	(sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
 
 /* during checkpoint, bio_private is used to synchronize the last bio */
 struct bio_private {
@@ -213,7 +219,7 @@ struct dirty_seglist_info {
 	unsigned long *dirty_segmap[NR_DIRTY_TYPE];
 	struct mutex seglist_lock;		/* lock for segment bitmaps */
 	int nr_dirty[NR_DIRTY_TYPE];		/* # of dirty segments */
-	unsigned long *victim_segmap[2];	/* BG_GC, FG_GC */
+	unsigned long *victim_secmap;		/* background GC victims */
 };
 
 /* victim selection function for cleaning and SSR */
@@ -450,35 +456,21 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi)
 	return (free_sections(sbi) < overprovision_sections(sbi));
 }
 
-static inline int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
-{
-	struct curseg_info *curseg = CURSEG_I(sbi, type);
-	return DIRTY_I(sbi)->v_ops->get_victim(sbi,
-				&(curseg)->next_segno, BG_GC, type, SSR);
-}
-
-static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi)
+static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
 {
-	unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) *
-			sbi->segs_per_sec;
-	int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1)
-			>> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
-	int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1)
-			>> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+	int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
+	int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
 
 	if (sbi->por_doing)
 		return false;
 
-	if (free_sections(sbi) <= (node_secs + 2 * dent_secs +
-						reserved_sections(sbi)))
-		return true;
-	return false;
+	return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
+						reserved_sections(sbi)));
 }
 
 static inline int utilization(struct f2fs_sb_info *sbi)
 {
-	return (long int)valid_user_blocks(sbi) * 100 /
-			(long int)sbi->user_block_count;
+	return div_u64(valid_user_blocks(sbi) * 100, sbi->user_block_count);
 }
 
 /*
@@ -629,3 +621,17 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
 		le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count)
 				- (base + 1) + type;
 }
+
+static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno)
+{
+	if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno))
+		return true;
+	return false;
+}
+
+static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
+{
+	struct block_device *bdev = sbi->sb->s_bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
+	return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q));
+}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 37fad04c866..75c7dc363e9 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -12,7 +12,6 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/statfs.h>
-#include <linux/proc_fs.h>
 #include <linux/buffer_head.h>
 #include <linux/backing-dev.h>
 #include <linux/kthread.h>
@@ -21,16 +20,21 @@
 #include <linux/seq_file.h>
 #include <linux/random.h>
 #include <linux/exportfs.h>
+#include <linux/blkdev.h>
 #include <linux/f2fs_fs.h>
 
 #include "f2fs.h"
 #include "node.h"
+#include "segment.h"
 #include "xattr.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/f2fs.h>
+
 static struct kmem_cache *f2fs_inode_cachep;
 
 enum {
-	Opt_gc_background_off,
+	Opt_gc_background,
 	Opt_disable_roll_forward,
 	Opt_discard,
 	Opt_noheap,
@@ -42,7 +46,7 @@ enum {
 };
 
 static match_table_t f2fs_tokens = {
-	{Opt_gc_background_off, "background_gc_off"},
+	{Opt_gc_background, "background_gc=%s"},
 	{Opt_disable_roll_forward, "disable_roll_forward"},
 	{Opt_discard, "discard"},
 	{Opt_noheap, "no_heap"},
@@ -72,6 +76,91 @@ static void init_once(void *foo)
 	inode_init_once(&fi->vfs_inode);
 }
 
+static int parse_options(struct super_block *sb, char *options)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	substring_t args[MAX_OPT_ARGS];
+	char *p, *name;
+	int arg = 0;
+
+	if (!options)
+		return 0;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+		/*
+		 * Initialize args struct so we know whether arg was
+		 * found; some options take optional arguments.
+		 */
+		args[0].to = args[0].from = NULL;
+		token = match_token(p, f2fs_tokens, args);
+
+		switch (token) {
+		case Opt_gc_background:
+			name = match_strdup(&args[0]);
+
+			if (!name)
+				return -ENOMEM;
+			if (!strncmp(name, "on", 2))
+				set_opt(sbi, BG_GC);
+			else if (!strncmp(name, "off", 3))
+				clear_opt(sbi, BG_GC);
+			else {
+				kfree(name);
+				return -EINVAL;
+			}
+			kfree(name);
+			break;
+		case Opt_disable_roll_forward:
+			set_opt(sbi, DISABLE_ROLL_FORWARD);
+			break;
+		case Opt_discard:
+			set_opt(sbi, DISCARD);
+			break;
+		case Opt_noheap:
+			set_opt(sbi, NOHEAP);
+			break;
+#ifdef CONFIG_F2FS_FS_XATTR
+		case Opt_nouser_xattr:
+			clear_opt(sbi, XATTR_USER);
+			break;
+#else
+		case Opt_nouser_xattr:
+			f2fs_msg(sb, KERN_INFO,
+				"nouser_xattr options not supported");
+			break;
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+		case Opt_noacl:
+			clear_opt(sbi, POSIX_ACL);
+			break;
+#else
+		case Opt_noacl:
+			f2fs_msg(sb, KERN_INFO, "noacl options not supported");
+			break;
+#endif
+		case Opt_active_logs:
+			if (args->from && match_int(args, &arg))
+				return -EINVAL;
+			if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
+				return -EINVAL;
+			sbi->active_logs = arg;
+			break;
+		case Opt_disable_ext_identify:
+			set_opt(sbi, DISABLE_EXT_IDENTIFY);
+			break;
+		default:
+			f2fs_msg(sb, KERN_ERR,
+				"Unrecognized mount option \"%s\" or missing value",
+				p);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
 static struct inode *f2fs_alloc_inode(struct super_block *sb)
 {
 	struct f2fs_inode_info *fi;
@@ -82,7 +171,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
 
 	init_once((void *) fi);
 
-	/* Initilize f2fs-specific inode info */
+	/* Initialize f2fs-specific inode info */
 	fi->vfs_inode.i_version = 1;
 	atomic_set(&fi->dirty_dents, 0);
 	fi->i_current_depth = 1;
@@ -94,6 +183,31 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
 	return &fi->vfs_inode;
 }
 
+static int f2fs_drop_inode(struct inode *inode)
+{
+	/*
+	 * This is to avoid a deadlock condition like below.
+	 * writeback_single_inode(inode)
+	 *  - f2fs_write_data_page
+	 *    - f2fs_gc -> iput -> evict
+	 *       - inode_wait_for_writeback(inode)
+	 */
+	if (!inode_unhashed(inode) && inode->i_state & I_SYNC)
+		return 0;
+	return generic_drop_inode(inode);
+}
+
+/*
+ * f2fs_dirty_inode() is called from __mark_inode_dirty()
+ *
+ * We should call set_dirty_inode to write the dirty inode through write_inode.
+ */
+static void f2fs_dirty_inode(struct inode *inode, int flags)
+{
+	set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+	return;
+}
+
 static void f2fs_i_callback(struct rcu_head *head)
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -112,7 +226,7 @@ static void f2fs_put_super(struct super_block *sb)
 	f2fs_destroy_stats(sbi);
 	stop_gc_thread(sbi);
 
-	write_checkpoint(sbi, false, true);
+	write_checkpoint(sbi, true);
 
 	iput(sbi->node_inode);
 	iput(sbi->meta_inode);
@@ -132,17 +246,38 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
 
+	trace_f2fs_sync_fs(sb, sync);
+
 	if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES))
 		return 0;
 
-	if (sync)
-		write_checkpoint(sbi, false, false);
-	else
+	if (sync) {
+		mutex_lock(&sbi->gc_mutex);
+		write_checkpoint(sbi, false);
+		mutex_unlock(&sbi->gc_mutex);
+	} else {
 		f2fs_balance_fs(sbi);
+	}
 
 	return 0;
 }
 
+static int f2fs_freeze(struct super_block *sb)
+{
+	int err;
+
+	if (f2fs_readonly(sb))
+		return 0;
+
+	err = f2fs_sync_fs(sb, 1);
+	return err;
+}
+
+static int f2fs_unfreeze(struct super_block *sb)
+{
+	return 0;
+}
+
 static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
@@ -164,7 +299,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = sbi->total_node_count;
 	buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi);
 
-	buf->f_namelen = F2FS_MAX_NAME_LEN;
+	buf->f_namelen = F2FS_NAME_LEN;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
 
@@ -175,10 +310,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
 
-	if (test_opt(sbi, BG_GC))
-		seq_puts(seq, ",background_gc_on");
+	if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC))
+		seq_printf(seq, ",background_gc=%s", "on");
 	else
-		seq_puts(seq, ",background_gc_off");
+		seq_printf(seq, ",background_gc=%s", "off");
 	if (test_opt(sbi, DISABLE_ROLL_FORWARD))
 		seq_puts(seq, ",disable_roll_forward");
 	if (test_opt(sbi, DISCARD))
@@ -198,22 +333,79 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 		seq_puts(seq, ",noacl");
 #endif
 	if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
-		seq_puts(seq, ",disable_ext_indentify");
+		seq_puts(seq, ",disable_ext_identify");
 
 	seq_printf(seq, ",active_logs=%u", sbi->active_logs);
 
 	return 0;
 }
 
+static int f2fs_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	struct f2fs_mount_info org_mount_opt;
+	int err, active_logs;
+
+	/*
+	 * Save the old mount options in case we
+	 * need to restore them.
+	 */
+	org_mount_opt = sbi->mount_opt;
+	active_logs = sbi->active_logs;
+
+	/* parse mount options */
+	err = parse_options(sb, data);
+	if (err)
+		goto restore_opts;
+
+	/*
+	 * Previous and new state of filesystem is RO,
+	 * so no point in checking GC conditions.
+	 */
+	if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
+		goto skip;
+
+	/*
+	 * We stop the GC thread if FS is mounted as RO
+	 * or if background_gc = off is passed in mount
+	 * option. Also sync the filesystem.
+	 */
+	if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
+		if (sbi->gc_thread) {
+			stop_gc_thread(sbi);
+			f2fs_sync_fs(sb, 1);
+		}
+	} else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) {
+		err = start_gc_thread(sbi);
+		if (err)
+			goto restore_opts;
+	}
+skip:
+	/* Update the POSIXACL Flag */
+	 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+	return 0;
+
+restore_opts:
+	sbi->mount_opt = org_mount_opt;
+	sbi->active_logs = active_logs;
+	return err;
+}
+
 static struct super_operations f2fs_sops = {
 	.alloc_inode	= f2fs_alloc_inode,
+	.drop_inode	= f2fs_drop_inode,
 	.destroy_inode	= f2fs_destroy_inode,
 	.write_inode	= f2fs_write_inode,
+	.dirty_inode	= f2fs_dirty_inode,
 	.show_options	= f2fs_show_options,
 	.evict_inode	= f2fs_evict_inode,
 	.put_super	= f2fs_put_super,
 	.sync_fs	= f2fs_sync_fs,
+	.freeze_fs	= f2fs_freeze,
+	.unfreeze_fs	= f2fs_unfreeze,
 	.statfs		= f2fs_statfs,
+	.remount_fs	= f2fs_remount,
 };
 
 static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
@@ -261,79 +453,6 @@ static const struct export_operations f2fs_export_ops = {
 	.get_parent = f2fs_get_parent,
 };
 
-static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi,
-				char *options)
-{
-	substring_t args[MAX_OPT_ARGS];
-	char *p;
-	int arg = 0;
-
-	if (!options)
-		return 0;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
-		if (!*p)
-			continue;
-		/*
-		 * Initialize args struct so we know whether arg was
-		 * found; some options take optional arguments.
-		 */
-		args[0].to = args[0].from = NULL;
-		token = match_token(p, f2fs_tokens, args);
-
-		switch (token) {
-		case Opt_gc_background_off:
-			clear_opt(sbi, BG_GC);
-			break;
-		case Opt_disable_roll_forward:
-			set_opt(sbi, DISABLE_ROLL_FORWARD);
-			break;
-		case Opt_discard:
-			set_opt(sbi, DISCARD);
-			break;
-		case Opt_noheap:
-			set_opt(sbi, NOHEAP);
-			break;
-#ifdef CONFIG_F2FS_FS_XATTR
-		case Opt_nouser_xattr:
-			clear_opt(sbi, XATTR_USER);
-			break;
-#else
-		case Opt_nouser_xattr:
-			f2fs_msg(sb, KERN_INFO,
-				"nouser_xattr options not supported");
-			break;
-#endif
-#ifdef CONFIG_F2FS_FS_POSIX_ACL
-		case Opt_noacl:
-			clear_opt(sbi, POSIX_ACL);
-			break;
-#else
-		case Opt_noacl:
-			f2fs_msg(sb, KERN_INFO, "noacl options not supported");
-			break;
-#endif
-		case Opt_active_logs:
-			if (args->from && match_int(args, &arg))
-				return -EINVAL;
-			if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
-				return -EINVAL;
-			sbi->active_logs = arg;
-			break;
-		case Opt_disable_ext_identify:
-			set_opt(sbi, DISABLE_EXT_IDENTIFY);
-			break;
-		default:
-			f2fs_msg(sb, KERN_ERR,
-				"Unrecognized mount option \"%s\" or missing value",
-				p);
-			return -EINVAL;
-		}
-	}
-	return 0;
-}
-
 static loff_t max_file_size(unsigned bits)
 {
 	loff_t result = ADDRS_PER_INODE;
@@ -366,14 +485,23 @@ static int sanity_check_raw_super(struct super_block *sb,
 		return 1;
 	}
 
+	/* Currently, support only 4KB page cache size */
+	if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) {
+		f2fs_msg(sb, KERN_INFO,
+			"Invalid page_cache_size (%lu), supports only 4KB\n",
+			PAGE_CACHE_SIZE);
+		return 1;
+	}
+
 	/* Currently, support only 4KB block size */
 	blocksize = 1 << le32_to_cpu(raw_super->log_blocksize);
-	if (blocksize != PAGE_CACHE_SIZE) {
+	if (blocksize != F2FS_BLKSIZE) {
 		f2fs_msg(sb, KERN_INFO,
 			"Invalid blocksize (%u), supports only 4KB\n",
 			blocksize);
 		return 1;
 	}
+
 	if (le32_to_cpu(raw_super->log_sectorsize) !=
 					F2FS_LOG_SECTOR_SIZE) {
 		f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize");
@@ -387,10 +515,11 @@ static int sanity_check_raw_super(struct super_block *sb,
 	return 0;
 }
 
-static int sanity_check_ckpt(struct f2fs_super_block *raw_super,
-				struct f2fs_checkpoint *ckpt)
+static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
 {
 	unsigned int total, fsmeta;
+	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
 
 	total = le32_to_cpu(raw_super->segment_count);
 	fsmeta = le32_to_cpu(raw_super->segment_count_ckpt);
@@ -401,6 +530,11 @@ static int sanity_check_ckpt(struct f2fs_super_block *raw_super,
 
 	if (fsmeta >= total)
 		return 1;
+
+	if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
+		f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
+		return 1;
+	}
 	return 0;
 }
 
@@ -424,11 +558,38 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 	sbi->root_ino_num = le32_to_cpu(raw_super->root_ino);
 	sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
 	sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
+	sbi->cur_victim_sec = NULL_SECNO;
 
 	for (i = 0; i < NR_COUNT_TYPE; i++)
 		atomic_set(&sbi->nr_pages[i], 0);
 }
 
+static int validate_superblock(struct super_block *sb,
+		struct f2fs_super_block **raw_super,
+		struct buffer_head **raw_super_buf, sector_t block)
+{
+	const char *super = (block == 0 ? "first" : "second");
+
+	/* read f2fs raw super block */
+	*raw_super_buf = sb_bread(sb, block);
+	if (!*raw_super_buf) {
+		f2fs_msg(sb, KERN_ERR, "unable to read %s superblock",
+				super);
+		return -EIO;
+	}
+
+	*raw_super = (struct f2fs_super_block *)
+		((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET);
+
+	/* sanity checking of raw super */
+	if (!sanity_check_raw_super(sb, *raw_super))
+		return 0;
+
+	f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem "
+				"in %s superblock", super);
+	return -EINVAL;
+}
+
 static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct f2fs_sb_info *sbi;
@@ -449,16 +610,15 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	/* read f2fs raw super block */
-	raw_super_buf = sb_bread(sb, 0);
-	if (!raw_super_buf) {
-		err = -EIO;
-		f2fs_msg(sb, KERN_ERR, "unable to read superblock");
-		goto free_sbi;
+	err = validate_superblock(sb, &raw_super, &raw_super_buf, 0);
+	if (err) {
+		brelse(raw_super_buf);
+		/* check secondary superblock when primary failed */
+		err = validate_superblock(sb, &raw_super, &raw_super_buf, 1);
+		if (err)
+			goto free_sb_buf;
 	}
-	raw_super = (struct f2fs_super_block *)
-			((char *)raw_super_buf->b_data + F2FS_SUPER_OFFSET);
-
+	sb->s_fs_info = sbi;
 	/* init some FS parameters */
 	sbi->active_logs = NR_CURSEG_TYPE;
 
@@ -471,14 +631,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 	set_opt(sbi, POSIX_ACL);
 #endif
 	/* parse mount options */
-	if (parse_options(sb, sbi, (char *)data))
-		goto free_sb_buf;
-
-	/* sanity checking of raw super */
-	if (sanity_check_raw_super(sb, raw_super)) {
-		f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem");
+	err = parse_options(sb, (char *)data);
+	if (err)
 		goto free_sb_buf;
-	}
 
 	sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
 	sb->s_max_links = F2FS_LINK_MAX;
@@ -488,7 +643,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_xattr = f2fs_xattr_handlers;
 	sb->s_export_op = &f2fs_export_ops;
 	sb->s_magic = F2FS_SUPER_MAGIC;
-	sb->s_fs_info = sbi;
 	sb->s_time_gran = 1;
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
 		(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -499,11 +653,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->raw_super = raw_super;
 	sbi->raw_super_buf = raw_super_buf;
 	mutex_init(&sbi->gc_mutex);
-	mutex_init(&sbi->write_inode);
 	mutex_init(&sbi->writepages);
 	mutex_init(&sbi->cp_mutex);
-	for (i = 0; i < NR_LOCK_TYPE; i++)
+	for (i = 0; i < NR_GLOBAL_LOCKS; i++)
 		mutex_init(&sbi->fs_lock[i]);
+	mutex_init(&sbi->node_write);
 	sbi->por_doing = 0;
 	spin_lock_init(&sbi->stat_lock);
 	init_rwsem(&sbi->bio_sem);
@@ -525,7 +679,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 
 	/* sanity checking of checkpoint */
 	err = -EINVAL;
-	if (sanity_check_ckpt(raw_super, sbi->ckpt)) {
+	if (sanity_check_ckpt(sbi)) {
 		f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint");
 		goto free_cp;
 	}
@@ -590,18 +744,36 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* recover fsynced data */
-	if (!test_opt(sbi, DISABLE_ROLL_FORWARD))
-		recover_fsync_data(sbi);
+	if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
+		err = recover_fsync_data(sbi);
+		if (err)
+			f2fs_msg(sb, KERN_ERR,
+				"Cannot recover all fsync data errno=%ld", err);
+	}
 
-	/* After POR, we can run background GC thread */
-	err = start_gc_thread(sbi);
-	if (err)
-		goto fail;
+	/*
+	 * If filesystem is not mounted as read-only then
+	 * do start the gc_thread.
+	 */
+	if (!(sb->s_flags & MS_RDONLY)) {
+		/* After POR, we can run background GC thread.*/
+		err = start_gc_thread(sbi);
+		if (err)
+			goto fail;
+	}
 
 	err = f2fs_build_stats(sbi);
 	if (err)
 		goto fail;
 
+	if (test_opt(sbi, DISCARD)) {
+		struct request_queue *q = bdev_get_queue(sb->s_bdev);
+		if (!blk_queue_discard(q))
+			f2fs_msg(sb, KERN_WARNING,
+					"mounting with \"discard\" option, but "
+					"the device does not support discard");
+	}
+
 	return 0;
 fail:
 	stop_gc_thread(sbi);
@@ -639,6 +811,7 @@ static struct file_system_type f2fs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("f2fs");
 
 static int __init init_inodecache(void)
 {
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 8038c049650..3ab07ecd86c 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -20,6 +20,7 @@
  */
 #include <linux/rwsem.h>
 #include <linux/f2fs_fs.h>
+#include <linux/security.h>
 #include "f2fs.h"
 #include "xattr.h"
 
@@ -43,6 +44,10 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
 		prefix = XATTR_TRUSTED_PREFIX;
 		prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 		break;
+	case F2FS_XATTR_INDEX_SECURITY:
+		prefix = XATTR_SECURITY_PREFIX;
+		prefix_len = XATTR_SECURITY_PREFIX_LEN;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -50,7 +55,7 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
 	total_len = prefix_len + name_len + 1;
 	if (list && total_len <= list_size) {
 		memcpy(list, prefix, prefix_len);
-		memcpy(list+prefix_len, name, name_len);
+		memcpy(list + prefix_len, name, name_len);
 		list[prefix_len + name_len] = '\0';
 	}
 	return total_len;
@@ -70,13 +75,14 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		break;
+	case F2FS_XATTR_INDEX_SECURITY:
+		break;
 	default:
 		return -EINVAL;
 	}
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return f2fs_getxattr(dentry->d_inode, type, name,
-			buffer, size);
+	return f2fs_getxattr(dentry->d_inode, type, name, buffer, size);
 }
 
 static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
@@ -93,13 +99,15 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		break;
+	case F2FS_XATTR_INDEX_SECURITY:
+		break;
 	default:
 		return -EINVAL;
 	}
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
 
-	return f2fs_setxattr(dentry->d_inode, type, name, value, size);
+	return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL);
 }
 
 static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
@@ -145,6 +153,31 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
 	return 0;
 }
 
+#ifdef CONFIG_F2FS_FS_SECURITY
+static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		void *page)
+{
+	const struct xattr *xattr;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY,
+				xattr->name, xattr->value,
+				xattr->value_len, (struct page *)page);
+		if (err < 0)
+			break;
+	}
+	return err;
+}
+
+int f2fs_init_security(struct inode *inode, struct inode *dir,
+				const struct qstr *qstr, struct page *ipage)
+{
+	return security_inode_init_security(inode, dir, qstr,
+				&f2fs_initxattrs, ipage);
+}
+#endif
+
 const struct xattr_handler f2fs_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.flags	= F2FS_XATTR_INDEX_USER,
@@ -169,6 +202,14 @@ const struct xattr_handler f2fs_xattr_advise_handler = {
 	.set    = f2fs_xattr_advise_set,
 };
 
+const struct xattr_handler f2fs_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.flags	= F2FS_XATTR_INDEX_SECURITY,
+	.list	= f2fs_xattr_generic_list,
+	.get	= f2fs_xattr_generic_get,
+	.set	= f2fs_xattr_generic_set,
+};
+
 static const struct xattr_handler *f2fs_xattr_handler_map[] = {
 	[F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
@@ -176,6 +217,9 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
 	[F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
 #endif
 	[F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
+#ifdef CONFIG_F2FS_FS_SECURITY
+	[F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler,
+#endif
 	[F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
 };
 
@@ -186,6 +230,9 @@ const struct xattr_handler *f2fs_xattr_handlers[] = {
 	&f2fs_xattr_acl_default_handler,
 #endif
 	&f2fs_xattr_trusted_handler,
+#ifdef CONFIG_F2FS_FS_SECURITY
+	&f2fs_xattr_security_handler,
+#endif
 	&f2fs_xattr_advise_handler,
 	NULL,
 };
@@ -218,6 +265,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
 		return -ENODATA;
 
 	page = get_node_page(sbi, fi->i_xattr_nid);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
 	base_addr = page_address(page);
 
 	list_for_each_xattr(entry, base_addr) {
@@ -268,6 +317,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 		return 0;
 
 	page = get_node_page(sbi, fi->i_xattr_nid);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
 	base_addr = page_address(page);
 
 	list_for_each_xattr(entry, base_addr) {
@@ -296,7 +347,7 @@ cleanup:
 }
 
 int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
-					const void *value, size_t value_len)
+			const void *value, size_t value_len, struct page *ipage)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -307,37 +358,40 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
 	int error, found, free, newsize;
 	size_t name_len;
 	char *pval;
+	int ilock;
 
 	if (name == NULL)
 		return -EINVAL;
-	name_len = strlen(name);
 
 	if (value == NULL)
 		value_len = 0;
 
-	if (name_len > 255 || value_len > MAX_VALUE_LEN)
+	name_len = strlen(name);
+
+	if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN)
 		return -ERANGE;
 
 	f2fs_balance_fs(sbi);
 
-	mutex_lock_op(sbi, NODE_NEW);
+	ilock = mutex_lock_op(sbi);
+
 	if (!fi->i_xattr_nid) {
 		/* Allocate new attribute block */
 		struct dnode_of_data dn;
 
 		if (!alloc_nid(sbi, &fi->i_xattr_nid)) {
-			mutex_unlock_op(sbi, NODE_NEW);
-			return -ENOSPC;
+			error = -ENOSPC;
+			goto exit;
 		}
 		set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
 		mark_inode_dirty(inode);
 
-		page = new_node_page(&dn, XATTR_NODE_OFFSET);
+		page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
 		if (IS_ERR(page)) {
 			alloc_nid_failed(sbi, fi->i_xattr_nid);
 			fi->i_xattr_nid = 0;
-			mutex_unlock_op(sbi, NODE_NEW);
-			return PTR_ERR(page);
+			error = PTR_ERR(page);
+			goto exit;
 		}
 
 		alloc_nid_done(sbi, fi->i_xattr_nid);
@@ -349,8 +403,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
 		/* The inode already has an extended attribute block. */
 		page = get_node_page(sbi, fi->i_xattr_nid);
 		if (IS_ERR(page)) {
-			mutex_unlock_op(sbi, NODE_NEW);
-			return PTR_ERR(page);
+			error = PTR_ERR(page);
+			goto exit;
 		}
 
 		base_addr = page_address(page);
@@ -432,12 +486,16 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
 		inode->i_ctime = CURRENT_TIME;
 		clear_inode_flag(fi, FI_ACL_MODE);
 	}
-	f2fs_write_inode(inode, NULL);
-	mutex_unlock_op(sbi, NODE_NEW);
+	if (ipage)
+		update_inode(inode, ipage);
+	else
+		update_inode_page(inode);
+	mutex_unlock_op(sbi, ilock);
 
 	return 0;
 cleanup:
 	f2fs_put_page(page, 1);
-	mutex_unlock_op(sbi, NODE_NEW);
+exit:
+	mutex_unlock_op(sbi, ilock);
 	return error;
 }
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 49c9558305e..3c0817bef25 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -112,21 +112,19 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler;
 extern const struct xattr_handler f2fs_xattr_acl_access_handler;
 extern const struct xattr_handler f2fs_xattr_acl_default_handler;
 extern const struct xattr_handler f2fs_xattr_advise_handler;
+extern const struct xattr_handler f2fs_xattr_security_handler;
 
 extern const struct xattr_handler *f2fs_xattr_handlers[];
 
-extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
-		const void *value, size_t value_len);
-extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
-		void *buffer, size_t buffer_size);
-extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
-		size_t buffer_size);
-
+extern int f2fs_setxattr(struct inode *, int, const char *,
+				const void *, size_t, struct page *);
+extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t);
+extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
 #else
 
 #define f2fs_xattr_handlers	NULL
 static inline int f2fs_setxattr(struct inode *inode, int name_index,
-	const char *name, const void *value, size_t value_len)
+		const char *name, const void *value, size_t value_len)
 {
 	return -EOPNOTSUPP;
 }
@@ -142,4 +140,14 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
 }
 #endif
 
+#ifdef CONFIG_F2FS_FS_SECURITY
+extern int f2fs_init_security(struct inode *, struct inode *,
+				const struct qstr *, struct page *);
+#else
+static inline int f2fs_init_security(struct inode *inode, struct inode *dir,
+				const struct qstr *qstr, struct page *ipage)
+{
+	return 0;
+}
+#endif
 #endif /* __F2FS_XATTR_H__ */
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 58bf744dbf3..3963ede84eb 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -543,6 +543,7 @@ end_of_dir:
 EXPORT_SYMBOL_GPL(fat_search_long);
 
 struct fat_ioctl_filldir_callback {
+	struct dir_context ctx;
 	void __user *dirent;
 	int result;
 	/* for dir ioctl */
@@ -552,8 +553,9 @@ struct fat_ioctl_filldir_callback {
 	int short_len;
 };
 
-static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
-			 filldir_t filldir, int short_only, int both)
+static int __fat_readdir(struct inode *inode, struct file *file,
+			 struct dir_context *ctx, int short_only,
+			 struct fat_ioctl_filldir_callback *both)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -564,27 +566,20 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
 	unsigned char bufname[FAT_MAX_SHORT_SIZE];
 	int isvfat = sbi->options.isvfat;
 	const char *fill_name = NULL;
-	unsigned long inum;
-	unsigned long lpos, dummy, *furrfu = &lpos;
+	int fake_offset = 0;
 	loff_t cpos;
 	int short_len = 0, fill_len = 0;
 	int ret = 0;
 
 	mutex_lock(&sbi->s_lock);
 
-	cpos = filp->f_pos;
+	cpos = ctx->pos;
 	/* Fake . and .. for the root directory. */
 	if (inode->i_ino == MSDOS_ROOT_INO) {
-		while (cpos < 2) {
-			if (filldir(dirent, "..", cpos+1, cpos,
-				    MSDOS_ROOT_INO, DT_DIR) < 0)
-				goto out;
-			cpos++;
-			filp->f_pos++;
-		}
-		if (cpos == 2) {
-			dummy = 2;
-			furrfu = &dummy;
+		if (!dir_emit_dots(file, ctx))
+			goto out;
+		if (ctx->pos == 2) {
+			fake_offset = 1;
 			cpos = 0;
 		}
 	}
@@ -619,7 +614,7 @@ parse_record:
 		int status = fat_parse_long(inode, &cpos, &bh, &de,
 					    &unicode, &nr_slots);
 		if (status < 0) {
-			filp->f_pos = cpos;
+			ctx->pos = cpos;
 			ret = status;
 			goto out;
 		} else if (status == PARSE_INVALID)
@@ -639,6 +634,19 @@ parse_record:
 			/* !both && !short_only, so we don't need shortname. */
 			if (!both)
 				goto start_filldir;
+
+			short_len = fat_parse_short(sb, de, bufname,
+						    sbi->options.dotsOK);
+			if (short_len == 0)
+				goto record_end;
+			/* hack for fat_ioctl_filldir() */
+			both->longname = fill_name;
+			both->long_len = fill_len;
+			both->shortname = bufname;
+			both->short_len = short_len;
+			fill_name = NULL;
+			fill_len = 0;
+			goto start_filldir;
 		}
 	}
 
@@ -646,28 +654,21 @@ parse_record:
 	if (short_len == 0)
 		goto record_end;
 
-	if (nr_slots) {
-		/* hack for fat_ioctl_filldir() */
-		struct fat_ioctl_filldir_callback *p = dirent;
-
-		p->longname = fill_name;
-		p->long_len = fill_len;
-		p->shortname = bufname;
-		p->short_len = short_len;
-		fill_name = NULL;
-		fill_len = 0;
-	} else {
-		fill_name = bufname;
-		fill_len = short_len;
-	}
+	fill_name = bufname;
+	fill_len = short_len;
 
 start_filldir:
-	lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
-	if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME))
-		inum = inode->i_ino;
-	else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
-		inum = parent_ino(filp->f_path.dentry);
+	if (!fake_offset)
+		ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
+
+	if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
+		if (!dir_emit_dot(file, ctx))
+			goto fill_failed;
+	} else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
+		if (!dir_emit_dotdot(file, ctx))
+			goto fill_failed;
 	} else {
+		unsigned long inum;
 		loff_t i_pos = fat_make_i_pos(sb, bh, de);
 		struct inode *tmp = fat_iget(sb, i_pos);
 		if (tmp) {
@@ -675,18 +676,17 @@ start_filldir:
 			iput(tmp);
 		} else
 			inum = iunique(sb, MSDOS_ROOT_INO);
+		if (!dir_emit(ctx, fill_name, fill_len, inum,
+			    (de->attr & ATTR_DIR) ? DT_DIR : DT_REG))
+			goto fill_failed;
 	}
 
-	if (filldir(dirent, fill_name, fill_len, *furrfu, inum,
-		    (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0)
-		goto fill_failed;
-
 record_end:
-	furrfu = &lpos;
-	filp->f_pos = cpos;
+	fake_offset = 0;
+	ctx->pos = cpos;
 	goto get_new;
 end_of_dir:
-	filp->f_pos = cpos;
+	ctx->pos = cpos;
 fill_failed:
 	brelse(bh);
 	if (unicode)
@@ -696,10 +696,9 @@ out:
 	return ret;
 }
 
-static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int fat_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	return __fat_readdir(inode, filp, dirent, filldir, 0, 0);
+	return __fat_readdir(file_inode(file), file, ctx, 0, NULL);
 }
 
 #define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type)			   \
@@ -755,20 +754,25 @@ efault:									   \
 
 FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent)
 
-static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
+static int fat_ioctl_readdir(struct inode *inode, struct file *file,
 			     void __user *dirent, filldir_t filldir,
 			     int short_only, int both)
 {
-	struct fat_ioctl_filldir_callback buf;
+	struct fat_ioctl_filldir_callback buf = {
+		.ctx.actor = filldir,
+		.dirent = dirent
+	};
 	int ret;
 
 	buf.dirent = dirent;
 	buf.result = 0;
 	mutex_lock(&inode->i_mutex);
+	buf.ctx.pos = file->f_pos;
 	ret = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
-		ret = __fat_readdir(inode, filp, &buf, filldir,
-				    short_only, both);
+		ret = __fat_readdir(inode, file, &buf.ctx,
+				    short_only, both ? &buf : NULL);
+		file->f_pos = buf.ctx.pos;
 	}
 	mutex_unlock(&inode->i_mutex);
 	if (ret >= 0)
@@ -779,7 +783,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
 static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
 			  unsigned long arg)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
 	int short_only, both;
 
@@ -819,7 +823,7 @@ FAT_IOCTL_FILLDIR_FUNC(fat_compat_ioctl_filldir, compat_dirent)
 static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
 				 unsigned long arg)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct compat_dirent __user *d1 = compat_ptr(arg);
 	int short_only, both;
 
@@ -854,7 +858,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
 const struct file_operations fat_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= fat_readdir,
+	.iterate	= fat_readdir,
 	.unlocked_ioctl	= fat_dir_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= fat_compat_dir_ioctl,
@@ -964,6 +968,29 @@ int fat_scan(struct inode *dir, const unsigned char *name,
 }
 EXPORT_SYMBOL_GPL(fat_scan);
 
+/*
+ * Scans a directory for a given logstart.
+ * Returns an error code or zero.
+ */
+int fat_scan_logstart(struct inode *dir, int i_logstart,
+		      struct fat_slot_info *sinfo)
+{
+	struct super_block *sb = dir->i_sb;
+
+	sinfo->slot_off = 0;
+	sinfo->bh = NULL;
+	while (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh,
+				   &sinfo->de) >= 0) {
+		if (fat_get_start(MSDOS_SB(sb), sinfo->de) == i_logstart) {
+			sinfo->slot_off -= sizeof(*sinfo->de);
+			sinfo->nr_slots = 1;
+			sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
 static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
 {
 	struct super_block *sb = dir->i_sb;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 12701a56775..4241e6f39e8 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -23,6 +23,9 @@
 #define FAT_ERRORS_PANIC	2      /* panic on error */
 #define FAT_ERRORS_RO		3      /* remount r/o on error */
 
+#define FAT_NFS_STALE_RW	1      /* NFS RW support, can cause ESTALE */
+#define FAT_NFS_NOSTALE_RO	2      /* NFS RO support, no ESTALE issue */
+
 struct fat_mount_options {
 	kuid_t fs_uid;
 	kgid_t fs_gid;
@@ -34,6 +37,7 @@ struct fat_mount_options {
 	unsigned short shortname;  /* flags for shortname display/create rule */
 	unsigned char name_check;  /* r = relaxed, n = normal, s = strict */
 	unsigned char errors;	   /* On error: continue, panic, remount-ro */
+	unsigned char nfs;	  /* NFS support: nostale_ro, stale_rw */
 	unsigned short allow_utime;/* permission for setting the [am]time */
 	unsigned quiet:1,          /* set = fake successful chmods and chowns */
 		 showexec:1,       /* set = only set x bit for com/exe/bat */
@@ -48,8 +52,7 @@ struct fat_mount_options {
 		 usefree:1,	   /* Use free_clusters for FAT32 */
 		 tz_set:1,	   /* Filesystem timestamps' offset set */
 		 rodir:1,	   /* allow ATTR_RO for directory */
-		 discard:1,	   /* Issue discard requests on deletions */
-		 nfs:1;		   /* Do extra work needed for NFS export */
+		 discard:1;	   /* Issue discard requests on deletions */
 };
 
 #define FAT_HASH_BITS	8
@@ -72,6 +75,7 @@ struct msdos_sb_info {
 	unsigned long root_cluster;   /* first cluster of the root directory */
 	unsigned long fsinfo_sector;  /* sector number of FAT32 fsinfo */
 	struct mutex fat_lock;
+	struct mutex nfs_build_inode_lock;
 	struct mutex s_lock;
 	unsigned int prev_free;      /* previously allocated cluster number */
 	unsigned int free_clusters;  /* -1 if undefined */
@@ -82,6 +86,7 @@ struct msdos_sb_info {
 	const void *dir_ops;	      /* Opaque; default directory operations */
 	int dir_per_block;	      /* dir entries per block */
 	int dir_per_block_bits;	      /* log2(dir_per_block) */
+	unsigned int vol_id;		/*volume ID*/
 
 	int fatent_shift;
 	struct fatent_operations *fatent_ops;
@@ -95,6 +100,8 @@ struct msdos_sb_info {
 
 	spinlock_t dir_hash_lock;
 	struct hlist_head dir_hashtable[FAT_HASH_SIZE];
+
+	unsigned int dirty;           /* fs state before mount */
 };
 
 #define FAT_CACHE_VALID	0	/* special case for valid cache */
@@ -213,6 +220,27 @@ static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus)
 		+ sbi->data_start;
 }
 
+static inline void fat_get_blknr_offset(struct msdos_sb_info *sbi,
+				loff_t i_pos, sector_t *blknr, int *offset)
+{
+	*blknr = i_pos >> sbi->dir_per_block_bits;
+	*offset = i_pos & (sbi->dir_per_block - 1);
+}
+
+static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
+					struct inode *inode)
+{
+	loff_t i_pos;
+#if BITS_PER_LONG == 32
+	spin_lock(&sbi->inode_hash_lock);
+#endif
+	i_pos = MSDOS_I(inode)->i_pos;
+#if BITS_PER_LONG == 32
+	spin_unlock(&sbi->inode_hash_lock);
+#endif
+	return i_pos;
+}
+
 static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len)
 {
 #ifdef __BIG_ENDIAN
@@ -269,6 +297,8 @@ extern int fat_dir_empty(struct inode *dir);
 extern int fat_subdirs(struct inode *dir);
 extern int fat_scan(struct inode *dir, const unsigned char *name,
 		    struct fat_slot_info *sinfo);
+extern int fat_scan_logstart(struct inode *dir, int i_logstart,
+			     struct fat_slot_info *sinfo);
 extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
 				struct msdos_dir_entry **de);
 extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
@@ -346,6 +376,7 @@ extern struct inode *fat_build_inode(struct super_block *sb,
 extern int fat_sync_inode(struct inode *inode);
 extern int fat_fill_super(struct super_block *sb, void *data, int silent,
 			  int isvfat, void (*setup)(struct super_block *));
+extern int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de);
 
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
 			    struct inode *i2);
@@ -380,12 +411,8 @@ int fat_cache_init(void);
 void fat_cache_destroy(void);
 
 /* fat/nfs.c */
-struct fid;
-extern struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid,
-				       int fh_len, int fh_type);
-extern struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid,
-				       int fh_len, int fh_type);
-extern struct dentry *fat_get_parent(struct dentry *child_dir);
+extern const struct export_operations fat_export_ops;
+extern const struct export_operations fat_export_ops_nostale;
 
 /* helper for printk */
 typedef unsigned long long	llu;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a62e0ecbe2d..9b104f54305 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -32,7 +32,7 @@ static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
 
 static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
 	int is_dir = S_ISDIR(inode->i_mode);
 	u32 attr, oldattr;
@@ -114,9 +114,15 @@ out:
 	return err;
 }
 
+static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	return put_user(sbi->vol_id, user_attr);
+}
+
 long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	u32 __user *user_attr = (u32 __user *)arg;
 
 	switch (cmd) {
@@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return fat_ioctl_get_attributes(inode, user_attr);
 	case FAT_IOCTL_SET_ATTRIBUTES:
 		return fat_ioctl_set_attributes(filp, user_attr);
+	case FAT_IOCTL_GET_VOLUME_ID:
+		return fat_ioctl_get_volume_id(inode, user_attr);
 	default:
 		return -ENOTTY;	/* Inappropriate ioctl for device */
 	}
@@ -306,6 +314,11 @@ int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	struct inode *inode = dentry->d_inode;
 	generic_fillattr(inode, stat);
 	stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
+
+	if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) {
+		/* Use i_pos for ino. This is used as fileid of nfs. */
+		stat->ino = fat_i_pos_read(MSDOS_SB(inode->i_sb), inode);
+	}
 	return 0;
 }
 EXPORT_SYMBOL_GPL(fat_getattr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index f8f491677a4..11b51bb55b4 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -18,8 +18,8 @@
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/buffer_head.h>
-#include <linux/exportfs.h>
 #include <linux/mount.h>
+#include <linux/aio.h>
 #include <linux/vfs.h>
 #include <linux/parser.h>
 #include <linux/uio.h>
@@ -341,12 +341,11 @@ struct inode *fat_iget(struct super_block *sb, loff_t i_pos)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
-	struct hlist_node *_p;
 	struct msdos_inode_info *i;
 	struct inode *inode = NULL;
 
 	spin_lock(&sbi->inode_hash_lock);
-	hlist_for_each_entry(i, _p, head, i_fat_hash) {
+	hlist_for_each_entry(i, head, i_fat_hash) {
 		BUG_ON(i->vfs_inode.i_sb != sb);
 		if (i->i_pos != i_pos)
 			continue;
@@ -386,7 +385,7 @@ static int fat_calc_dir_size(struct inode *inode)
 }
 
 /* doesn't deal with root inode */
-static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
+int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
 	int error;
@@ -445,12 +444,25 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 	return 0;
 }
 
+static inline void fat_lock_build_inode(struct msdos_sb_info *sbi)
+{
+	if (sbi->options.nfs == FAT_NFS_NOSTALE_RO)
+		mutex_lock(&sbi->nfs_build_inode_lock);
+}
+
+static inline void fat_unlock_build_inode(struct msdos_sb_info *sbi)
+{
+	if (sbi->options.nfs == FAT_NFS_NOSTALE_RO)
+		mutex_unlock(&sbi->nfs_build_inode_lock);
+}
+
 struct inode *fat_build_inode(struct super_block *sb,
 			struct msdos_dir_entry *de, loff_t i_pos)
 {
 	struct inode *inode;
 	int err;
 
+	fat_lock_build_inode(MSDOS_SB(sb));
 	inode = fat_iget(sb, i_pos);
 	if (inode)
 		goto out;
@@ -470,6 +482,7 @@ struct inode *fat_build_inode(struct super_block *sb,
 	fat_attach(inode, i_pos);
 	insert_inode_hash(inode);
 out:
+	fat_unlock_build_inode(MSDOS_SB(sb));
 	return inode;
 }
 
@@ -488,10 +501,59 @@ static void fat_evict_inode(struct inode *inode)
 	fat_detach(inode);
 }
 
+static void fat_set_state(struct super_block *sb,
+			unsigned int set, unsigned int force)
+{
+	struct buffer_head *bh;
+	struct fat_boot_sector *b;
+	struct msdos_sb_info *sbi = sb->s_fs_info;
+
+	/* do not change any thing if mounted read only */
+	if ((sb->s_flags & MS_RDONLY) && !force)
+		return;
+
+	/* do not change state if fs was dirty */
+	if (sbi->dirty) {
+		/* warn only on set (mount). */
+		if (set)
+			fat_msg(sb, KERN_WARNING, "Volume was not properly "
+				"unmounted. Some data may be corrupt. "
+				"Please run fsck.");
+		return;
+	}
+
+	bh = sb_bread(sb, 0);
+	if (bh == NULL) {
+		fat_msg(sb, KERN_ERR, "unable to read boot sector "
+			"to mark fs as dirty");
+		return;
+	}
+
+	b = (struct fat_boot_sector *) bh->b_data;
+
+	if (sbi->fat_bits == 32) {
+		if (set)
+			b->fat32.state |= FAT_STATE_DIRTY;
+		else
+			b->fat32.state &= ~FAT_STATE_DIRTY;
+	} else /* fat 16 and 12 */ {
+		if (set)
+			b->fat16.state |= FAT_STATE_DIRTY;
+		else
+			b->fat16.state &= ~FAT_STATE_DIRTY;
+	}
+
+	mark_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+	brelse(bh);
+}
+
 static void fat_put_super(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 
+	fat_set_state(sb, 0, 0);
+
 	iput(sbi->fsinfo_inode);
 	iput(sbi->fat_inode);
 
@@ -566,8 +628,18 @@ static void __exit fat_destroy_inodecache(void)
 
 static int fat_remount(struct super_block *sb, int *flags, char *data)
 {
+	int new_rdonly;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	*flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME);
+
+	/* make sure we update state on remount. */
+	new_rdonly = *flags & MS_RDONLY;
+	if (new_rdonly != (sb->s_flags & MS_RDONLY)) {
+		if (new_rdonly)
+			fat_set_state(sb, 0, 0);
+		else
+			fat_set_state(sb, 1, 1);
+	}
 	return 0;
 }
 
@@ -597,20 +669,6 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
-				    struct inode *inode)
-{
-	loff_t i_pos;
-#if BITS_PER_LONG == 32
-	spin_lock(&sbi->inode_hash_lock);
-#endif
-	i_pos = MSDOS_I(inode)->i_pos;
-#if BITS_PER_LONG == 32
-	spin_unlock(&sbi->inode_hash_lock);
-#endif
-	return i_pos;
-}
-
 static int __fat_write_inode(struct inode *inode, int wait)
 {
 	struct super_block *sb = inode->i_sb;
@@ -618,7 +676,8 @@ static int __fat_write_inode(struct inode *inode, int wait)
 	struct buffer_head *bh;
 	struct msdos_dir_entry *raw_entry;
 	loff_t i_pos;
-	int err;
+	sector_t blocknr;
+	int err, offset;
 
 	if (inode->i_ino == MSDOS_ROOT_INO)
 		return 0;
@@ -628,7 +687,8 @@ retry:
 	if (!i_pos)
 		return 0;
 
-	bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
+	fat_get_blknr_offset(sbi, i_pos, &blocknr, &offset);
+	bh = sb_bread(sb, blocknr);
 	if (!bh) {
 		fat_msg(sb, KERN_ERR, "unable to read inode block "
 		       "for updating (i_pos %lld)", i_pos);
@@ -641,8 +701,7 @@ retry:
 		goto retry;
 	}
 
-	raw_entry = &((struct msdos_dir_entry *) (bh->b_data))
-	    [i_pos & (sbi->dir_per_block - 1)];
+	raw_entry = &((struct msdos_dir_entry *) (bh->b_data))[offset];
 	if (S_ISDIR(inode->i_mode))
 		raw_entry->size = 0;
 	else
@@ -703,12 +762,6 @@ static const struct super_operations fat_sops = {
 	.show_options	= fat_show_options,
 };
 
-static const struct export_operations fat_export_ops = {
-	.fh_to_dentry	= fat_fh_to_dentry,
-	.fh_to_parent	= fat_fh_to_parent,
-	.get_parent	= fat_get_parent,
-};
-
 static int fat_show_options(struct seq_file *m, struct dentry *root)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(root->d_sb);
@@ -756,8 +809,6 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
 		seq_puts(m, ",usefree");
 	if (opts->quiet)
 		seq_puts(m, ",quiet");
-	if (opts->nfs)
-		seq_puts(m, ",nfs");
 	if (opts->showexec)
 		seq_puts(m, ",showexec");
 	if (opts->sys_immutable)
@@ -791,6 +842,10 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
 		seq_puts(m, ",errors=panic");
 	else
 		seq_puts(m, ",errors=remount-ro");
+	if (opts->nfs == FAT_NFS_NOSTALE_RO)
+		seq_puts(m, ",nfs=nostale_ro");
+	else if (opts->nfs)
+		seq_puts(m, ",nfs=stale_rw");
 	if (opts->discard)
 		seq_puts(m, ",discard");
 
@@ -807,7 +862,7 @@ enum {
 	Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
 	Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
 	Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
-	Opt_err,
+	Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err,
 };
 
 static const match_table_t fat_tokens = {
@@ -837,7 +892,9 @@ static const match_table_t fat_tokens = {
 	{Opt_err_panic, "errors=panic"},
 	{Opt_err_ro, "errors=remount-ro"},
 	{Opt_discard, "discard"},
-	{Opt_nfs, "nfs"},
+	{Opt_nfs_stale_rw, "nfs"},
+	{Opt_nfs_stale_rw, "nfs=stale_rw"},
+	{Opt_nfs_nostale_ro, "nfs=nostale_ro"},
 	{Opt_obsolete, "conv=binary"},
 	{Opt_obsolete, "conv=text"},
 	{Opt_obsolete, "conv=auto"},
@@ -1034,6 +1091,12 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
 		case Opt_err_ro:
 			opts->errors = FAT_ERRORS_RO;
 			break;
+		case Opt_nfs_stale_rw:
+			opts->nfs = FAT_NFS_STALE_RW;
+			break;
+		case Opt_nfs_nostale_ro:
+			opts->nfs = FAT_NFS_NOSTALE_RO;
+			break;
 
 		/* msdos specific */
 		case Opt_dots:
@@ -1092,9 +1155,6 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
 		case Opt_discard:
 			opts->discard = 1;
 			break;
-		case Opt_nfs:
-			opts->nfs = 1;
-			break;
 
 		/* obsolete mount options */
 		case Opt_obsolete:
@@ -1125,6 +1185,10 @@ out:
 		opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH);
 	if (opts->unicode_xlate)
 		opts->utf8 = 0;
+	if (opts->nfs == FAT_NFS_NOSTALE_RO) {
+		sb->s_flags |= MS_RDONLY;
+		sb->s_export_op = &fat_export_ops_nostale;
+	}
 
 	return 0;
 }
@@ -1135,7 +1199,7 @@ static int fat_read_root(struct inode *inode)
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	int error;
 
-	MSDOS_I(inode)->i_pos = 0;
+	MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO;
 	inode->i_uid = sbi->options.fs_uid;
 	inode->i_gid = sbi->options.fs_gid;
 	inode->i_version++;
@@ -1165,6 +1229,19 @@ static int fat_read_root(struct inode *inode)
 	return 0;
 }
 
+static unsigned long calc_fat_clusters(struct super_block *sb)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+
+	/* Divide first to avoid overflow */
+	if (sbi->fat_bits != 12) {
+		unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits;
+		return ent_per_sec * sbi->fat_length;
+	}
+
+	return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
+}
+
 /*
  * Read the super block of an MS-DOS FS.
  */
@@ -1198,6 +1275,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 	sb->s_magic = MSDOS_SUPER_MAGIC;
 	sb->s_op = &fat_sops;
 	sb->s_export_op = &fat_export_ops;
+	mutex_init(&sbi->nfs_build_inode_lock);
 	ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
 			     DEFAULT_RATELIMIT_BURST);
 
@@ -1298,17 +1376,17 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 	sbi->prev_free = FAT_START_ENT;
 	sb->s_maxbytes = 0xffffffff;
 
-	if (!sbi->fat_length && b->fat32_length) {
+	if (!sbi->fat_length && b->fat32.length) {
 		struct fat_boot_fsinfo *fsinfo;
 		struct buffer_head *fsinfo_bh;
 
 		/* Must be FAT32 */
 		sbi->fat_bits = 32;
-		sbi->fat_length = le32_to_cpu(b->fat32_length);
-		sbi->root_cluster = le32_to_cpu(b->root_cluster);
+		sbi->fat_length = le32_to_cpu(b->fat32.length);
+		sbi->root_cluster = le32_to_cpu(b->fat32.root_cluster);
 
 		/* MC - if info_sector is 0, don't multiply by 0 */
-		sbi->fsinfo_sector = le16_to_cpu(b->info_sector);
+		sbi->fsinfo_sector = le16_to_cpu(b->fat32.info_sector);
 		if (sbi->fsinfo_sector == 0)
 			sbi->fsinfo_sector = 1;
 
@@ -1337,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 		brelse(fsinfo_bh);
 	}
 
+	/* interpret volume ID as a little endian 32 bit integer */
+	if (sbi->fat_bits == 32)
+		sbi->vol_id = (((u32)b->fat32.vol_id[0]) |
+					((u32)b->fat32.vol_id[1] << 8) |
+					((u32)b->fat32.vol_id[2] << 16) |
+					((u32)b->fat32.vol_id[3] << 24));
+	else /* fat 16 or 12 */
+		sbi->vol_id = (((u32)b->fat16.vol_id[0]) |
+					((u32)b->fat16.vol_id[1] << 8) |
+					((u32)b->fat16.vol_id[2] << 16) |
+					((u32)b->fat16.vol_id[3] << 24));
+
 	sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
 	sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
 
@@ -1362,8 +1452,14 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 	if (sbi->fat_bits != 32)
 		sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12;
 
+	/* some OSes set FAT_STATE_DIRTY and clean it on unmount. */
+	if (sbi->fat_bits == 32)
+		sbi->dirty = b->fat32.state & FAT_STATE_DIRTY;
+	else /* fat 16 or 12 */
+		sbi->dirty = b->fat16.state & FAT_STATE_DIRTY;
+
 	/* check that FAT table does not overflow */
-	fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
+	fat_clusters = calc_fat_clusters(sb);
 	total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
 	if (total_clusters > MAX_FAT(sb)) {
 		if (!silent)
@@ -1456,6 +1552,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 					"the device does not support discard");
 	}
 
+	fat_set_state(sb, 1, 0);
 	return 0;
 
 out_invalid:
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 359d307b550..628e22a5a54 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -30,7 +30,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
 		va_start(args, fmt);
 		vaf.fmt = fmt;
 		vaf.va = &args;
-		printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf);
+		fat_msg(sb, KERN_ERR, "error, %pV", &vaf);
 		va_end(args);
 	}
 
@@ -38,8 +38,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
 		panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
 	else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
 		sb->s_flags |= MS_RDONLY;
-		printk(KERN_ERR "FAT-fs (%s): Filesystem has been "
-				"set read-only\n", sb->s_id);
+		fat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
 	}
 }
 EXPORT_SYMBOL_GPL(__fat_fs_error);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index e2cfda94a28..a783b0e1272 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -148,8 +148,7 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len,
  * that the existing dentry can be used. The msdos fs routines will
  * return ENOENT or EINVAL as appropriate.
  */
-static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
-	       struct qstr *qstr)
+static int msdos_hash(const struct dentry *dentry, struct qstr *qstr)
 {
 	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
 	unsigned char msdos_name[MSDOS_NAME];
@@ -165,8 +164,7 @@ static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
  * Compare two msdos names. If either of the names are invalid,
  * we fall back to doing the standard name comparison.
  */
-static int msdos_cmp(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int msdos_cmp(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
@@ -668,6 +666,7 @@ static struct file_system_type msdos_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("msdos");
 
 static int __init init_msdos_fs(void)
 {
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index ac959d655e7..6df8d3d885e 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -107,8 +107,7 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr)
  * that the existing dentry can be used. The vfat fs routines will
  * return ENOENT or EINVAL as appropriate.
  */
-static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+static int vfat_hash(const struct dentry *dentry, struct qstr *qstr)
 {
 	qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
 	return 0;
@@ -120,8 +119,7 @@ static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
  * that the existing dentry can be used. The vfat fs routines will
  * return ENOENT or EINVAL as appropriate.
  */
-static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr)
 {
 	struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
 	const unsigned char *name;
@@ -142,8 +140,7 @@ static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
 /*
  * Case insensitive compare of two vfat names.
  */
-static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
@@ -162,8 +159,7 @@ static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
 /*
  * Case sensitive compare of two vfat names.
  */
-static int vfat_cmp(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int vfat_cmp(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	unsigned int alen, blen;
@@ -1073,6 +1069,7 @@ static struct file_system_type vfat_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("vfat");
 
 static int __init init_vfat_fs(void)
 {
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index ef4b5faba87..93e14933dcb 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -14,6 +14,18 @@
 #include <linux/exportfs.h>
 #include "fat.h"
 
+struct fat_fid {
+	u32 i_gen;
+	u32 i_pos_low;
+	u16 i_pos_hi;
+	u16 parent_i_pos_hi;
+	u32 parent_i_pos_low;
+	u32 parent_i_gen;
+};
+
+#define FAT_FID_SIZE_WITHOUT_PARENT 3
+#define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32))
+
 /**
  * Look up a directory inode given its starting cluster.
  */
@@ -21,13 +33,12 @@ static struct inode *fat_dget(struct super_block *sb, int i_logstart)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	struct hlist_head *head;
-	struct hlist_node *_p;
 	struct msdos_inode_info *i;
 	struct inode *inode = NULL;
 
 	head = sbi->dir_hashtable + fat_dir_hash(i_logstart);
 	spin_lock(&sbi->dir_hash_lock);
-	hlist_for_each_entry(i, _p, head, i_dir_hash) {
+	hlist_for_each_entry(i, head, i_dir_hash) {
 		BUG_ON(i->vfs_inode.i_sb != sb);
 		if (i->i_logstart != i_logstart)
 			continue;
@@ -39,63 +50,252 @@ static struct inode *fat_dget(struct super_block *sb, int i_logstart)
 	return inode;
 }
 
-static struct inode *fat_nfs_get_inode(struct super_block *sb,
-				       u64 ino, u32 generation)
+static struct inode *fat_ilookup(struct super_block *sb, u64 ino, loff_t i_pos)
 {
-	struct inode *inode;
+	if (MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO)
+		return fat_iget(sb, i_pos);
 
-	if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO))
-		return NULL;
+	else {
+		if ((ino < MSDOS_ROOT_INO) || (ino == MSDOS_FSINFO_INO))
+			return NULL;
+		return ilookup(sb, ino);
+	}
+}
+
+static struct inode *__fat_nfs_get_inode(struct super_block *sb,
+				       u64 ino, u32 generation, loff_t i_pos)
+{
+	struct inode *inode = fat_ilookup(sb, ino, i_pos);
 
-	inode = ilookup(sb, ino);
 	if (inode && generation && (inode->i_generation != generation)) {
 		iput(inode);
 		inode = NULL;
 	}
+	if (inode == NULL && MSDOS_SB(sb)->options.nfs == FAT_NFS_NOSTALE_RO) {
+		struct buffer_head *bh = NULL;
+		struct msdos_dir_entry *de ;
+		sector_t blocknr;
+		int offset;
+		fat_get_blknr_offset(MSDOS_SB(sb), i_pos, &blocknr, &offset);
+		bh = sb_bread(sb, blocknr);
+		if (!bh) {
+			fat_msg(sb, KERN_ERR,
+				"unable to read block(%llu) for building NFS inode",
+				(llu)blocknr);
+			return inode;
+		}
+		de = (struct msdos_dir_entry *)bh->b_data;
+		/* If a file is deleted on server and client is not updated
+		 * yet, we must not build the inode upon a lookup call.
+		 */
+		if (IS_FREE(de[offset].name))
+			inode = NULL;
+		else
+			inode = fat_build_inode(sb, &de[offset], i_pos);
+		brelse(bh);
+	}
 
 	return inode;
 }
 
+static struct inode *fat_nfs_get_inode(struct super_block *sb,
+				       u64 ino, u32 generation)
+{
+
+	return __fat_nfs_get_inode(sb, ino, generation, 0);
+}
+
+static int
+fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp,
+		      struct inode *parent)
+{
+	int len = *lenp;
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	struct fat_fid *fid = (struct fat_fid *) fh;
+	loff_t i_pos;
+	int type = FILEID_FAT_WITHOUT_PARENT;
+
+	if (parent) {
+		if (len < FAT_FID_SIZE_WITH_PARENT) {
+			*lenp = FAT_FID_SIZE_WITH_PARENT;
+			return FILEID_INVALID;
+		}
+	} else {
+		if (len < FAT_FID_SIZE_WITHOUT_PARENT) {
+			*lenp = FAT_FID_SIZE_WITHOUT_PARENT;
+			return FILEID_INVALID;
+		}
+	}
+
+	i_pos = fat_i_pos_read(sbi, inode);
+	*lenp = FAT_FID_SIZE_WITHOUT_PARENT;
+	fid->i_gen = inode->i_generation;
+	fid->i_pos_low = i_pos & 0xFFFFFFFF;
+	fid->i_pos_hi = (i_pos >> 32) & 0xFFFF;
+	if (parent) {
+		i_pos = fat_i_pos_read(sbi, parent);
+		fid->parent_i_pos_hi = (i_pos >> 32) & 0xFFFF;
+		fid->parent_i_pos_low = i_pos & 0xFFFFFFFF;
+		fid->parent_i_gen = parent->i_generation;
+		type = FILEID_FAT_WITH_PARENT;
+		*lenp = FAT_FID_SIZE_WITH_PARENT;
+	}
+
+	return type;
+}
+
 /**
  * Map a NFS file handle to a corresponding dentry.
  * The dentry may or may not be connected to the filesystem root.
  */
-struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid,
+static struct dentry *fat_fh_to_dentry(struct super_block *sb, struct fid *fid,
 				int fh_len, int fh_type)
 {
 	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
 				    fat_nfs_get_inode);
 }
 
+static struct dentry *fat_fh_to_dentry_nostale(struct super_block *sb,
+					       struct fid *fh, int fh_len,
+					       int fh_type)
+{
+	struct inode *inode = NULL;
+	struct fat_fid *fid = (struct fat_fid *)fh;
+	loff_t i_pos;
+
+	switch (fh_type) {
+	case FILEID_FAT_WITHOUT_PARENT:
+		if (fh_len < FAT_FID_SIZE_WITHOUT_PARENT)
+			return NULL;
+		break;
+	case FILEID_FAT_WITH_PARENT:
+		if (fh_len < FAT_FID_SIZE_WITH_PARENT)
+			return NULL;
+		break;
+	default:
+		return NULL;
+	}
+	i_pos = fid->i_pos_hi;
+	i_pos = (i_pos << 32) | (fid->i_pos_low);
+	inode = __fat_nfs_get_inode(sb, 0, fid->i_gen, i_pos);
+
+	return d_obtain_alias(inode);
+}
+
 /*
  * Find the parent for a file specified by NFS handle.
  * This requires that the handle contain the i_ino of the parent.
  */
-struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid,
+static struct dentry *fat_fh_to_parent(struct super_block *sb, struct fid *fid,
 				int fh_len, int fh_type)
 {
 	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
 				    fat_nfs_get_inode);
 }
 
+static struct dentry *fat_fh_to_parent_nostale(struct super_block *sb,
+					       struct fid *fh, int fh_len,
+					       int fh_type)
+{
+	struct inode *inode = NULL;
+	struct fat_fid *fid = (struct fat_fid *)fh;
+	loff_t i_pos;
+
+	if (fh_len < FAT_FID_SIZE_WITH_PARENT)
+		return NULL;
+
+	switch (fh_type) {
+	case FILEID_FAT_WITH_PARENT:
+		i_pos = fid->parent_i_pos_hi;
+		i_pos = (i_pos << 32) | (fid->parent_i_pos_low);
+		inode = __fat_nfs_get_inode(sb, 0, fid->parent_i_gen, i_pos);
+		break;
+	}
+
+	return d_obtain_alias(inode);
+}
+
+/*
+ * Rebuild the parent for a directory that is not connected
+ *  to the filesystem root
+ */
+static
+struct inode *fat_rebuild_parent(struct super_block *sb, int parent_logstart)
+{
+	int search_clus, clus_to_match;
+	struct msdos_dir_entry *de;
+	struct inode *parent = NULL;
+	struct inode *dummy_grand_parent = NULL;
+	struct fat_slot_info sinfo;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	sector_t blknr = fat_clus_to_blknr(sbi, parent_logstart);
+	struct buffer_head *parent_bh = sb_bread(sb, blknr);
+	if (!parent_bh) {
+		fat_msg(sb, KERN_ERR,
+			"unable to read cluster of parent directory");
+		return NULL;
+	}
+
+	de = (struct msdos_dir_entry *) parent_bh->b_data;
+	clus_to_match = fat_get_start(sbi, &de[0]);
+	search_clus = fat_get_start(sbi, &de[1]);
+
+	dummy_grand_parent = fat_dget(sb, search_clus);
+	if (!dummy_grand_parent) {
+		dummy_grand_parent = new_inode(sb);
+		if (!dummy_grand_parent) {
+			brelse(parent_bh);
+			return parent;
+		}
+
+		dummy_grand_parent->i_ino = iunique(sb, MSDOS_ROOT_INO);
+		fat_fill_inode(dummy_grand_parent, &de[1]);
+		MSDOS_I(dummy_grand_parent)->i_pos = -1;
+	}
+
+	if (!fat_scan_logstart(dummy_grand_parent, clus_to_match, &sinfo))
+		parent = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+
+	brelse(parent_bh);
+	iput(dummy_grand_parent);
+
+	return parent;
+}
+
 /*
  * Find the parent for a directory that is not currently connected to
  * the filesystem root.
  *
  * On entry, the caller holds child_dir->d_inode->i_mutex.
  */
-struct dentry *fat_get_parent(struct dentry *child_dir)
+static struct dentry *fat_get_parent(struct dentry *child_dir)
 {
 	struct super_block *sb = child_dir->d_sb;
 	struct buffer_head *bh = NULL;
 	struct msdos_dir_entry *de;
 	struct inode *parent_inode = NULL;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 
 	if (!fat_get_dotdot_entry(child_dir->d_inode, &bh, &de)) {
-		int parent_logstart = fat_get_start(MSDOS_SB(sb), de);
+		int parent_logstart = fat_get_start(sbi, de);
 		parent_inode = fat_dget(sb, parent_logstart);
+		if (!parent_inode && sbi->options.nfs == FAT_NFS_NOSTALE_RO)
+			parent_inode = fat_rebuild_parent(sb, parent_logstart);
 	}
 	brelse(bh);
 
 	return d_obtain_alias(parent_inode);
 }
+
+const struct export_operations fat_export_ops = {
+	.fh_to_dentry   = fat_fh_to_dentry,
+	.fh_to_parent   = fat_fh_to_parent,
+	.get_parent     = fat_get_parent,
+};
+
+const struct export_operations fat_export_ops_nostale = {
+	.encode_fh      = fat_encode_fh_nostale,
+	.fh_to_dentry   = fat_fh_to_dentry_nostale,
+	.fh_to_parent   = fat_fh_to_parent_nostale,
+	.get_parent     = fat_get_parent,
+};
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 71a600a19f0..6599222536e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -30,7 +30,7 @@
 
 static int setfl(int fd, struct file * filp, unsigned long arg)
 {
-	struct inode * inode = filp->f_path.dentry->d_inode;
+	struct inode * inode = file_inode(filp);
 	int error = 0;
 
 	/*
diff --git a/fs/fifo.c b/fs/fifo.c
deleted file mode 100644
index cf6f4345ceb..00000000000
--- a/fs/fifo.c
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- *  linux/fs/fifo.c
- *
- *  written by Paul H. Hargrove
- *
- *  Fixes:
- *	10-06-1999, AV: fixed OOM handling in fifo_open(), moved
- *			initialization there, switched to external
- *			allocation of pipe_inode_info.
- */
-
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <linux/pipe_fs_i.h>
-
-static int wait_for_partner(struct inode* inode, unsigned int *cnt)
-{
-	int cur = *cnt;	
-
-	while (cur == *cnt) {
-		pipe_wait(inode->i_pipe);
-		if (signal_pending(current))
-			break;
-	}
-	return cur == *cnt ? -ERESTARTSYS : 0;
-}
-
-static void wake_up_partner(struct inode* inode)
-{
-	wake_up_interruptible(&inode->i_pipe->wait);
-}
-
-static int fifo_open(struct inode *inode, struct file *filp)
-{
-	struct pipe_inode_info *pipe;
-	int ret;
-
-	mutex_lock(&inode->i_mutex);
-	pipe = inode->i_pipe;
-	if (!pipe) {
-		ret = -ENOMEM;
-		pipe = alloc_pipe_info(inode);
-		if (!pipe)
-			goto err_nocleanup;
-		inode->i_pipe = pipe;
-	}
-	filp->f_version = 0;
-
-	/* We can only do regular read/write on fifos */
-	filp->f_mode &= (FMODE_READ | FMODE_WRITE);
-
-	switch (filp->f_mode) {
-	case FMODE_READ:
-	/*
-	 *  O_RDONLY
-	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
-	 *  opened, even when there is no process writing the FIFO.
-	 */
-		filp->f_op = &read_pipefifo_fops;
-		pipe->r_counter++;
-		if (pipe->readers++ == 0)
-			wake_up_partner(inode);
-
-		if (!pipe->writers) {
-			if ((filp->f_flags & O_NONBLOCK)) {
-				/* suppress POLLHUP until we have
-				 * seen a writer */
-				filp->f_version = pipe->w_counter;
-			} else {
-				if (wait_for_partner(inode, &pipe->w_counter))
-					goto err_rd;
-			}
-		}
-		break;
-	
-	case FMODE_WRITE:
-	/*
-	 *  O_WRONLY
-	 *  POSIX.1 says that O_NONBLOCK means return -1 with
-	 *  errno=ENXIO when there is no process reading the FIFO.
-	 */
-		ret = -ENXIO;
-		if ((filp->f_flags & O_NONBLOCK) && !pipe->readers)
-			goto err;
-
-		filp->f_op = &write_pipefifo_fops;
-		pipe->w_counter++;
-		if (!pipe->writers++)
-			wake_up_partner(inode);
-
-		if (!pipe->readers) {
-			if (wait_for_partner(inode, &pipe->r_counter))
-				goto err_wr;
-		}
-		break;
-	
-	case FMODE_READ | FMODE_WRITE:
-	/*
-	 *  O_RDWR
-	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
-	 *  This implementation will NEVER block on a O_RDWR open, since
-	 *  the process can at least talk to itself.
-	 */
-		filp->f_op = &rdwr_pipefifo_fops;
-
-		pipe->readers++;
-		pipe->writers++;
-		pipe->r_counter++;
-		pipe->w_counter++;
-		if (pipe->readers == 1 || pipe->writers == 1)
-			wake_up_partner(inode);
-		break;
-
-	default:
-		ret = -EINVAL;
-		goto err;
-	}
-
-	/* Ok! */
-	mutex_unlock(&inode->i_mutex);
-	return 0;
-
-err_rd:
-	if (!--pipe->readers)
-		wake_up_interruptible(&pipe->wait);
-	ret = -ERESTARTSYS;
-	goto err;
-
-err_wr:
-	if (!--pipe->writers)
-		wake_up_interruptible(&pipe->wait);
-	ret = -ERESTARTSYS;
-	goto err;
-
-err:
-	if (!pipe->readers && !pipe->writers)
-		free_pipe_info(inode);
-
-err_nocleanup:
-	mutex_unlock(&inode->i_mutex);
-	return ret;
-}
-
-/*
- * Dummy default file-operations: the only thing this does
- * is contain the open that then fills in the correct operations
- * depending on the access mode of the file...
- */
-const struct file_operations def_fifo_fops = {
-	.open		= fifo_open,	/* will set read_ or write_pipefifo_fops */
-	.llseek		= noop_llseek,
-};
diff --git a/fs/file.c b/fs/file.c
index 2b3570b7cae..4a78f981557 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -23,24 +23,10 @@
 #include <linux/rcupdate.h>
 #include <linux/workqueue.h>
 
-struct fdtable_defer {
-	spinlock_t lock;
-	struct work_struct wq;
-	struct fdtable *next;
-};
-
 int sysctl_nr_open __read_mostly = 1024*1024;
 int sysctl_nr_open_min = BITS_PER_LONG;
 int sysctl_nr_open_max = 1024 * 1024; /* raised later */
 
-/*
- * We use this list to defer free fdtables that have vmalloced
- * sets/arrays. By keeping a per-cpu list, we avoid having to embed
- * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in
- * this per-task structure.
- */
-static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
-
 static void *alloc_fdmem(size_t size)
 {
 	/*
@@ -67,46 +53,9 @@ static void __free_fdtable(struct fdtable *fdt)
 	kfree(fdt);
 }
 
-static void free_fdtable_work(struct work_struct *work)
-{
-	struct fdtable_defer *f =
-		container_of(work, struct fdtable_defer, wq);
-	struct fdtable *fdt;
-
-	spin_lock_bh(&f->lock);
-	fdt = f->next;
-	f->next = NULL;
-	spin_unlock_bh(&f->lock);
-	while(fdt) {
-		struct fdtable *next = fdt->next;
-
-		__free_fdtable(fdt);
-		fdt = next;
-	}
-}
-
 static void free_fdtable_rcu(struct rcu_head *rcu)
 {
-	struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
-	struct fdtable_defer *fddef;
-
-	BUG_ON(!fdt);
-	BUG_ON(fdt->max_fds <= NR_OPEN_DEFAULT);
-
-	if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {
-		kfree(fdt->fd);
-		kfree(fdt->open_fds);
-		kfree(fdt);
-	} else {
-		fddef = &get_cpu_var(fdtable_defer_list);
-		spin_lock(&fddef->lock);
-		fdt->next = fddef->next;
-		fddef->next = fdt;
-		/* vmallocs are handled from the workqueue context */
-		schedule_work(&fddef->wq);
-		spin_unlock(&fddef->lock);
-		put_cpu_var(fdtable_defer_list);
-	}
+	__free_fdtable(container_of(rcu, struct fdtable, rcu));
 }
 
 /*
@@ -174,7 +123,6 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
 	fdt->open_fds = data;
 	data += nr / BITS_PER_BYTE;
 	fdt->close_on_exec = data;
-	fdt->next = NULL;
 
 	return fdt;
 
@@ -221,7 +169,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
 		/* Continue as planned */
 		copy_fdtable(new_fdt, cur_fdt);
 		rcu_assign_pointer(files->fdt, new_fdt);
-		if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
+		if (cur_fdt != &files->fdtab)
 			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
 	} else {
 		/* Somebody else expanded, so undo our attempt */
@@ -316,7 +264,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	new_fdt->close_on_exec = newf->close_on_exec_init;
 	new_fdt->open_fds = newf->open_fds_init;
 	new_fdt->fd = &newf->fd_array[0];
-	new_fdt->next = NULL;
 
 	spin_lock(&oldf->file_lock);
 	old_fdt = files_fdtable(oldf);
@@ -490,19 +437,8 @@ void exit_files(struct task_struct *tsk)
 	}
 }
 
-static void fdtable_defer_list_init(int cpu)
-{
-	struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
-	spin_lock_init(&fddef->lock);
-	INIT_WORK(&fddef->wq, free_fdtable_work);
-	fddef->next = NULL;
-}
-
 void __init files_defer_init(void)
 {
-	int i;
-	for_each_possible_cpu(i)
-		fdtable_defer_list_init(i);
 	sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
 			     -BITS_PER_LONG;
 }
@@ -516,7 +452,7 @@ struct files_struct init_files = {
 		.close_on_exec	= init_files.close_on_exec_init,
 		.open_fds	= init_files.open_fds_init,
 	},
-	.file_lock	= __SPIN_LOCK_UNLOCKED(init_task.file_lock),
+	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
 };
 
 /*
diff --git a/fs/file_table.c b/fs/file_table.c
index de9e9653d61..08e719b884c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -94,8 +94,8 @@ int proc_nr_files(ctl_table *table, int write,
 #endif
 
 /* Find an unused file structure and return a pointer to it.
- * Returns NULL, if there are no more free file structures or
- * we run out of memory.
+ * Returns an error pointer if some error happend e.g. we over file
+ * structures limit, run out of memory or operation is not permitted.
  *
  * Be very careful using this.  You are responsible for
  * getting write access to any mount that you might assign
@@ -107,7 +107,8 @@ struct file *get_empty_filp(void)
 {
 	const struct cred *cred = current_cred();
 	static long old_max;
-	struct file * f;
+	struct file *f;
+	int error;
 
 	/*
 	 * Privileged users can go above max_files
@@ -122,13 +123,16 @@ struct file *get_empty_filp(void)
 	}
 
 	f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
-	if (f == NULL)
-		goto fail;
+	if (unlikely(!f))
+		return ERR_PTR(-ENOMEM);
 
 	percpu_counter_inc(&nr_files);
 	f->f_cred = get_cred(cred);
-	if (security_file_alloc(f))
-		goto fail_sec;
+	error = security_file_alloc(f);
+	if (unlikely(error)) {
+		file_free(f);
+		return ERR_PTR(error);
+	}
 
 	INIT_LIST_HEAD(&f->f_u.fu_list);
 	atomic_long_set(&f->f_count, 1);
@@ -144,12 +148,7 @@ over:
 		pr_info("VFS: file-max limit %lu reached\n", get_max_files());
 		old_max = get_nr_files();
 	}
-	goto fail;
-
-fail_sec:
-	file_free(f);
-fail:
-	return NULL;
+	return ERR_PTR(-ENFILE);
 }
 
 /**
@@ -173,10 +172,11 @@ struct file *alloc_file(struct path *path, fmode_t mode,
 	struct file *file;
 
 	file = get_empty_filp();
-	if (!file)
-		return NULL;
+	if (IS_ERR(file))
+		return file;
 
 	file->f_path = *path;
+	file->f_inode = path->dentry->d_inode;
 	file->f_mapping = path->dentry->d_inode->i_mapping;
 	file->f_mode = mode;
 	file->f_op = fop;
@@ -227,7 +227,7 @@ static void __fput(struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct vfsmount *mnt = file->f_path.mnt;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_inode;
 
 	might_sleep();
 
@@ -259,6 +259,7 @@ static void __fput(struct file *file)
 		drop_file_write_access(file);
 	file->f_path.dentry = NULL;
 	file->f_path.mnt = NULL;
+	file->f_inode = NULL;
 	file_free(file);
 	dput(dentry);
 	mntput(mnt);
@@ -305,17 +306,18 @@ void fput(struct file *file)
 {
 	if (atomic_long_dec_and_test(&file->f_count)) {
 		struct task_struct *task = current;
+		unsigned long flags;
+
 		file_sb_list_del(file);
-		if (unlikely(in_interrupt() || task->flags & PF_KTHREAD)) {
-			unsigned long flags;
-			spin_lock_irqsave(&delayed_fput_lock, flags);
-			list_add(&file->f_u.fu_list, &delayed_fput_list);
-			schedule_work(&delayed_fput_work);
-			spin_unlock_irqrestore(&delayed_fput_lock, flags);
-			return;
+		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
+			init_task_work(&file->f_u.fu_rcuhead, ____fput);
+			if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
+				return;
 		}
-		init_task_work(&file->f_u.fu_rcuhead, ____fput);
-		task_work_add(task, &file->f_u.fu_rcuhead, true);
+		spin_lock_irqsave(&delayed_fput_lock, flags);
+		list_add(&file->f_u.fu_list, &delayed_fput_list);
+		schedule_work(&delayed_fput_work);
+		spin_unlock_irqrestore(&delayed_fput_lock, flags);
 	}
 }
 
@@ -447,7 +449,7 @@ void mark_files_ro(struct super_block *sb)
 
 	lg_global_lock(&files_lglock);
 	do_file_list_for_each_entry(sb, f) {
-		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
+		if (!S_ISREG(file_inode(f)->i_mode))
 		       continue;
 		if (!file_count(f))
 			continue;
diff --git a/fs/filesystems.c b/fs/filesystems.c
index da165f6adcb..92567d95ba6 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -273,7 +273,7 @@ struct file_system_type *get_fs_type(const char *name)
 	int len = dot ? dot - name : strlen(name);
 
 	fs = __get_fs_type(name, len);
-	if (!fs && (request_module("%.*s", len, name) == 0))
+	if (!fs && (request_module("fs-%.*s", len, name) == 0))
 		fs = __get_fs_type(name, len);
 
 	if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index bd447e88f20..25d4099a4ae 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -49,7 +49,7 @@
 
 
 static struct dentry *	vxfs_lookup(struct inode *, struct dentry *, unsigned int);
-static int		vxfs_readdir(struct file *, void *, filldir_t);
+static int		vxfs_readdir(struct file *, struct dir_context *);
 
 const struct inode_operations vxfs_dir_inode_ops = {
 	.lookup =		vxfs_lookup,
@@ -58,7 +58,7 @@ const struct inode_operations vxfs_dir_inode_ops = {
 const struct file_operations vxfs_dir_operations = {
 	.llseek =		generic_file_llseek,
 	.read =			generic_read_dir,
-	.readdir =		vxfs_readdir,
+	.iterate =		vxfs_readdir,
 };
 
  
@@ -235,28 +235,25 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags)
  *   Zero.
  */
 static int
-vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
+vxfs_readdir(struct file *fp, struct dir_context *ctx)
 {
-	struct inode		*ip = fp->f_path.dentry->d_inode;
+	struct inode		*ip = file_inode(fp);
 	struct super_block	*sbp = ip->i_sb;
 	u_long			bsize = sbp->s_blocksize;
 	u_long			page, npages, block, pblocks, nblocks, offset;
 	loff_t			pos;
 
-	switch ((long)fp->f_pos) {
-	case 0:
-		if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0)
-			goto out;
-		fp->f_pos++;
-		/* fallthrough */
-	case 1:
-		if (filler(retp, "..", 2, fp->f_pos, VXFS_INO(ip)->vii_dotdot, DT_DIR) < 0)
-			goto out;
-		fp->f_pos++;
-		/* fallthrough */
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(fp, ctx))
+			return 0;
+		ctx->pos = 1;
 	}
-
-	pos = fp->f_pos - 2;
+	if (ctx->pos == 1) {
+		if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR))
+			return 0;
+		ctx->pos = 2;
+	}
+	pos = ctx->pos - 2;
 	
 	if (pos > VXFS_DIRROUND(ip->i_size))
 		return 0;
@@ -270,16 +267,16 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
 	block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks;
 
 	for (; page < npages; page++, block = 0) {
-		caddr_t			kaddr;
+		char			*kaddr;
 		struct page		*pp;
 
 		pp = vxfs_get_page(ip->i_mapping, page);
 		if (IS_ERR(pp))
 			continue;
-		kaddr = (caddr_t)page_address(pp);
+		kaddr = (char *)page_address(pp);
 
 		for (; block <= nblocks && block <= pblocks; block++) {
-			caddr_t			baddr, limit;
+			char			*baddr, *limit;
 			struct vxfs_dirblk	*dbp;
 			struct vxfs_direct	*de;
 
@@ -292,21 +289,18 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
 				 (kaddr + offset) :
 				 (baddr + VXFS_DIRBLKOV(dbp)));
 
-			for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) {
-				int	over;
-
+			for (; (char *)de <= limit; de = vxfs_next_entry(de)) {
 				if (!de->d_reclen)
 					break;
 				if (!de->d_ino)
 					continue;
 
-				offset = (caddr_t)de - kaddr;
-				over = filler(retp, de->d_name, de->d_namelen,
-					((page << PAGE_CACHE_SHIFT) | offset) + 2,
-					de->d_ino, DT_UNKNOWN);
-				if (over) {
+				offset = (char *)de - kaddr;
+				ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
+				if (!dir_emit(ctx, de->d_name, de->d_namelen,
+					de->d_ino, DT_UNKNOWN)) {
 					vxfs_put_page(pp);
-					goto done;
+					return 0;
 				}
 			}
 			offset = 0;
@@ -314,9 +308,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
 		vxfs_put_page(pp);
 		offset = 0;
 	}
-
-done:
-	fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
-out:
+	ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
 	return 0;
 }
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index fed2c8afb3a..e37eb274e49 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -52,7 +52,6 @@ MODULE_AUTHOR("Christoph Hellwig");
 MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver");
 MODULE_LICENSE("Dual BSD/GPL");
 
-MODULE_ALIAS("vxfs"); /* makes mount -t vxfs autoload the module */
 
 
 static void		vxfs_put_super(struct super_block *);
@@ -258,6 +257,8 @@ static struct file_system_type vxfs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("vxfs"); /* makes mount -t vxfs autoload the module */
+MODULE_ALIAS("vxfs");
 
 static int __init
 vxfs_init(void)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 310972b72a6..68851ff2fd4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -22,7 +22,6 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
-#include <linux/freezer.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
@@ -46,6 +45,7 @@ struct wb_writeback_work {
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
+	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
 	enum wb_reason reason;		/* why was writeback initiated? */
 
 	struct list_head list;		/* pending work list */
@@ -88,20 +88,6 @@ static inline struct inode *wb_inode(struct list_head *head)
 #define CREATE_TRACE_POINTS
 #include <trace/events/writeback.h>
 
-/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
-static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
-{
-	if (bdi->wb.task) {
-		wake_up_process(bdi->wb.task);
-	} else {
-		/*
-		 * The bdi thread isn't there, wake up the forker thread which
-		 * will create and run it.
-		 */
-		wake_up_process(default_backing_dev_info.wb.task);
-	}
-}
-
 static void bdi_queue_work(struct backing_dev_info *bdi,
 			   struct wb_writeback_work *work)
 {
@@ -109,10 +95,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 
 	spin_lock_bh(&bdi->wb_lock);
 	list_add_tail(&work->list, &bdi->work_list);
-	if (!bdi->wb.task)
-		trace_writeback_nothread(bdi, work);
-	bdi_wakeup_flusher(bdi);
 	spin_unlock_bh(&bdi->wb_lock);
+
+	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 }
 
 static void
@@ -127,10 +112,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	 */
 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work) {
-		if (bdi->wb.task) {
-			trace_writeback_nowork(bdi);
-			wake_up_process(bdi->wb.task);
-		}
+		trace_writeback_nowork(bdi);
+		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 		return;
 	}
 
@@ -177,9 +160,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
 	 * writeback as soon as there is no other work to do.
 	 */
 	trace_writeback_wake_background(bdi);
-	spin_lock_bh(&bdi->wb_lock);
-	bdi_wakeup_flusher(bdi);
-	spin_unlock_bh(&bdi->wb_lock);
+	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 }
 
 /*
@@ -318,8 +299,14 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
 
 static int write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
-		return inode->i_sb->s_op->write_inode(inode, wbc);
+	int ret;
+
+	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
+		trace_writeback_write_inode_start(inode, wbc);
+		ret = inode->i_sb->s_op->write_inode(inode, wbc);
+		trace_writeback_write_inode(inode, wbc);
+		return ret;
+	}
 	return 0;
 }
 
@@ -450,14 +437,18 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	WARN_ON(!(inode->i_state & I_SYNC));
 
+	trace_writeback_single_inode_start(inode, wbc, nr_to_write);
+
 	ret = do_writepages(mapping, wbc);
 
 	/*
 	 * Make sure to wait on the data before writing out the metadata.
 	 * This is important for filesystems that modify metadata on data
-	 * I/O completion.
+	 * I/O completion. We don't do it for sync(2) writeback because it has a
+	 * separate, external IO completion path and ->sync_fs for guaranteeing
+	 * inode metadata is written back correctly.
 	 */
-	if (wbc->sync_mode == WB_SYNC_ALL) {
+	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
 		int err = filemap_fdatawait(mapping);
 		if (ret == 0)
 			ret = err;
@@ -590,6 +581,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 		.tagged_writepages	= work->tagged_writepages,
 		.for_kupdate		= work->for_kupdate,
 		.for_background		= work->for_background,
+		.for_sync		= work->for_sync,
 		.range_cyclic		= work->range_cyclic,
 		.range_start		= 0,
 		.range_end		= LLONG_MAX,
@@ -971,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 /*
  * Retrieve work items and do the writeback they describe
  */
-long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+static long wb_do_writeback(struct bdi_writeback *wb)
 {
 	struct backing_dev_info *bdi = wb->bdi;
 	struct wb_writeback_work *work;
@@ -979,12 +971,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 	set_bit(BDI_writeback_running, &wb->bdi->state);
 	while ((work = get_next_work_item(bdi)) != NULL) {
-		/*
-		 * Override sync mode, in case we must wait for completion
-		 * because this thread is exiting now.
-		 */
-		if (force_wait)
-			work->sync_mode = WB_SYNC_ALL;
 
 		trace_writeback_exec(bdi, work);
 
@@ -1012,66 +998,49 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 /*
  * Handle writeback of dirty data for the device backed by this bdi. Also
- * wakes up periodically and does kupdated style flushing.
+ * reschedules periodically and does kupdated style flushing.
  */
-int bdi_writeback_thread(void *data)
+void bdi_writeback_workfn(struct work_struct *work)
 {
-	struct bdi_writeback *wb = data;
+	struct bdi_writeback *wb = container_of(to_delayed_work(work),
+						struct bdi_writeback, dwork);
 	struct backing_dev_info *bdi = wb->bdi;
 	long pages_written;
 
+	set_worker_desc("flush-%s", dev_name(bdi->dev));
 	current->flags |= PF_SWAPWRITE;
-	set_freezable();
-	wb->last_active = jiffies;
 
-	/*
-	 * Our parent may run at a different priority, just set us to normal
-	 */
-	set_user_nice(current, 0);
-
-	trace_writeback_thread_start(bdi);
-
-	while (!kthread_freezable_should_stop(NULL)) {
+	if (likely(!current_is_workqueue_rescuer() ||
+		   list_empty(&bdi->bdi_list))) {
 		/*
-		 * Remove own delayed wake-up timer, since we are already awake
-		 * and we'll take care of the periodic write-back.
+		 * The normal path.  Keep writing back @bdi until its
+		 * work_list is empty.  Note that this path is also taken
+		 * if @bdi is shutting down even when we're running off the
+		 * rescuer as work_list needs to be drained.
 		 */
-		del_timer(&wb->wakeup_timer);
-
-		pages_written = wb_do_writeback(wb, 0);
-
+		do {
+			pages_written = wb_do_writeback(wb);
+			trace_writeback_pages_written(pages_written);
+		} while (!list_empty(&bdi->work_list));
+	} else {
+		/*
+		 * bdi_wq can't get enough workers and we're running off
+		 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
+		 * enough for efficient IO.
+		 */
+		pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+						    WB_REASON_FORKER_THREAD);
 		trace_writeback_pages_written(pages_written);
-
-		if (pages_written)
-			wb->last_active = jiffies;
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
-			__set_current_state(TASK_RUNNING);
-			continue;
-		}
-
-		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-		else {
-			/*
-			 * We have nothing to do, so can go sleep without any
-			 * timeout and save power. When a work is queued or
-			 * something is made dirty - we will be woken up.
-			 */
-			schedule();
-		}
 	}
 
-	/* Flush any work that raced with us exiting */
-	if (!list_empty(&bdi->work_list))
-		wb_do_writeback(wb, 1);
+	if (!list_empty(&bdi->work_list) ||
+	    (wb_has_dirty_io(wb) && dirty_writeback_interval))
+		queue_delayed_work(bdi_wq, &wb->dwork,
+			msecs_to_jiffies(dirty_writeback_interval * 10));
 
-	trace_writeback_thread_stop(bdi);
-	return 0;
+	current->flags &= ~PF_SWAPWRITE;
 }
 
-
 /*
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.
@@ -1150,8 +1119,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	 * dirty the inode itself
 	 */
 	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+		trace_writeback_dirty_inode_start(inode, flags);
+
 		if (sb->s_op->dirty_inode)
 			sb->s_op->dirty_inode(inode, flags);
+
+		trace_writeback_dirty_inode(inode, flags);
 	}
 
 	/*
@@ -1332,47 +1305,43 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
 EXPORT_SYMBOL(writeback_inodes_sb);
 
 /**
- * writeback_inodes_sb_if_idle	-	start writeback if none underway
+ * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
  * @sb: the superblock
- * @reason: reason why some writeback work was initiated
+ * @nr: the number of pages to write
+ * @reason: the reason of writeback
  *
- * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
  * Returns 1 if writeback was started, 0 if not.
  */
-int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
+int try_to_writeback_inodes_sb_nr(struct super_block *sb,
+				  unsigned long nr,
+				  enum wb_reason reason)
 {
-	if (!writeback_in_progress(sb->s_bdi)) {
-		down_read(&sb->s_umount);
-		writeback_inodes_sb(sb, reason);
-		up_read(&sb->s_umount);
+	if (writeback_in_progress(sb->s_bdi))
 		return 1;
-	} else
+
+	if (!down_read_trylock(&sb->s_umount))
 		return 0;
+
+	writeback_inodes_sb_nr(sb, nr, reason);
+	up_read(&sb->s_umount);
+	return 1;
 }
-EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
+EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
 
 /**
- * writeback_inodes_sb_nr_if_idle	-	start writeback if none underway
+ * try_to_writeback_inodes_sb - try to start writeback if none underway
  * @sb: the superblock
- * @nr: the number of pages to write
  * @reason: reason why some writeback work was initiated
  *
- * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Implement by try_to_writeback_inodes_sb_nr()
  * Returns 1 if writeback was started, 0 if not.
  */
-int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
-				   unsigned long nr,
-				   enum wb_reason reason)
+int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
 {
-	if (!writeback_in_progress(sb->s_bdi)) {
-		down_read(&sb->s_umount);
-		writeback_inodes_sb_nr(sb, nr, reason);
-		up_read(&sb->s_umount);
-		return 1;
-	} else
-		return 0;
+	return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
 }
-EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
+EXPORT_SYMBOL(try_to_writeback_inodes_sb);
 
 /**
  * sync_inodes_sb	-	sync sb inode pages
@@ -1391,6 +1360,7 @@ void sync_inodes_sb(struct super_block *sb)
 		.range_cyclic	= 0,
 		.done		= &done,
 		.reason		= WB_REASON_SYNC,
+		.for_sync	= 1,
 	};
 
 	/* Nothing to do? */
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index fe6ca583bbc..d8ac61d0c93 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -10,7 +10,7 @@
  * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
  * It can block.
  */
-void set_fs_root(struct fs_struct *fs, struct path *path)
+void set_fs_root(struct fs_struct *fs, const struct path *path)
 {
 	struct path old_root;
 
@@ -29,7 +29,7 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
  * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
  * It can block.
  */
-void set_fs_pwd(struct fs_struct *fs, struct path *path)
+void set_fs_pwd(struct fs_struct *fs, const struct path *path)
 {
 	struct path old_pwd;
 
@@ -53,7 +53,7 @@ static inline int replace_path(struct path *p, const struct path *old, const str
 	return 1;
 }
 
-void chroot_fs_refs(struct path *old_root, struct path *new_root)
+void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
 {
 	struct task_struct *g, *p;
 	struct fs_struct *fs;
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index b52aed1dca9..f7cff367db7 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -115,7 +115,7 @@ struct fscache_cache *fscache_select_cache_for_object(
 				     struct fscache_object, cookie_link);
 
 		cache = object->cache;
-		if (object->state >= FSCACHE_OBJECT_DYING ||
+		if (fscache_object_is_dying(object) ||
 		    test_bit(FSCACHE_IOERROR, &cache->flags))
 			cache = NULL;
 
@@ -224,8 +224,10 @@ int fscache_add_cache(struct fscache_cache *cache,
 	BUG_ON(!ifsdef);
 
 	cache->flags = 0;
-	ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
-	ifsdef->state = FSCACHE_OBJECT_ACTIVE;
+	ifsdef->event_mask =
+		((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) &
+		~(1 << FSCACHE_OBJECT_EV_CLEARED);
+	__set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags);
 
 	if (!tagname)
 		tagname = cache->identifier;
@@ -330,25 +332,25 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache,
 {
 	struct fscache_object *object;
 
-	spin_lock(&cache->object_list_lock);
-
 	while (!list_empty(&cache->object_list)) {
-		object = list_entry(cache->object_list.next,
-				    struct fscache_object, cache_link);
-		list_move_tail(&object->cache_link, dying_objects);
+		spin_lock(&cache->object_list_lock);
 
-		_debug("withdraw %p", object->cookie);
+		if (!list_empty(&cache->object_list)) {
+			object = list_entry(cache->object_list.next,
+					    struct fscache_object, cache_link);
+			list_move_tail(&object->cache_link, dying_objects);
 
-		spin_lock(&object->lock);
-		spin_unlock(&cache->object_list_lock);
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
-		spin_unlock(&object->lock);
+			_debug("withdraw %p", object->cookie);
+
+			/* This must be done under object_list_lock to prevent
+			 * a race with fscache_drop_object().
+			 */
+			fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+		}
 
+		spin_unlock(&cache->object_list_lock);
 		cond_resched();
-		spin_lock(&cache->object_list_lock);
 	}
-
-	spin_unlock(&cache->object_list_lock);
 }
 
 /**
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 8dcb114758e..0e91a3c9fdb 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -95,6 +95,11 @@ struct fscache_cookie *__fscache_acquire_cookie(
 	atomic_set(&cookie->usage, 1);
 	atomic_set(&cookie->n_children, 0);
 
+	/* We keep the active count elevated until relinquishment to prevent an
+	 * attempt to wake up every time the object operations queue quiesces.
+	 */
+	atomic_set(&cookie->n_active, 1);
+
 	atomic_inc(&parent->usage);
 	atomic_inc(&parent->n_children);
 
@@ -177,7 +182,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
 
 	cookie->flags =
 		(1 << FSCACHE_COOKIE_LOOKING_UP) |
-		(1 << FSCACHE_COOKIE_CREATING) |
 		(1 << FSCACHE_COOKIE_NO_DATA_YET);
 
 	/* ask the cache to allocate objects for this cookie and its parent
@@ -205,7 +209,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
 
 	/* initiate the process of looking up all the objects in the chain
 	 * (done by fscache_initialise_object()) */
-	fscache_enqueue_object(object);
+	fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD);
 
 	spin_unlock(&cookie->lock);
 
@@ -237,13 +241,12 @@ static int fscache_alloc_object(struct fscache_cache *cache,
 				struct fscache_cookie *cookie)
 {
 	struct fscache_object *object;
-	struct hlist_node *_n;
 	int ret;
 
 	_enter("%p,%p{%s}", cache, cookie, cookie->def->name);
 
 	spin_lock(&cookie->lock);
-	hlist_for_each_entry(object, _n, &cookie->backing_objects,
+	hlist_for_each_entry(object, &cookie->backing_objects,
 			     cookie_link) {
 		if (object->cache == cache)
 			goto object_already_extant;
@@ -286,7 +289,7 @@ static int fscache_alloc_object(struct fscache_cache *cache,
 
 object_already_extant:
 	ret = -ENOBUFS;
-	if (object->state >= FSCACHE_OBJECT_DYING) {
+	if (fscache_object_is_dead(object)) {
 		spin_unlock(&cookie->lock);
 		goto error;
 	}
@@ -311,7 +314,6 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
 {
 	struct fscache_object *p;
 	struct fscache_cache *cache = object->cache;
-	struct hlist_node *_n;
 	int ret;
 
 	_enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id);
@@ -321,9 +323,9 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
 	/* there may be multiple initial creations of this object, but we only
 	 * want one */
 	ret = -EEXIST;
-	hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) {
+	hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) {
 		if (p->cache == object->cache) {
-			if (p->state >= FSCACHE_OBJECT_DYING)
+			if (fscache_object_is_dying(p))
 				ret = -ENOBUFS;
 			goto cant_attach_object;
 		}
@@ -331,10 +333,10 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
 
 	/* pin the parent object */
 	spin_lock_nested(&cookie->parent->lock, 1);
-	hlist_for_each_entry(p, _n, &cookie->parent->backing_objects,
+	hlist_for_each_entry(p, &cookie->parent->backing_objects,
 			     cookie_link) {
 		if (p->cache == object->cache) {
-			if (p->state >= FSCACHE_OBJECT_DYING) {
+			if (fscache_object_is_dying(p)) {
 				ret = -ENOBUFS;
 				spin_unlock(&cookie->parent->lock);
 				goto cant_attach_object;
@@ -402,7 +404,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie)
 			object = hlist_entry(cookie->backing_objects.first,
 					     struct fscache_object,
 					     cookie_link);
-			if (object->state < FSCACHE_OBJECT_DYING)
+			if (fscache_object_is_live(object))
 				fscache_raise_event(
 					object, FSCACHE_OBJECT_EV_INVALIDATE);
 		}
@@ -435,7 +437,6 @@ EXPORT_SYMBOL(__fscache_wait_on_invalidate);
 void __fscache_update_cookie(struct fscache_cookie *cookie)
 {
 	struct fscache_object *object;
-	struct hlist_node *_p;
 
 	fscache_stat(&fscache_n_updates);
 
@@ -452,7 +453,7 @@ void __fscache_update_cookie(struct fscache_cookie *cookie)
 	spin_lock(&cookie->lock);
 
 	/* update the index entry on disk in each cache backing this cookie */
-	hlist_for_each_entry(object, _p,
+	hlist_for_each_entry(object,
 			     &cookie->backing_objects, cookie_link) {
 		fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
 	}
@@ -470,9 +471,7 @@ EXPORT_SYMBOL(__fscache_update_cookie);
  */
 void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
 {
-	struct fscache_cache *cache;
 	struct fscache_object *object;
-	unsigned long event;
 
 	fscache_stat(&fscache_n_relinquishes);
 	if (retire)
@@ -484,8 +483,11 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
 		return;
 	}
 
-	_enter("%p{%s,%p},%d",
-	       cookie, cookie->def->name, cookie->netfs_data, retire);
+	_enter("%p{%s,%p,%d},%d",
+	       cookie, cookie->def->name, cookie->netfs_data,
+	       atomic_read(&cookie->n_active), retire);
+
+	ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
 
 	if (atomic_read(&cookie->n_children) != 0) {
 		printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
@@ -493,62 +495,28 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
 		BUG();
 	}
 
-	/* wait for the cookie to finish being instantiated (or to fail) */
-	if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
-		fscache_stat(&fscache_n_relinquishes_waitcrt);
-		wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
-			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
-	}
-
-	event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
+	/* No further netfs-accessing operations on this cookie permitted */
+	set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
+	if (retire)
+		set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
 
-try_again:
 	spin_lock(&cookie->lock);
-
-	/* break links with all the active objects */
-	while (!hlist_empty(&cookie->backing_objects)) {
-		int n_reads;
-		object = hlist_entry(cookie->backing_objects.first,
-				     struct fscache_object,
-				     cookie_link);
-
-		_debug("RELEASE OBJ%x", object->debug_id);
-
-		set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags);
-		n_reads = atomic_read(&object->n_reads);
-		if (n_reads) {
-			int n_ops = object->n_ops;
-			int n_in_progress = object->n_in_progress;
-			spin_unlock(&cookie->lock);
-			printk(KERN_ERR "FS-Cache:"
-			       " Cookie '%s' still has %d outstanding reads (%d,%d)\n",
-			       cookie->def->name,
-			       n_reads, n_ops, n_in_progress);
-			wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS,
-				    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
-			printk("Wait finished\n");
-			goto try_again;
-		}
-
-		/* detach each cache object from the object cookie */
-		spin_lock(&object->lock);
-		hlist_del_init(&object->cookie_link);
-
-		cache = object->cache;
-		object->cookie = NULL;
-		fscache_raise_event(object, event);
-		spin_unlock(&object->lock);
-
-		if (atomic_dec_and_test(&cookie->usage))
-			/* the cookie refcount shouldn't be reduced to 0 yet */
-			BUG();
+	hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
+		fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
 	}
+	spin_unlock(&cookie->lock);
 
-	/* detach pointers back to the netfs */
+	/* Wait for cessation of activity requiring access to the netfs (when
+	 * n_active reaches 0).
+	 */
+	if (!atomic_dec_and_test(&cookie->n_active))
+		wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
+				 TASK_UNINTERRUPTIBLE);
+
+	/* Clear pointers back to the netfs */
 	cookie->netfs_data	= NULL;
 	cookie->def		= NULL;
-
-	spin_unlock(&cookie->lock);
+	BUG_ON(cookie->stores.rnode);
 
 	if (cookie->parent) {
 		ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
@@ -556,7 +524,7 @@ try_again:
 		atomic_dec(&cookie->parent->n_children);
 	}
 
-	/* finally dispose of the cookie */
+	/* Dispose of the netfs's link to the cookie */
 	ASSERTCMP(atomic_read(&cookie->usage), >, 0);
 	fscache_cookie_put(cookie);
 
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index f5b4baee735..10a2ade0bdf 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -55,6 +55,7 @@ static struct fscache_cookie_def fscache_fsdef_index_def = {
 
 struct fscache_cookie fscache_fsdef_index = {
 	.usage		= ATOMIC_INIT(1),
+	.n_active	= ATOMIC_INIT(1),
 	.lock		= __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
 	.backing_objects = HLIST_HEAD_INIT,
 	.def		= &fscache_fsdef_index_def,
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index ee38fef4be5..12d505bedb5 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -93,14 +93,11 @@ static inline bool fscache_object_congested(void)
 
 extern int fscache_wait_bit(void *);
 extern int fscache_wait_bit_interruptible(void *);
+extern int fscache_wait_atomic_t(atomic_t *);
 
 /*
  * object.c
  */
-extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5];
-
-extern void fscache_withdrawing_object(struct fscache_cache *,
-				       struct fscache_object *);
 extern void fscache_enqueue_object(struct fscache_object *);
 
 /*
@@ -110,8 +107,10 @@ extern void fscache_enqueue_object(struct fscache_object *);
 extern const struct file_operations fscache_objlist_fops;
 
 extern void fscache_objlist_add(struct fscache_object *);
+extern void fscache_objlist_remove(struct fscache_object *);
 #else
 #define fscache_objlist_add(object) do {} while(0)
+#define fscache_objlist_remove(object) do {} while(0)
 #endif
 
 /*
@@ -291,6 +290,10 @@ static inline void fscache_raise_event(struct fscache_object *object,
 				       unsigned event)
 {
 	BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
+#if 0
+	printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n",
+	       object->debug_id, object->event_mask, (1 << event));
+#endif
 	if (!test_and_set_bit(event, &object->events) &&
 	    test_bit(event, &object->event_mask))
 		fscache_enqueue_object(object);
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index f9d856773f7..7c27907e650 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -205,7 +205,6 @@ int fscache_wait_bit(void *flags)
 	schedule();
 	return 0;
 }
-EXPORT_SYMBOL(fscache_wait_bit);
 
 /*
  * wait_on_bit() sleep function for interruptible waiting
@@ -215,4 +214,12 @@ int fscache_wait_bit_interruptible(void *flags)
 	schedule();
 	return signal_pending(current);
 }
-EXPORT_SYMBOL(fscache_wait_bit_interruptible);
+
+/*
+ * wait_on_atomic_t() sleep function for uninterruptible waiting
+ */
+int fscache_wait_atomic_t(atomic_t *p)
+{
+	schedule();
+	return 0;
+}
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index e028b8eb1c4..b1bb6117473 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -40,6 +40,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
 	/* initialise the primary index cookie */
 	atomic_set(&netfs->primary_index->usage, 1);
 	atomic_set(&netfs->primary_index->n_children, 0);
+	atomic_set(&netfs->primary_index->n_active, 1);
 
 	netfs->primary_index->def		= &fscache_fsdef_netfs_def;
 	netfs->primary_index->parent		= &fscache_fsdef_index;
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index f27c89d1788..e1959efad64 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -70,13 +70,10 @@ void fscache_objlist_add(struct fscache_object *obj)
 	write_unlock(&fscache_object_list_lock);
 }
 
-/**
- * fscache_object_destroy - Note that a cache object is about to be destroyed
- * @object: The object to be destroyed
- *
- * Note the imminent destruction and deallocation of a cache object record.
+/*
+ * Remove an object from the object list.
  */
-void fscache_object_destroy(struct fscache_object *obj)
+void fscache_objlist_remove(struct fscache_object *obj)
 {
 	write_lock(&fscache_object_list_lock);
 
@@ -85,7 +82,6 @@ void fscache_object_destroy(struct fscache_object *obj)
 
 	write_unlock(&fscache_object_list_lock);
 }
-EXPORT_SYMBOL(fscache_object_destroy);
 
 /*
  * find the object in the tree on or after the specified index
@@ -166,15 +162,14 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 {
 	struct fscache_objlist_data *data = m->private;
 	struct fscache_object *obj = v;
+	struct fscache_cookie *cookie;
 	unsigned long config = data->config;
-	uint16_t keylen, auxlen;
 	char _type[3], *type;
-	bool no_cookie;
 	u8 *buf = data->buf, *p;
 
 	if ((unsigned long) v == 1) {
 		seq_puts(m, "OBJECT   PARENT   STAT CHLDN OPS OOP IPR EX READS"
-			 " EM EV F S"
+			 " EM EV FL S"
 			 " | NETFS_COOKIE_DEF TY FL NETFS_DATA");
 		if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
 			      FSCACHE_OBJLIST_CONFIG_AUX))
@@ -193,7 +188,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 
 	if ((unsigned long) v == 2) {
 		seq_puts(m, "======== ======== ==== ===== === === === == ====="
-			 " == == = ="
+			 " == == == ="
 			 " | ================ == == ================");
 		if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
 			      FSCACHE_OBJLIST_CONFIG_AUX))
@@ -216,10 +211,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 		}							\
 	} while(0)
 
+	cookie = obj->cookie;
 	if (~config) {
-		FILTER(obj->cookie,
+		FILTER(cookie->def,
 		       COOKIE, NOCOOKIE);
-		FILTER(obj->state != FSCACHE_OBJECT_ACTIVE ||
+		FILTER(fscache_object_is_active(obj) ||
 		       obj->n_ops != 0 ||
 		       obj->n_obj_ops != 0 ||
 		       obj->flags ||
@@ -235,10 +231,10 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 	}
 
 	seq_printf(m,
-		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
+		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ",
 		   obj->debug_id,
 		   obj->parent ? obj->parent->debug_id : -1,
-		   fscache_object_states_short[obj->state],
+		   obj->state->short_name,
 		   obj->n_children,
 		   obj->n_ops,
 		   obj->n_obj_ops,
@@ -250,48 +246,40 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 		   obj->flags,
 		   work_busy(&obj->work));
 
-	no_cookie = true;
-	keylen = auxlen = 0;
-	if (obj->cookie) {
-		spin_lock(&obj->lock);
-		if (obj->cookie) {
-			switch (obj->cookie->def->type) {
-			case 0:
-				type = "IX";
-				break;
-			case 1:
-				type = "DT";
-				break;
-			default:
-				sprintf(_type, "%02u",
-					obj->cookie->def->type);
-				type = _type;
-				break;
-			}
+	if (fscache_use_cookie(obj)) {
+		uint16_t keylen = 0, auxlen = 0;
 
-			seq_printf(m, "%-16s %s %2lx %16p",
-				   obj->cookie->def->name,
-				   type,
-				   obj->cookie->flags,
-				   obj->cookie->netfs_data);
-
-			if (obj->cookie->def->get_key &&
-			    config & FSCACHE_OBJLIST_CONFIG_KEY)
-				keylen = obj->cookie->def->get_key(
-					obj->cookie->netfs_data,
-					buf, 400);
-
-			if (obj->cookie->def->get_aux &&
-			    config & FSCACHE_OBJLIST_CONFIG_AUX)
-				auxlen = obj->cookie->def->get_aux(
-					obj->cookie->netfs_data,
-					buf + keylen, 512 - keylen);
-
-			no_cookie = false;
+		switch (cookie->def->type) {
+		case 0:
+			type = "IX";
+			break;
+		case 1:
+			type = "DT";
+			break;
+		default:
+			sprintf(_type, "%02u", cookie->def->type);
+			type = _type;
+			break;
 		}
-		spin_unlock(&obj->lock);
 
-		if (!no_cookie && (keylen > 0 || auxlen > 0)) {
+		seq_printf(m, "%-16s %s %2lx %16p",
+			   cookie->def->name,
+			   type,
+			   cookie->flags,
+			   cookie->netfs_data);
+
+		if (cookie->def->get_key &&
+		    config & FSCACHE_OBJLIST_CONFIG_KEY)
+			keylen = cookie->def->get_key(cookie->netfs_data,
+						      buf, 400);
+
+		if (cookie->def->get_aux &&
+		    config & FSCACHE_OBJLIST_CONFIG_AUX)
+			auxlen = cookie->def->get_aux(cookie->netfs_data,
+						      buf + keylen, 512 - keylen);
+		fscache_unuse_cookie(obj);
+
+		if (keylen > 0 || auxlen > 0) {
 			seq_printf(m, " ");
 			for (p = buf; keylen > 0; keylen--)
 				seq_printf(m, "%02x", *p++);
@@ -302,12 +290,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 					seq_printf(m, "%02x", *p++);
 			}
 		}
-	}
 
-	if (no_cookie)
-		seq_printf(m, "<no_cookie>\n");
-	else
 		seq_printf(m, "\n");
+	} else {
+		seq_printf(m, "<no_netfs>\n");
+	}
 	return 0;
 }
 
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 50d41c18021..86d75a60b20 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -15,52 +15,131 @@
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/prefetch.h>
 #include "internal.h"
 
-const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
-	[FSCACHE_OBJECT_INIT]		= "OBJECT_INIT",
-	[FSCACHE_OBJECT_LOOKING_UP]	= "OBJECT_LOOKING_UP",
-	[FSCACHE_OBJECT_CREATING]	= "OBJECT_CREATING",
-	[FSCACHE_OBJECT_AVAILABLE]	= "OBJECT_AVAILABLE",
-	[FSCACHE_OBJECT_ACTIVE]		= "OBJECT_ACTIVE",
-	[FSCACHE_OBJECT_INVALIDATING]	= "OBJECT_INVALIDATING",
-	[FSCACHE_OBJECT_UPDATING]	= "OBJECT_UPDATING",
-	[FSCACHE_OBJECT_DYING]		= "OBJECT_DYING",
-	[FSCACHE_OBJECT_LC_DYING]	= "OBJECT_LC_DYING",
-	[FSCACHE_OBJECT_ABORT_INIT]	= "OBJECT_ABORT_INIT",
-	[FSCACHE_OBJECT_RELEASING]	= "OBJECT_RELEASING",
-	[FSCACHE_OBJECT_RECYCLING]	= "OBJECT_RECYCLING",
-	[FSCACHE_OBJECT_WITHDRAWING]	= "OBJECT_WITHDRAWING",
-	[FSCACHE_OBJECT_DEAD]		= "OBJECT_DEAD",
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_drop_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int);
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_object_available(struct fscache_object *, int);
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int);
+static const struct fscache_state *fscache_update_object(struct fscache_object *, int);
+
+#define __STATE_NAME(n) fscache_osm_##n
+#define STATE(n) (&__STATE_NAME(n))
+
+/*
+ * Define a work state.  Work states are execution states.  No event processing
+ * is performed by them.  The function attached to a work state returns a
+ * pointer indicating the next state to which the state machine should
+ * transition.  Returning NO_TRANSIT repeats the current state, but goes back
+ * to the scheduler first.
+ */
+#define WORK_STATE(n, sn, f) \
+	const struct fscache_state __STATE_NAME(n) = {			\
+		.name = #n,						\
+		.short_name = sn,					\
+		.work = f						\
+	}
+
+/*
+ * Returns from work states.
+ */
+#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); })
+
+#define NO_TRANSIT ((struct fscache_state *)NULL)
+
+/*
+ * Define a wait state.  Wait states are event processing states.  No execution
+ * is performed by them.  Wait states are just tables of "if event X occurs,
+ * clear it and transition to state Y".  The dispatcher returns to the
+ * scheduler if none of the events in which the wait state has an interest are
+ * currently pending.
+ */
+#define WAIT_STATE(n, sn, ...) \
+	const struct fscache_state __STATE_NAME(n) = {			\
+		.name = #n,						\
+		.short_name = sn,					\
+		.work = NULL,						\
+		.transitions = { __VA_ARGS__, { 0, NULL } }		\
+	}
+
+#define TRANSIT_TO(state, emask) \
+	{ .events = (emask), .transit_to = STATE(state) }
+
+/*
+ * The object state machine.
+ */
+static WORK_STATE(INIT_OBJECT,		"INIT", fscache_initialise_object);
+static WORK_STATE(PARENT_READY,		"PRDY", fscache_parent_ready);
+static WORK_STATE(ABORT_INIT,		"ABRT", fscache_abort_initialisation);
+static WORK_STATE(LOOK_UP_OBJECT,	"LOOK", fscache_look_up_object);
+static WORK_STATE(CREATE_OBJECT,	"CRTO", fscache_look_up_object);
+static WORK_STATE(OBJECT_AVAILABLE,	"AVBL", fscache_object_available);
+static WORK_STATE(JUMPSTART_DEPS,	"JUMP", fscache_jumpstart_dependents);
+
+static WORK_STATE(INVALIDATE_OBJECT,	"INVL", fscache_invalidate_object);
+static WORK_STATE(UPDATE_OBJECT,	"UPDT", fscache_update_object);
+
+static WORK_STATE(LOOKUP_FAILURE,	"LCFL", fscache_lookup_failure);
+static WORK_STATE(KILL_OBJECT,		"KILL", fscache_kill_object);
+static WORK_STATE(KILL_DEPENDENTS,	"KDEP", fscache_kill_dependents);
+static WORK_STATE(DROP_OBJECT,		"DROP", fscache_drop_object);
+static WORK_STATE(OBJECT_DEAD,		"DEAD", (void*)2UL);
+
+static WAIT_STATE(WAIT_FOR_INIT,	"?INI",
+		  TRANSIT_TO(INIT_OBJECT,	1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_PARENT,	"?PRN",
+		  TRANSIT_TO(PARENT_READY,	1 << FSCACHE_OBJECT_EV_PARENT_READY));
+
+static WAIT_STATE(WAIT_FOR_CMD,		"?CMD",
+		  TRANSIT_TO(INVALIDATE_OBJECT,	1 << FSCACHE_OBJECT_EV_INVALIDATE),
+		  TRANSIT_TO(UPDATE_OBJECT,	1 << FSCACHE_OBJECT_EV_UPDATE),
+		  TRANSIT_TO(JUMPSTART_DEPS,	1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_CLEARANCE,	"?CLR",
+		  TRANSIT_TO(KILL_OBJECT,	1 << FSCACHE_OBJECT_EV_CLEARED));
+
+/*
+ * Out-of-band event transition tables.  These are for handling unexpected
+ * events, such as an I/O error.  If an OOB event occurs, the state machine
+ * clears and disables the event and forces a transition to the nominated work
+ * state (acurrently executing work states will complete first).
+ *
+ * In such a situation, object->state remembers the state the machine should
+ * have been in/gone to and returning NO_TRANSIT returns to that.
+ */
+static const struct fscache_transition fscache_osm_init_oob[] = {
+	   TRANSIT_TO(ABORT_INIT,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
+};
+
+static const struct fscache_transition fscache_osm_lookup_oob[] = {
+	   TRANSIT_TO(LOOKUP_FAILURE,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
 };
-EXPORT_SYMBOL(fscache_object_states);
-
-const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
-	[FSCACHE_OBJECT_INIT]		= "INIT",
-	[FSCACHE_OBJECT_LOOKING_UP]	= "LOOK",
-	[FSCACHE_OBJECT_CREATING]	= "CRTN",
-	[FSCACHE_OBJECT_AVAILABLE]	= "AVBL",
-	[FSCACHE_OBJECT_ACTIVE]		= "ACTV",
-	[FSCACHE_OBJECT_INVALIDATING]	= "INVL",
-	[FSCACHE_OBJECT_UPDATING]	= "UPDT",
-	[FSCACHE_OBJECT_DYING]		= "DYNG",
-	[FSCACHE_OBJECT_LC_DYING]	= "LCDY",
-	[FSCACHE_OBJECT_ABORT_INIT]	= "ABTI",
-	[FSCACHE_OBJECT_RELEASING]	= "RELS",
-	[FSCACHE_OBJECT_RECYCLING]	= "RCYC",
-	[FSCACHE_OBJECT_WITHDRAWING]	= "WTHD",
-	[FSCACHE_OBJECT_DEAD]		= "DEAD",
+
+static const struct fscache_transition fscache_osm_run_oob[] = {
+	   TRANSIT_TO(KILL_OBJECT,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
 };
 
 static int  fscache_get_object(struct fscache_object *);
 static void fscache_put_object(struct fscache_object *);
-static void fscache_initialise_object(struct fscache_object *);
-static void fscache_lookup_object(struct fscache_object *);
-static void fscache_object_available(struct fscache_object *);
-static void fscache_invalidate_object(struct fscache_object *);
-static void fscache_release_object(struct fscache_object *);
-static void fscache_withdraw_object(struct fscache_object *);
-static void fscache_enqueue_dependents(struct fscache_object *);
+static bool fscache_enqueue_dependents(struct fscache_object *, int);
 static void fscache_dequeue_object(struct fscache_object *);
 
 /*
@@ -75,295 +154,116 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
 	       object->debug_id, parent->debug_id, parent->n_ops);
 
 	spin_lock_nested(&parent->lock, 1);
-	parent->n_ops--;
 	parent->n_obj_ops--;
+	parent->n_ops--;
 	if (parent->n_ops == 0)
 		fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
 	spin_unlock(&parent->lock);
 }
 
 /*
- * Notify netfs of invalidation completion.
+ * Object state machine dispatcher.
  */
-static inline void fscache_invalidation_complete(struct fscache_cookie *cookie)
+static void fscache_object_sm_dispatcher(struct fscache_object *object)
 {
-	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
-		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
-}
-
-/*
- * process events that have been sent to an object's state machine
- * - initiates parent lookup
- * - does object lookup
- * - does object creation
- * - does object recycling and retirement
- * - does object withdrawal
- */
-static void fscache_object_state_machine(struct fscache_object *object)
-{
-	enum fscache_object_state new_state;
-	struct fscache_cookie *cookie;
-	int event;
+	const struct fscache_transition *t;
+	const struct fscache_state *state, *new_state;
+	unsigned long events, event_mask;
+	int event = -1;
 
 	ASSERT(object != NULL);
 
 	_enter("{OBJ%x,%s,%lx}",
-	       object->debug_id, fscache_object_states[object->state],
-	       object->events);
-
-	switch (object->state) {
-		/* wait for the parent object to become ready */
-	case FSCACHE_OBJECT_INIT:
-		object->event_mask =
-			FSCACHE_OBJECT_EVENTS_MASK &
-			~(1 << FSCACHE_OBJECT_EV_CLEARED);
-		fscache_initialise_object(object);
-		goto done;
-
-		/* look up the object metadata on disk */
-	case FSCACHE_OBJECT_LOOKING_UP:
-		fscache_lookup_object(object);
-		goto lookup_transit;
-
-		/* create the object metadata on disk */
-	case FSCACHE_OBJECT_CREATING:
-		fscache_lookup_object(object);
-		goto lookup_transit;
-
-		/* handle an object becoming available; start pending
-		 * operations and queue dependent operations for processing */
-	case FSCACHE_OBJECT_AVAILABLE:
-		fscache_object_available(object);
-		goto active_transit;
-
-		/* normal running state */
-	case FSCACHE_OBJECT_ACTIVE:
-		goto active_transit;
-
-		/* Invalidate an object on disk */
-	case FSCACHE_OBJECT_INVALIDATING:
-		clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events);
-		fscache_stat(&fscache_n_invalidates_run);
-		fscache_stat(&fscache_n_cop_invalidate_object);
-		fscache_invalidate_object(object);
-		fscache_stat_d(&fscache_n_cop_invalidate_object);
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
-		goto active_transit;
-
-		/* update the object metadata on disk */
-	case FSCACHE_OBJECT_UPDATING:
-		clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
-		fscache_stat(&fscache_n_updates_run);
-		fscache_stat(&fscache_n_cop_update_object);
-		object->cache->ops->update_object(object);
-		fscache_stat_d(&fscache_n_cop_update_object);
-		goto active_transit;
-
-		/* handle an object dying during lookup or creation */
-	case FSCACHE_OBJECT_LC_DYING:
-		object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
-		fscache_stat(&fscache_n_cop_lookup_complete);
-		object->cache->ops->lookup_complete(object);
-		fscache_stat_d(&fscache_n_cop_lookup_complete);
-
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_DYING;
-		cookie = object->cookie;
-		if (cookie) {
-			if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP,
-					       &cookie->flags))
-				wake_up_bit(&cookie->flags,
-					    FSCACHE_COOKIE_LOOKING_UP);
-			if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
-					       &cookie->flags))
-				wake_up_bit(&cookie->flags,
-					    FSCACHE_COOKIE_CREATING);
+	       object->debug_id, object->state->name, object->events);
+
+	event_mask = object->event_mask;
+restart:
+	object->event_mask = 0; /* Mask normal event handling */
+	state = object->state;
+restart_masked:
+	events = object->events;
+
+	/* Handle any out-of-band events (typically an error) */
+	if (events & object->oob_event_mask) {
+		_debug("{OBJ%x} oob %lx",
+		       object->debug_id, events & object->oob_event_mask);
+		for (t = object->oob_table; t->events; t++) {
+			if (events & t->events) {
+				state = t->transit_to;
+				ASSERT(state->work != NULL);
+				event = fls(events & t->events) - 1;
+				__clear_bit(event, &object->oob_event_mask);
+				clear_bit(event, &object->events);
+				goto execute_work_state;
+			}
 		}
-		spin_unlock(&object->lock);
+	}
 
-		fscache_done_parent_op(object);
+	/* Wait states are just transition tables */
+	if (!state->work) {
+		if (events & event_mask) {
+			for (t = state->transitions; t->events; t++) {
+				if (events & t->events) {
+					new_state = t->transit_to;
+					event = fls(events & t->events) - 1;
+					clear_bit(event, &object->events);
+					_debug("{OBJ%x} ev %d: %s -> %s",
+					       object->debug_id, event,
+					       state->name, new_state->name);
+					object->state = state = new_state;
+					goto execute_work_state;
+				}
+			}
 
-		/* wait for completion of all active operations on this object
-		 * and the death of all child objects of this object */
-	case FSCACHE_OBJECT_DYING:
-	dying:
-		clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
-		spin_lock(&object->lock);
-		_debug("dying OBJ%x {%d,%d}",
-		       object->debug_id, object->n_ops, object->n_children);
-		if (object->n_ops == 0 && object->n_children == 0) {
-			object->event_mask &=
-				~(1 << FSCACHE_OBJECT_EV_CLEARED);
-			object->event_mask |=
-				(1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-				(1 << FSCACHE_OBJECT_EV_RETIRE) |
-				(1 << FSCACHE_OBJECT_EV_RELEASE) |
-				(1 << FSCACHE_OBJECT_EV_ERROR);
-		} else {
-			object->event_mask &=
-				~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-				  (1 << FSCACHE_OBJECT_EV_RETIRE) |
-				  (1 << FSCACHE_OBJECT_EV_RELEASE) |
-				  (1 << FSCACHE_OBJECT_EV_ERROR));
-			object->event_mask |=
-				1 << FSCACHE_OBJECT_EV_CLEARED;
+			/* The event mask didn't include all the tabled bits */
+			BUG();
 		}
-		spin_unlock(&object->lock);
-		fscache_enqueue_dependents(object);
-		fscache_start_operations(object);
-		goto terminal_transit;
-
-		/* handle an abort during initialisation */
-	case FSCACHE_OBJECT_ABORT_INIT:
-		_debug("handle abort init %lx", object->events);
-		object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
-
-		spin_lock(&object->lock);
-		fscache_dequeue_object(object);
-
-		object->state = FSCACHE_OBJECT_DYING;
-		if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
-				       &object->cookie->flags))
-			wake_up_bit(&object->cookie->flags,
-				    FSCACHE_COOKIE_CREATING);
-		spin_unlock(&object->lock);
-		goto dying;
-
-		/* handle the netfs releasing an object and possibly marking it
-		 * obsolete too */
-	case FSCACHE_OBJECT_RELEASING:
-	case FSCACHE_OBJECT_RECYCLING:
-		object->event_mask &=
-			~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-			  (1 << FSCACHE_OBJECT_EV_RETIRE) |
-			  (1 << FSCACHE_OBJECT_EV_RELEASE) |
-			  (1 << FSCACHE_OBJECT_EV_ERROR));
-		fscache_release_object(object);
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_DEAD;
-		spin_unlock(&object->lock);
-		fscache_stat(&fscache_n_object_dead);
-		goto terminal_transit;
-
-		/* handle the parent cache of this object being withdrawn from
-		 * active service */
-	case FSCACHE_OBJECT_WITHDRAWING:
-		object->event_mask &=
-			~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-			  (1 << FSCACHE_OBJECT_EV_RETIRE) |
-			  (1 << FSCACHE_OBJECT_EV_RELEASE) |
-			  (1 << FSCACHE_OBJECT_EV_ERROR));
-		fscache_withdraw_object(object);
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_DEAD;
-		spin_unlock(&object->lock);
-		fscache_stat(&fscache_n_object_dead);
-		goto terminal_transit;
-
-		/* complain about the object being woken up once it is
-		 * deceased */
-	case FSCACHE_OBJECT_DEAD:
-		printk(KERN_ERR "FS-Cache:"
-		       " Unexpected event in dead state %lx\n",
-		       object->events & object->event_mask);
-		BUG();
-
-	default:
-		printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
-		       object->state);
-		BUG();
-	}
-
-	/* determine the transition from a lookup state */
-lookup_transit:
-	event = fls(object->events & object->event_mask) - 1;
-	switch (event) {
-	case FSCACHE_OBJECT_EV_WITHDRAW:
-	case FSCACHE_OBJECT_EV_RETIRE:
-	case FSCACHE_OBJECT_EV_RELEASE:
-	case FSCACHE_OBJECT_EV_ERROR:
-		new_state = FSCACHE_OBJECT_LC_DYING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_INVALIDATE:
-		new_state = FSCACHE_OBJECT_INVALIDATING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_REQUEUE:
-		goto done;
-	case -1:
-		goto done; /* sleep until event */
-	default:
-		goto unsupported_event;
+		/* Randomly woke up */
+		goto unmask_events;
 	}
 
-	/* determine the transition from an active state */
-active_transit:
-	event = fls(object->events & object->event_mask) - 1;
-	switch (event) {
-	case FSCACHE_OBJECT_EV_WITHDRAW:
-	case FSCACHE_OBJECT_EV_RETIRE:
-	case FSCACHE_OBJECT_EV_RELEASE:
-	case FSCACHE_OBJECT_EV_ERROR:
-		new_state = FSCACHE_OBJECT_DYING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_INVALIDATE:
-		new_state = FSCACHE_OBJECT_INVALIDATING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_UPDATE:
-		new_state = FSCACHE_OBJECT_UPDATING;
-		goto change_state;
-	case -1:
-		new_state = FSCACHE_OBJECT_ACTIVE;
-		goto change_state; /* sleep until event */
-	default:
-		goto unsupported_event;
-	}
+execute_work_state:
+	_debug("{OBJ%x} exec %s", object->debug_id, state->name);
 
-	/* determine the transition from a terminal state */
-terminal_transit:
-	event = fls(object->events & object->event_mask) - 1;
-	switch (event) {
-	case FSCACHE_OBJECT_EV_WITHDRAW:
-		new_state = FSCACHE_OBJECT_WITHDRAWING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_RETIRE:
-		new_state = FSCACHE_OBJECT_RECYCLING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_RELEASE:
-		new_state = FSCACHE_OBJECT_RELEASING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_ERROR:
-		new_state = FSCACHE_OBJECT_WITHDRAWING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_CLEARED:
-		new_state = FSCACHE_OBJECT_DYING;
-		goto change_state;
-	case -1:
-		goto done; /* sleep until event */
-	default:
-		goto unsupported_event;
+	new_state = state->work(object, event);
+	event = -1;
+	if (new_state == NO_TRANSIT) {
+		_debug("{OBJ%x} %s notrans", object->debug_id, state->name);
+		fscache_enqueue_object(object);
+		event_mask = object->oob_event_mask;
+		goto unmask_events;
 	}
 
-change_state:
-	spin_lock(&object->lock);
-	object->state = new_state;
-	spin_unlock(&object->lock);
+	_debug("{OBJ%x} %s -> %s",
+	       object->debug_id, state->name, new_state->name);
+	object->state = state = new_state;
 
-done:
-	_leave(" [->%s]", fscache_object_states[object->state]);
-	return;
+	if (state->work) {
+		if (unlikely(state->work == ((void *)2UL))) {
+			_leave(" [dead]");
+			return;
+		}
+		goto restart_masked;
+	}
 
-unsupported_event:
-	printk(KERN_ERR "FS-Cache:"
-	       " Unsupported event %d [%lx/%lx] in state %s\n",
-	       event, object->events, object->event_mask,
-	       fscache_object_states[object->state]);
-	BUG();
+	/* Transited to wait state */
+	event_mask = object->oob_event_mask;
+	for (t = state->transitions; t->events; t++)
+		event_mask |= t->events;
+
+unmask_events:
+	object->event_mask = event_mask;
+	smp_mb();
+	events = object->events;
+	if (events & event_mask)
+		goto restart;
+	_leave(" [msk %lx]", event_mask);
 }
 
 /*
  * execute an object
  */
-void fscache_object_work_func(struct work_struct *work)
+static void fscache_object_work_func(struct work_struct *work)
 {
 	struct fscache_object *object =
 		container_of(work, struct fscache_object, work);
@@ -372,14 +272,70 @@ void fscache_object_work_func(struct work_struct *work)
 	_enter("{OBJ%x}", object->debug_id);
 
 	start = jiffies;
-	fscache_object_state_machine(object);
+	fscache_object_sm_dispatcher(object);
 	fscache_hist(fscache_objs_histogram, start);
-	if (object->events & object->event_mask)
-		fscache_enqueue_object(object);
-	clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
 	fscache_put_object(object);
 }
-EXPORT_SYMBOL(fscache_object_work_func);
+
+/**
+ * fscache_object_init - Initialise a cache object description
+ * @object: Object description
+ * @cookie: Cookie object will be attached to
+ * @cache: Cache in which backing object will be found
+ *
+ * Initialise a cache object description to its basic values.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_object_init(struct fscache_object *object,
+			 struct fscache_cookie *cookie,
+			 struct fscache_cache *cache)
+{
+	const struct fscache_transition *t;
+
+	atomic_inc(&cache->object_count);
+
+	object->state = STATE(WAIT_FOR_INIT);
+	object->oob_table = fscache_osm_init_oob;
+	object->flags = 1 << FSCACHE_OBJECT_IS_LIVE;
+	spin_lock_init(&object->lock);
+	INIT_LIST_HEAD(&object->cache_link);
+	INIT_HLIST_NODE(&object->cookie_link);
+	INIT_WORK(&object->work, fscache_object_work_func);
+	INIT_LIST_HEAD(&object->dependents);
+	INIT_LIST_HEAD(&object->dep_link);
+	INIT_LIST_HEAD(&object->pending_ops);
+	object->n_children = 0;
+	object->n_ops = object->n_in_progress = object->n_exclusive = 0;
+	object->events = 0;
+	object->store_limit = 0;
+	object->store_limit_l = 0;
+	object->cache = cache;
+	object->cookie = cookie;
+	object->parent = NULL;
+
+	object->oob_event_mask = 0;
+	for (t = object->oob_table; t->events; t++)
+		object->oob_event_mask |= t->events;
+	object->event_mask = object->oob_event_mask;
+	for (t = object->state->transitions; t->events; t++)
+		object->event_mask |= t->events;
+}
+EXPORT_SYMBOL(fscache_object_init);
+
+/*
+ * Abort object initialisation before we start it.
+ */
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object,
+								int event)
+{
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	object->oob_event_mask = 0;
+	fscache_dequeue_object(object);
+	return transit_to(KILL_OBJECT);
+}
 
 /*
  * initialise an object
@@ -387,130 +343,136 @@ EXPORT_SYMBOL(fscache_object_work_func);
  *   immediately to do a creation
  * - we may need to start the process of creating a parent and we need to wait
  *   for the parent's lookup and creation to complete if it's not there yet
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- *   leaf-most cookies of the object and all its children
  */
-static void fscache_initialise_object(struct fscache_object *object)
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *object,
+							     int event)
 {
 	struct fscache_object *parent;
+	bool success;
 
-	_enter("");
-	ASSERT(object->cookie != NULL);
-	ASSERT(object->cookie->parent != NULL);
-
-	if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
-			      (1 << FSCACHE_OBJECT_EV_RELEASE) |
-			      (1 << FSCACHE_OBJECT_EV_RETIRE) |
-			      (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
-		_debug("abort init %lx", object->events);
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_ABORT_INIT;
-		spin_unlock(&object->lock);
-		return;
-	}
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	spin_lock(&object->cookie->lock);
-	spin_lock_nested(&object->cookie->parent->lock, 1);
+	ASSERT(list_empty(&object->dep_link));
 
 	parent = object->parent;
 	if (!parent) {
-		_debug("no parent");
-		set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
-	} else {
-		spin_lock(&object->lock);
-		spin_lock_nested(&parent->lock, 1);
-		_debug("parent %s", fscache_object_states[parent->state]);
-
-		if (parent->state >= FSCACHE_OBJECT_DYING) {
-			_debug("bad parent");
-			set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
-		} else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
-			_debug("wait");
-
-			/* we may get woken up in this state by child objects
-			 * binding on to us, so we need to make sure we don't
-			 * add ourself to the list multiple times */
-			if (list_empty(&object->dep_link)) {
-				fscache_stat(&fscache_n_cop_grab_object);
-				object->cache->ops->grab_object(object);
-				fscache_stat_d(&fscache_n_cop_grab_object);
-				list_add(&object->dep_link,
-					 &parent->dependents);
-
-				/* fscache_acquire_non_index_cookie() uses this
-				 * to wake the chain up */
-				if (parent->state == FSCACHE_OBJECT_INIT)
-					fscache_enqueue_object(parent);
-			}
-		} else {
-			_debug("go");
-			parent->n_ops++;
-			parent->n_obj_ops++;
-			object->lookup_jif = jiffies;
-			object->state = FSCACHE_OBJECT_LOOKING_UP;
-			set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
-		}
+		_leave(" [no parent]");
+		return transit_to(DROP_OBJECT);
+	}
 
-		spin_unlock(&parent->lock);
-		spin_unlock(&object->lock);
+	_debug("parent: %s of:%lx", parent->state->name, parent->flags);
+
+	if (fscache_object_is_dying(parent)) {
+		_leave(" [bad parent]");
+		return transit_to(DROP_OBJECT);
 	}
 
-	spin_unlock(&object->cookie->parent->lock);
-	spin_unlock(&object->cookie->lock);
+	if (fscache_object_is_available(parent)) {
+		_leave(" [ready]");
+		return transit_to(PARENT_READY);
+	}
+
+	_debug("wait");
+
+	spin_lock(&parent->lock);
+	fscache_stat(&fscache_n_cop_grab_object);
+	success = false;
+	if (fscache_object_is_live(parent) &&
+	    object->cache->ops->grab_object(object)) {
+		list_add(&object->dep_link, &parent->dependents);
+		success = true;
+	}
+	fscache_stat_d(&fscache_n_cop_grab_object);
+	spin_unlock(&parent->lock);
+	if (!success) {
+		_leave(" [grab failed]");
+		return transit_to(DROP_OBJECT);
+	}
+
+	/* fscache_acquire_non_index_cookie() uses this
+	 * to wake the chain up */
+	fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD);
+	_leave(" [wait]");
+	return transit_to(WAIT_FOR_PARENT);
+}
+
+/*
+ * Once the parent object is ready, we should kick off our lookup op.
+ */
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *object,
+							int event)
+{
+	struct fscache_object *parent = object->parent;
+
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	ASSERT(parent != NULL);
+
+	spin_lock(&parent->lock);
+	parent->n_ops++;
+	parent->n_obj_ops++;
+	object->lookup_jif = jiffies;
+	spin_unlock(&parent->lock);
+
 	_leave("");
+	return transit_to(LOOK_UP_OBJECT);
 }
 
 /*
  * look an object up in the cache from which it was allocated
  * - we hold an "access lock" on the parent object, so the parent object cannot
  *   be withdrawn by either party till we've finished
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- *   leaf-most cookies of the object and all its children
  */
-static void fscache_lookup_object(struct fscache_object *object)
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *object,
+							  int event)
 {
 	struct fscache_cookie *cookie = object->cookie;
-	struct fscache_object *parent;
+	struct fscache_object *parent = object->parent;
 	int ret;
 
-	_enter("");
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	object->oob_table = fscache_osm_lookup_oob;
 
-	parent = object->parent;
 	ASSERT(parent != NULL);
 	ASSERTCMP(parent->n_ops, >, 0);
 	ASSERTCMP(parent->n_obj_ops, >, 0);
 
 	/* make sure the parent is still available */
-	ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
-
-	if (parent->state >= FSCACHE_OBJECT_DYING ||
-	    test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
-		_debug("unavailable");
-		set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
-		_leave("");
-		return;
+	ASSERT(fscache_object_is_available(parent));
+
+	if (fscache_object_is_dying(parent) ||
+	    test_bit(FSCACHE_IOERROR, &object->cache->flags) ||
+	    !fscache_use_cookie(object)) {
+		_leave(" [unavailable]");
+		return transit_to(LOOKUP_FAILURE);
 	}
 
-	_debug("LOOKUP \"%s/%s\" in \"%s\"",
-	       parent->cookie->def->name, cookie->def->name,
-	       object->cache->tag->name);
+	_debug("LOOKUP \"%s\" in \"%s\"",
+	       cookie->def->name, object->cache->tag->name);
 
 	fscache_stat(&fscache_n_object_lookups);
 	fscache_stat(&fscache_n_cop_lookup_object);
 	ret = object->cache->ops->lookup_object(object);
 	fscache_stat_d(&fscache_n_cop_lookup_object);
 
-	if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
-		set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+	fscache_unuse_cookie(object);
 
 	if (ret == -ETIMEDOUT) {
 		/* probably stuck behind another object, so move this one to
 		 * the back of the queue */
 		fscache_stat(&fscache_n_object_lookups_timed_out);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+		_leave(" [timeout]");
+		return NO_TRANSIT;
 	}
 
-	_leave("");
+	if (ret < 0) {
+		_leave(" [error]");
+		return transit_to(LOOKUP_FAILURE);
+	}
+
+	_leave(" [ok]");
+	return transit_to(OBJECT_AVAILABLE);
 }
 
 /**
@@ -524,32 +486,20 @@ void fscache_object_lookup_negative(struct fscache_object *object)
 {
 	struct fscache_cookie *cookie = object->cookie;
 
-	_enter("{OBJ%x,%s}",
-	       object->debug_id, fscache_object_states[object->state]);
+	_enter("{OBJ%x,%s}", object->debug_id, object->state->name);
 
-	spin_lock(&object->lock);
-	if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+	if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
 		fscache_stat(&fscache_n_object_lookups_negative);
 
-		/* transit here to allow write requests to begin stacking up
-		 * and read requests to begin returning ENODATA */
-		object->state = FSCACHE_OBJECT_CREATING;
-		spin_unlock(&object->lock);
-
-		set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
+		/* Allow write requests to begin stacking up and read requests to begin
+		 * returning ENODATA.
+		 */
 		set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
 
 		_debug("wake up lookup %p", &cookie->flags);
-		smp_mb__before_clear_bit();
-		clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
-		smp_mb__after_clear_bit();
+		clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
 		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
-	} else {
-		ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
-		spin_unlock(&object->lock);
 	}
-
 	_leave("");
 }
 EXPORT_SYMBOL(fscache_object_lookup_negative);
@@ -568,38 +518,26 @@ void fscache_obtained_object(struct fscache_object *object)
 {
 	struct fscache_cookie *cookie = object->cookie;
 
-	_enter("{OBJ%x,%s}",
-	       object->debug_id, fscache_object_states[object->state]);
+	_enter("{OBJ%x,%s}", object->debug_id, object->state->name);
 
 	/* if we were still looking up, then we must have a positive lookup
 	 * result, in which case there may be data available */
-	spin_lock(&object->lock);
-	if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+	if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
 		fscache_stat(&fscache_n_object_lookups_positive);
 
-		clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+		/* We do (presumably) have data */
+		clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
 
-		object->state = FSCACHE_OBJECT_AVAILABLE;
-		spin_unlock(&object->lock);
-
-		smp_mb__before_clear_bit();
-		clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
-		smp_mb__after_clear_bit();
+		/* Allow write requests to begin stacking up and read requests
+		 * to begin shovelling data.
+		 */
+		clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
 		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
 	} else {
-		ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
 		fscache_stat(&fscache_n_object_created);
-
-		object->state = FSCACHE_OBJECT_AVAILABLE;
-		spin_unlock(&object->lock);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
-		smp_wmb();
 	}
 
-	if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
-		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
-
+	set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
 	_leave("");
 }
 EXPORT_SYMBOL(fscache_obtained_object);
@@ -607,15 +545,14 @@ EXPORT_SYMBOL(fscache_obtained_object);
 /*
  * handle an object that has just become available
  */
-static void fscache_object_available(struct fscache_object *object)
+static const struct fscache_state *fscache_object_available(struct fscache_object *object,
+							    int event)
 {
-	_enter("{OBJ%x}", object->debug_id);
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	spin_lock(&object->lock);
+	object->oob_table = fscache_osm_run_oob;
 
-	if (object->cookie &&
-	    test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
-		wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
+	spin_lock(&object->lock);
 
 	fscache_done_parent_op(object);
 	if (object->n_in_progress == 0) {
@@ -631,130 +568,158 @@ static void fscache_object_available(struct fscache_object *object)
 	fscache_stat(&fscache_n_cop_lookup_complete);
 	object->cache->ops->lookup_complete(object);
 	fscache_stat_d(&fscache_n_cop_lookup_complete);
-	fscache_enqueue_dependents(object);
 
 	fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
 	fscache_stat(&fscache_n_object_avail);
 
 	_leave("");
+	return transit_to(JUMPSTART_DEPS);
 }
 
 /*
- * drop an object's attachments
+ * Wake up this object's dependent objects now that we've become available.
  */
-static void fscache_drop_object(struct fscache_object *object)
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object,
+								int event)
 {
-	struct fscache_object *parent = object->parent;
-	struct fscache_cache *cache = object->cache;
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	_enter("{OBJ%x,%d}", object->debug_id, object->n_children);
+	if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY))
+		return NO_TRANSIT; /* Not finished; requeue */
+	return transit_to(WAIT_FOR_CMD);
+}
 
-	ASSERTCMP(object->cookie, ==, NULL);
-	ASSERT(hlist_unhashed(&object->cookie_link));
+/*
+ * Handle lookup or creation failute.
+ */
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object,
+							  int event)
+{
+	struct fscache_cookie *cookie;
 
-	spin_lock(&cache->object_list_lock);
-	list_del_init(&object->cache_link);
-	spin_unlock(&cache->object_list_lock);
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	fscache_stat(&fscache_n_cop_drop_object);
-	cache->ops->drop_object(object);
-	fscache_stat_d(&fscache_n_cop_drop_object);
+	object->oob_event_mask = 0;
 
-	if (parent) {
-		_debug("release parent OBJ%x {%d}",
-		       parent->debug_id, parent->n_children);
+	fscache_stat(&fscache_n_cop_lookup_complete);
+	object->cache->ops->lookup_complete(object);
+	fscache_stat_d(&fscache_n_cop_lookup_complete);
 
-		spin_lock(&parent->lock);
-		parent->n_children--;
-		if (parent->n_children == 0)
-			fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
-		spin_unlock(&parent->lock);
-		object->parent = NULL;
+	cookie = object->cookie;
+	set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+	if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+
+	fscache_done_parent_op(object);
+	return transit_to(KILL_OBJECT);
+}
+
+/*
+ * Wait for completion of all active operations on this object and the death of
+ * all child objects of this object.
+ */
+static const struct fscache_state *fscache_kill_object(struct fscache_object *object,
+						       int event)
+{
+	_enter("{OBJ%x,%d,%d},%d",
+	       object->debug_id, object->n_ops, object->n_children, event);
+
+	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+	object->oob_event_mask = 0;
+
+	if (list_empty(&object->dependents) &&
+	    object->n_ops == 0 &&
+	    object->n_children == 0)
+		return transit_to(DROP_OBJECT);
+
+	if (object->n_in_progress == 0) {
+		spin_lock(&object->lock);
+		if (object->n_ops > 0 && object->n_in_progress == 0)
+			fscache_start_operations(object);
+		spin_unlock(&object->lock);
 	}
 
-	/* this just shifts the object release to the work processor */
-	fscache_put_object(object);
+	if (!list_empty(&object->dependents))
+		return transit_to(KILL_DEPENDENTS);
 
-	_leave("");
+	return transit_to(WAIT_FOR_CLEARANCE);
 }
 
 /*
- * release or recycle an object that the netfs has discarded
+ * Kill dependent objects.
  */
-static void fscache_release_object(struct fscache_object *object)
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object,
+							   int event)
 {
-	_enter("");
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	fscache_drop_object(object);
+	if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL))
+		return NO_TRANSIT; /* Not finished */
+	return transit_to(WAIT_FOR_CLEARANCE);
 }
 
 /*
- * withdraw an object from active service
+ * Drop an object's attachments
  */
-static void fscache_withdraw_object(struct fscache_object *object)
+static const struct fscache_state *fscache_drop_object(struct fscache_object *object,
+						       int event)
 {
-	struct fscache_cookie *cookie;
-	bool detached;
+	struct fscache_object *parent = object->parent;
+	struct fscache_cookie *cookie = object->cookie;
+	struct fscache_cache *cache = object->cache;
+	bool awaken = false;
 
-	_enter("");
+	_enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event);
 
-	spin_lock(&object->lock);
-	cookie = object->cookie;
-	if (cookie) {
-		/* need to get the cookie lock before the object lock, starting
-		 * from the object pointer */
-		atomic_inc(&cookie->usage);
-		spin_unlock(&object->lock);
+	ASSERT(cookie != NULL);
+	ASSERT(!hlist_unhashed(&object->cookie_link));
 
-		detached = false;
-		spin_lock(&cookie->lock);
-		spin_lock(&object->lock);
+	/* Make sure the cookie no longer points here and that the netfs isn't
+	 * waiting for us.
+	 */
+	spin_lock(&cookie->lock);
+	hlist_del_init(&object->cookie_link);
+	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+		awaken = true;
+	spin_unlock(&cookie->lock);
 
-		if (object->cookie == cookie) {
-			hlist_del_init(&object->cookie_link);
-			object->cookie = NULL;
-			fscache_invalidation_complete(cookie);
-			detached = true;
-		}
-		spin_unlock(&cookie->lock);
-		fscache_cookie_put(cookie);
-		if (detached)
-			fscache_cookie_put(cookie);
-	}
+	if (awaken)
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
 
+	/* Prevent a race with our last child, which has to signal EV_CLEARED
+	 * before dropping our spinlock.
+	 */
+	spin_lock(&object->lock);
 	spin_unlock(&object->lock);
 
-	fscache_drop_object(object);
-}
+	/* Discard from the cache's collection of objects */
+	spin_lock(&cache->object_list_lock);
+	list_del_init(&object->cache_link);
+	spin_unlock(&cache->object_list_lock);
 
-/*
- * withdraw an object from active service at the behest of the cache
- * - need break the links to a cached object cookie
- * - called under two situations:
- *   (1) recycler decides to reclaim an in-use object
- *   (2) a cache is unmounted
- * - have to take care as the cookie can be being relinquished by the netfs
- *   simultaneously
- * - the object is pinned by the caller holding a refcount on it
- */
-void fscache_withdrawing_object(struct fscache_cache *cache,
-				struct fscache_object *object)
-{
-	bool enqueue = false;
+	fscache_stat(&fscache_n_cop_drop_object);
+	cache->ops->drop_object(object);
+	fscache_stat_d(&fscache_n_cop_drop_object);
 
-	_enter(",OBJ%x", object->debug_id);
+	/* The parent object wants to know when all it dependents have gone */
+	if (parent) {
+		_debug("release parent OBJ%x {%d}",
+		       parent->debug_id, parent->n_children);
 
-	spin_lock(&object->lock);
-	if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
-		object->state = FSCACHE_OBJECT_WITHDRAWING;
-		enqueue = true;
+		spin_lock(&parent->lock);
+		parent->n_children--;
+		if (parent->n_children == 0)
+			fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+		spin_unlock(&parent->lock);
+		object->parent = NULL;
 	}
-	spin_unlock(&object->lock);
 
-	if (enqueue)
-		fscache_enqueue_object(object);
+	/* this just shifts the object release to the work processor */
+	fscache_put_object(object);
+	fscache_stat(&fscache_n_object_dead);
 
 	_leave("");
+	return transit_to(OBJECT_DEAD);
 }
 
 /*
@@ -771,7 +736,7 @@ static int fscache_get_object(struct fscache_object *object)
 }
 
 /*
- * discard a ref on a work item
+ * Discard a ref on an object
  */
 static void fscache_put_object(struct fscache_object *object)
 {
@@ -780,6 +745,22 @@ static void fscache_put_object(struct fscache_object *object)
 	fscache_stat_d(&fscache_n_cop_put_object);
 }
 
+/**
+ * fscache_object_destroy - Note that a cache object is about to be destroyed
+ * @object: The object to be destroyed
+ *
+ * Note the imminent destruction and deallocation of a cache object record.
+ */
+void fscache_object_destroy(struct fscache_object *object)
+{
+	fscache_objlist_remove(object);
+
+	/* We can get rid of the cookie now */
+	fscache_cookie_put(object->cookie);
+	object->cookie = NULL;
+}
+EXPORT_SYMBOL(fscache_object_destroy);
+
 /*
  * enqueue an object for metadata-type processing
  */
@@ -803,7 +784,7 @@ void fscache_enqueue_object(struct fscache_object *object)
 
 /**
  * fscache_object_sleep_till_congested - Sleep until object wq is congested
- * @timoutp: Scheduler sleep timeout
+ * @timeoutp: Scheduler sleep timeout
  *
  * Allow an object handler to sleep until the object workqueue is congested.
  *
@@ -831,18 +812,21 @@ bool fscache_object_sleep_till_congested(signed long *timeoutp)
 EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
 
 /*
- * enqueue the dependents of an object for metadata-type processing
- * - the caller must hold the object's lock
- * - this may cause an already locked object to wind up being processed again
+ * Enqueue the dependents of an object for metadata-type processing.
+ *
+ * If we don't manage to finish the list before the scheduler wants to run
+ * again then return false immediately.  We return true if the list was
+ * cleared.
  */
-static void fscache_enqueue_dependents(struct fscache_object *object)
+static bool fscache_enqueue_dependents(struct fscache_object *object, int event)
 {
 	struct fscache_object *dep;
+	bool ret = true;
 
 	_enter("{OBJ%x}", object->debug_id);
 
 	if (list_empty(&object->dependents))
-		return;
+		return true;
 
 	spin_lock(&object->lock);
 
@@ -851,23 +835,23 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
 				 struct fscache_object, dep_link);
 		list_del_init(&dep->dep_link);
 
-
-		/* sort onto appropriate lists */
-		fscache_enqueue_object(dep);
+		fscache_raise_event(dep, event);
 		fscache_put_object(dep);
 
-		if (!list_empty(&object->dependents))
-			cond_resched_lock(&object->lock);
+		if (!list_empty(&object->dependents) && need_resched()) {
+			ret = false;
+			break;
+		}
 	}
 
 	spin_unlock(&object->lock);
+	return ret;
 }
 
 /*
  * remove an object from whatever queue it's waiting on
- * - the caller must hold object->lock
  */
-void fscache_dequeue_object(struct fscache_object *object)
+static void fscache_dequeue_object(struct fscache_object *object)
 {
 	_enter("{OBJ%x}", object->debug_id);
 
@@ -886,7 +870,10 @@ void fscache_dequeue_object(struct fscache_object *object)
  * @data: The auxiliary data for the object
  * @datalen: The size of the auxiliary data
  *
- * This function consults the netfs about the coherency state of an object
+ * This function consults the netfs about the coherency state of an object.
+ * The caller must be holding a ref on cookie->n_active (held by
+ * fscache_look_up_object() on behalf of the cache backend during object lookup
+ * and creation).
  */
 enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
 					const void *data, uint16_t datalen)
@@ -927,12 +914,23 @@ EXPORT_SYMBOL(fscache_check_aux);
 /*
  * Asynchronously invalidate an object.
  */
-static void fscache_invalidate_object(struct fscache_object *object)
+static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object,
+							      int event)
 {
 	struct fscache_operation *op;
 	struct fscache_cookie *cookie = object->cookie;
 
-	_enter("{OBJ%x}", object->debug_id);
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	/* We're going to need the cookie.  If the cookie is not available then
+	 * retire the object instead.
+	 */
+	if (!fscache_use_cookie(object)) {
+		ASSERT(object->cookie->stores.rnode == NULL);
+		set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
+		_leave(" [no cookie]");
+		return transit_to(KILL_OBJECT);
+	}
 
 	/* Reject any new read/write ops and abort any that are pending. */
 	fscache_invalidate_writes(cookie);
@@ -941,14 +939,13 @@ static void fscache_invalidate_object(struct fscache_object *object)
 
 	/* Now we have to wait for in-progress reads and writes */
 	op = kzalloc(sizeof(*op), GFP_KERNEL);
-	if (!op) {
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
-		_leave(" [ENOMEM]");
-		return;
-	}
+	if (!op)
+		goto nomem;
 
 	fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
-	op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
+	op->flags = FSCACHE_OP_ASYNC |
+		(1 << FSCACHE_OP_EXCLUSIVE) |
+		(1 << FSCACHE_OP_UNUSE_COOKIE);
 
 	spin_lock(&cookie->lock);
 	if (fscache_submit_exclusive_op(object, op) < 0)
@@ -965,13 +962,50 @@ static void fscache_invalidate_object(struct fscache_object *object)
 	/* We can allow read and write requests to come in once again.  They'll
 	 * queue up behind our exclusive invalidation operation.
 	 */
-	fscache_invalidation_complete(cookie);
-	_leave("");
-	return;
+	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+	_leave(" [ok]");
+	return transit_to(UPDATE_OBJECT);
+
+nomem:
+	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+	fscache_unuse_cookie(object);
+	_leave(" [ENOMEM]");
+	return transit_to(KILL_OBJECT);
 
 submit_op_failed:
+	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
 	spin_unlock(&cookie->lock);
 	kfree(op);
-	fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
 	_leave(" [EIO]");
+	return transit_to(KILL_OBJECT);
+}
+
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object,
+							     int event)
+{
+	const struct fscache_state *s;
+
+	fscache_stat(&fscache_n_invalidates_run);
+	fscache_stat(&fscache_n_cop_invalidate_object);
+	s = _fscache_invalidate_object(object, event);
+	fscache_stat_d(&fscache_n_cop_invalidate_object);
+	return s;
+}
+
+/*
+ * Asynchronously update an object.
+ */
+static const struct fscache_state *fscache_update_object(struct fscache_object *object,
+							 int event)
+{
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	fscache_stat(&fscache_n_updates_run);
+	fscache_stat(&fscache_n_cop_update_object);
+	object->cache->ops->update_object(object);
+	fscache_stat_d(&fscache_n_cop_update_object);
+
+	_leave("");
+	return transit_to(WAIT_FOR_CMD);
 }
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 762a9ec4ffa..318071aca21 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -35,7 +35,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
 
 	ASSERT(list_empty(&op->pend_link));
 	ASSERT(op->processor != NULL);
-	ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
+	ASSERT(fscache_object_is_available(op->object));
 	ASSERTCMP(atomic_read(&op->usage), >, 0);
 	ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
 
@@ -119,7 +119,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 		/* need to issue a new write op after this */
 		clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
 		ret = 0;
-	} else if (object->state == FSCACHE_OBJECT_CREATING) {
+	} else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
 		op->object = object;
 		object->n_ops++;
 		object->n_exclusive++;	/* reads and writes must wait */
@@ -144,7 +144,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
  */
 static void fscache_report_unexpected_submission(struct fscache_object *object,
 						 struct fscache_operation *op,
-						 unsigned long ostate)
+						 const struct fscache_state *ostate)
 {
 	static bool once_only;
 	struct fscache_operation *p;
@@ -155,11 +155,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
 	once_only = true;
 
 	kdebug("unexpected submission OP%x [OBJ%x %s]",
-	       op->debug_id, object->debug_id,
-	       fscache_object_states[object->state]);
-	kdebug("objstate=%s [%s]",
-	       fscache_object_states[object->state],
-	       fscache_object_states[ostate]);
+	       op->debug_id, object->debug_id, object->state->name);
+	kdebug("objstate=%s [%s]", object->state->name, ostate->name);
 	kdebug("objflags=%lx", object->flags);
 	kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
 	kdebug("ops=%u inp=%u exc=%u",
@@ -190,7 +187,7 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
 int fscache_submit_op(struct fscache_object *object,
 		      struct fscache_operation *op)
 {
-	unsigned long ostate;
+	const struct fscache_state *ostate;
 	int ret;
 
 	_enter("{OBJ%x OP%x},{%u}",
@@ -226,16 +223,14 @@ int fscache_submit_op(struct fscache_object *object,
 			fscache_run_op(object, op);
 		}
 		ret = 0;
-	} else if (object->state == FSCACHE_OBJECT_CREATING) {
+	} else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
 		op->object = object;
 		object->n_ops++;
 		atomic_inc(&op->usage);
 		list_add_tail(&op->pend_link, &object->pending_ops);
 		fscache_stat(&fscache_n_op_pend);
 		ret = 0;
-	} else if (object->state == FSCACHE_OBJECT_DYING ||
-		   object->state == FSCACHE_OBJECT_LC_DYING ||
-		   object->state == FSCACHE_OBJECT_WITHDRAWING) {
+	} else if (fscache_object_is_dying(object)) {
 		fscache_stat(&fscache_n_op_rejected);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
@@ -265,8 +260,8 @@ void fscache_abort_object(struct fscache_object *object)
 }
 
 /*
- * jump start the operation processing on an object
- * - caller must hold object->lock
+ * Jump start the operation processing on an object.  The caller must hold
+ * object->lock.
  */
 void fscache_start_operations(struct fscache_object *object)
 {
@@ -428,14 +423,10 @@ void fscache_put_operation(struct fscache_operation *op)
 
 	object = op->object;
 
-	if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) {
-		if (atomic_dec_and_test(&object->n_reads)) {
-			clear_bit(FSCACHE_COOKIE_WAITING_ON_READS,
-				  &object->cookie->flags);
-			wake_up_bit(&object->cookie->flags,
-				    FSCACHE_COOKIE_WAITING_ON_READS);
-		}
-	}
+	if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
+		atomic_dec(&object->n_reads);
+	if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
+		fscache_unuse_cookie(object);
 
 	/* now... we may get called with the object spinlock held, so we
 	 * complete the cleanup here only if we can immediately acquire the
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index ff000e52072..d479ab3c63e 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -109,7 +109,7 @@ page_busy:
 	 * allocator as the work threads writing to the cache may all end up
 	 * sleeping on memory allocation, so we may need to impose a timeout
 	 * too. */
-	if (!(gfp & __GFP_WAIT)) {
+	if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
 		fscache_stat(&fscache_n_store_vmscan_busy);
 		return false;
 	}
@@ -163,10 +163,12 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
 
 	fscache_stat(&fscache_n_attr_changed_calls);
 
-	if (fscache_object_is_active(object)) {
+	if (fscache_object_is_active(object) &&
+	    fscache_use_cookie(object)) {
 		fscache_stat(&fscache_n_cop_attr_changed);
 		ret = object->cache->ops->attr_changed(object);
 		fscache_stat_d(&fscache_n_cop_attr_changed);
+		fscache_unuse_cookie(object);
 		if (ret < 0)
 			fscache_abort_object(object);
 	}
@@ -233,7 +235,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
 
 	_enter("{OP%x}", op->op.debug_id);
 
-	ASSERTCMP(op->n_pages, ==, 0);
+	ASSERTCMP(atomic_read(&op->n_pages), ==, 0);
 
 	fscache_hist(fscache_retrieval_histogram, op->start_time);
 	if (op->context)
@@ -246,6 +248,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
  * allocate a retrieval op
  */
 static struct fscache_retrieval *fscache_alloc_retrieval(
+	struct fscache_cookie *cookie,
 	struct address_space *mapping,
 	fscache_rw_complete_t end_io_func,
 	void *context)
@@ -260,7 +263,10 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 	}
 
 	fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
-	op->op.flags	= FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
+	atomic_inc(&cookie->n_active);
+	op->op.flags	= FSCACHE_OP_MYTHREAD |
+		(1UL << FSCACHE_OP_WAITING) |
+		(1UL << FSCACHE_OP_UNUSE_COOKIE);
 	op->mapping	= mapping;
 	op->end_io_func	= end_io_func;
 	op->context	= context;
@@ -310,7 +316,7 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
 	struct fscache_retrieval *op =
 		container_of(_op, struct fscache_retrieval, op);
 
-	op->n_pages = 0;
+	atomic_set(&op->n_pages, 0);
 }
 
 /*
@@ -394,12 +400,13 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 	if (fscache_wait_for_deferred_lookup(cookie) < 0)
 		return -ERESTARTSYS;
 
-	op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
+	op = fscache_alloc_retrieval(cookie, page->mapping,
+				     end_io_func,context);
 	if (!op) {
 		_leave(" = -ENOMEM");
 		return -ENOMEM;
 	}
-	op->n_pages = 1;
+	atomic_set(&op->n_pages, 1);
 
 	spin_lock(&cookie->lock);
 
@@ -408,7 +415,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 	object = hlist_entry(cookie->backing_objects.first,
 			     struct fscache_object, cookie_link);
 
-	ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
+	ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
 
 	atomic_inc(&object->n_reads);
 	__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
@@ -465,6 +472,7 @@ nobufs_unlock_dec:
 	atomic_dec(&object->n_reads);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
+	atomic_dec(&cookie->n_active);
 	kfree(op);
 nobufs:
 	fscache_stat(&fscache_n_retrievals_nobufs);
@@ -522,10 +530,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 	if (fscache_wait_for_deferred_lookup(cookie) < 0)
 		return -ERESTARTSYS;
 
-	op = fscache_alloc_retrieval(mapping, end_io_func, context);
+	op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context);
 	if (!op)
 		return -ENOMEM;
-	op->n_pages = *nr_pages;
+	atomic_set(&op->n_pages, *nr_pages);
 
 	spin_lock(&cookie->lock);
 
@@ -589,6 +597,7 @@ nobufs_unlock_dec:
 	atomic_dec(&object->n_reads);
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
+	atomic_dec(&cookie->n_active);
 	kfree(op);
 nobufs:
 	fscache_stat(&fscache_n_retrievals_nobufs);
@@ -631,10 +640,10 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 	if (fscache_wait_for_deferred_lookup(cookie) < 0)
 		return -ERESTARTSYS;
 
-	op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
+	op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL);
 	if (!op)
 		return -ENOMEM;
-	op->n_pages = 1;
+	atomic_set(&op->n_pages, 1);
 
 	spin_lock(&cookie->lock);
 
@@ -675,6 +684,7 @@ error:
 
 nobufs_unlock:
 	spin_unlock(&cookie->lock);
+	atomic_dec(&cookie->n_active);
 	kfree(op);
 nobufs:
 	fscache_stat(&fscache_n_allocs_nobufs);
@@ -729,8 +739,9 @@ static void fscache_write_op(struct fscache_operation *_op)
 		 */
 		spin_unlock(&object->lock);
 		fscache_op_complete(&op->op, false);
-		_leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}",
-		       _op->flags, _op->state, object->state, object->flags);
+		_leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}",
+		       _op->flags, _op->state, object->state->short_name,
+		       object->flags);
 		return;
 	}
 
@@ -796,11 +807,16 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
 
 	_enter("");
 
-	while (spin_lock(&cookie->stores_lock),
-	       n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
-					      ARRAY_SIZE(results),
-					      FSCACHE_COOKIE_PENDING_TAG),
-	       n > 0) {
+	for (;;) {
+		spin_lock(&cookie->stores_lock);
+		n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
+					       ARRAY_SIZE(results),
+					       FSCACHE_COOKIE_PENDING_TAG);
+		if (n == 0) {
+			spin_unlock(&cookie->stores_lock);
+			break;
+		}
+
 		for (i = n - 1; i >= 0; i--) {
 			page = results[i];
 			radix_tree_delete(&cookie->stores, page->index);
@@ -812,7 +828,6 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
 			page_cache_release(results[i]);
 	}
 
-	spin_unlock(&cookie->stores_lock);
 	_leave("");
 }
 
@@ -829,14 +844,12 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
  *  (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
  *      set)
  *
- *	(a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
- *	    fill op)
+ *	(a) no writes yet
  *
  *	(b) writes deferred till post-creation (mark page for writing and
  *	    return immediately)
  *
  *  (2) negative lookup, object created, initial fill being made from netfs
- *      (FSCACHE_COOKIE_INITIAL_FILL is set)
  *
  *	(a) fill point not yet reached this page (mark page for writing and
  *          return)
@@ -873,7 +886,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 
 	fscache_operation_init(&op->op, fscache_write_op,
 			       fscache_release_write_op);
-	op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
+	op->op.flags = FSCACHE_OP_ASYNC |
+		(1 << FSCACHE_OP_WAITING) |
+		(1 << FSCACHE_OP_UNUSE_COOKIE);
 
 	ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
 	if (ret < 0)
@@ -919,6 +934,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	op->op.debug_id	= atomic_inc_return(&fscache_op_debug_id);
 	op->store_limit = object->store_limit;
 
+	atomic_inc(&cookie->n_active);
 	if (fscache_submit_op(object, &op->op) < 0)
 		goto submit_failed;
 
@@ -945,6 +961,7 @@ already_pending:
 	return 0;
 
 submit_failed:
+	atomic_dec(&cookie->n_active);
 	spin_lock(&cookie->stores_lock);
 	radix_tree_delete(&cookie->stores, page->index);
 	spin_unlock(&cookie->stores_lock);
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 8179e8bc4a3..40d13c70ef5 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -287,5 +287,5 @@ const struct file_operations fscache_stats_fops = {
 	.open		= fscache_stats_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release        = single_release,
 };
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 75a20c092dd..a0b0855d00a 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -23,7 +23,7 @@ static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
 {
 	struct fuse_conn *fc;
 	mutex_lock(&fuse_mutex);
-	fc = file->f_path.dentry->d_inode->i_private;
+	fc = file_inode(file)->i_private;
 	if (fc)
 		fc = fuse_conn_get(fc);
 	mutex_unlock(&fuse_mutex);
@@ -341,6 +341,7 @@ static struct file_system_type fuse_ctl_fs_type = {
 	.mount		= fuse_ctl_mount,
 	.kill_sb	= fuse_ctl_kill_sb,
 };
+MODULE_ALIAS_FS("fusectl");
 
 int __init fuse_ctl_init(void)
 {
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index e397b675b02..aef34b1e635 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -38,6 +38,7 @@
 #include <linux/device.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/aio.h>
 #include <linux/kdev_t.h>
 #include <linux/kthread.h>
 #include <linux/list.h>
@@ -91,19 +92,24 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
 			 loff_t *ppos)
 {
 	loff_t pos = 0;
+	struct iovec iov = { .iov_base = buf, .iov_len = count };
+	struct fuse_io_priv io = { .async = 0, .file = file };
 
-	return fuse_direct_io(file, buf, count, &pos, 0);
+	return fuse_direct_io(&io, &iov, 1, count, &pos, 0);
 }
 
 static ssize_t cuse_write(struct file *file, const char __user *buf,
 			  size_t count, loff_t *ppos)
 {
 	loff_t pos = 0;
+	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
+	struct fuse_io_priv io = { .async = 0, .file = file };
+
 	/*
 	 * No locking or generic_write_checks(), the server is
 	 * responsible for locking and sanity checks.
 	 */
-	return fuse_direct_io(file, buf, count, &pos, 1);
+	return fuse_direct_io(&io, &iov, 1, count, &pos, 1);
 }
 
 static int cuse_open(struct inode *inode, struct file *file)
@@ -419,7 +425,7 @@ static int cuse_send_init(struct cuse_conn *cc)
 
 	BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_for_background(fc, 1);
 	if (IS_ERR(req)) {
 		rc = PTR_ERR(req);
 		goto err;
@@ -449,6 +455,7 @@ static int cuse_send_init(struct cuse_conn *cc)
 	req->out.argvar = 1;
 	req->out.argpages = 1;
 	req->pages[0] = page;
+	req->page_descs[0].length = req->out.args[1].size;
 	req->num_pages = 1;
 	req->end = cuse_process_init_reply;
 	fuse_request_send_background(fc, req);
@@ -500,7 +507,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
 	cc->fc.release = cuse_fc_release;
 
 	cc->fc.connected = 1;
-	cc->fc.blocked = 0;
+	cc->fc.initialized = 1;
 	rc = cuse_send_init(cc);
 	if (rc) {
 		fuse_conn_put(&cc->fc);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e83351aa5ba..1d55f946540 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -19,6 +19,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/swap.h>
 #include <linux/splice.h>
+#include <linux/aio.h>
 
 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
 MODULE_ALIAS("devname:fuse");
@@ -34,34 +35,67 @@ static struct fuse_conn *fuse_get_conn(struct file *file)
 	return file->private_data;
 }
 
-static void fuse_request_init(struct fuse_req *req)
+static void fuse_request_init(struct fuse_req *req, struct page **pages,
+			      struct fuse_page_desc *page_descs,
+			      unsigned npages)
 {
 	memset(req, 0, sizeof(*req));
+	memset(pages, 0, sizeof(*pages) * npages);
+	memset(page_descs, 0, sizeof(*page_descs) * npages);
 	INIT_LIST_HEAD(&req->list);
 	INIT_LIST_HEAD(&req->intr_entry);
 	init_waitqueue_head(&req->waitq);
 	atomic_set(&req->count, 1);
+	req->pages = pages;
+	req->page_descs = page_descs;
+	req->max_pages = npages;
 }
 
-struct fuse_req *fuse_request_alloc(void)
+static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
 {
-	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL);
-	if (req)
-		fuse_request_init(req);
+	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags);
+	if (req) {
+		struct page **pages;
+		struct fuse_page_desc *page_descs;
+
+		if (npages <= FUSE_REQ_INLINE_PAGES) {
+			pages = req->inline_pages;
+			page_descs = req->inline_page_descs;
+		} else {
+			pages = kmalloc(sizeof(struct page *) * npages, flags);
+			page_descs = kmalloc(sizeof(struct fuse_page_desc) *
+					     npages, flags);
+		}
+
+		if (!pages || !page_descs) {
+			kfree(pages);
+			kfree(page_descs);
+			kmem_cache_free(fuse_req_cachep, req);
+			return NULL;
+		}
+
+		fuse_request_init(req, pages, page_descs, npages);
+	}
 	return req;
 }
+
+struct fuse_req *fuse_request_alloc(unsigned npages)
+{
+	return __fuse_request_alloc(npages, GFP_KERNEL);
+}
 EXPORT_SYMBOL_GPL(fuse_request_alloc);
 
-struct fuse_req *fuse_request_alloc_nofs(void)
+struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
 {
-	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS);
-	if (req)
-		fuse_request_init(req);
-	return req;
+	return __fuse_request_alloc(npages, GFP_NOFS);
 }
 
 void fuse_request_free(struct fuse_req *req)
 {
+	if (req->pages != req->inline_pages) {
+		kfree(req->pages);
+		kfree(req->page_descs);
+	}
 	kmem_cache_free(fuse_req_cachep, req);
 }
 
@@ -78,7 +112,7 @@ static void restore_sigs(sigset_t *oldset)
 	sigprocmask(SIG_SETMASK, oldset, NULL);
 }
 
-static void __fuse_get_request(struct fuse_req *req)
+void __fuse_get_request(struct fuse_req *req)
 {
 	atomic_inc(&req->count);
 }
@@ -97,40 +131,66 @@ static void fuse_req_init_context(struct fuse_req *req)
 	req->in.h.pid = current->pid;
 }
 
-struct fuse_req *fuse_get_req(struct fuse_conn *fc)
+static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
+{
+	return !fc->initialized || (for_background && fc->blocked);
+}
+
+static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
+				       bool for_background)
 {
 	struct fuse_req *req;
-	sigset_t oldset;
-	int intr;
 	int err;
-
 	atomic_inc(&fc->num_waiting);
-	block_sigs(&oldset);
-	intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked);
-	restore_sigs(&oldset);
-	err = -EINTR;
-	if (intr)
-		goto out;
+
+	if (fuse_block_alloc(fc, for_background)) {
+		sigset_t oldset;
+		int intr;
+
+		block_sigs(&oldset);
+		intr = wait_event_interruptible_exclusive(fc->blocked_waitq,
+				!fuse_block_alloc(fc, for_background));
+		restore_sigs(&oldset);
+		err = -EINTR;
+		if (intr)
+			goto out;
+	}
 
 	err = -ENOTCONN;
 	if (!fc->connected)
 		goto out;
 
-	req = fuse_request_alloc();
+	req = fuse_request_alloc(npages);
 	err = -ENOMEM;
-	if (!req)
+	if (!req) {
+		if (for_background)
+			wake_up(&fc->blocked_waitq);
 		goto out;
+	}
 
 	fuse_req_init_context(req);
 	req->waiting = 1;
+	req->background = for_background;
 	return req;
 
  out:
 	atomic_dec(&fc->num_waiting);
 	return ERR_PTR(err);
 }
+
+struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages)
+{
+	return __fuse_get_req(fc, npages, false);
+}
 EXPORT_SYMBOL_GPL(fuse_get_req);
 
+struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
+					     unsigned npages)
+{
+	return __fuse_get_req(fc, npages, true);
+}
+EXPORT_SYMBOL_GPL(fuse_get_req_for_background);
+
 /*
  * Return request in fuse_file->reserved_req.  However that may
  * currently be in use.  If that is the case, wait for it to become
@@ -165,7 +225,7 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
 	struct fuse_file *ff = file->private_data;
 
 	spin_lock(&fc->lock);
-	fuse_request_init(req);
+	fuse_request_init(req, req->pages, req->page_descs, req->max_pages);
 	BUG_ON(ff->reserved_req);
 	ff->reserved_req = req;
 	wake_up_all(&fc->reserved_req_waitq);
@@ -186,24 +246,37 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
  * filesystem should not have it's own file open.  If deadlock is
  * intentional, it can still be broken by "aborting" the filesystem.
  */
-struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file)
+struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
+					     struct file *file)
 {
 	struct fuse_req *req;
 
 	atomic_inc(&fc->num_waiting);
-	wait_event(fc->blocked_waitq, !fc->blocked);
-	req = fuse_request_alloc();
+	wait_event(fc->blocked_waitq, fc->initialized);
+	req = fuse_request_alloc(0);
 	if (!req)
 		req = get_reserved_req(fc, file);
 
 	fuse_req_init_context(req);
 	req->waiting = 1;
+	req->background = 0;
 	return req;
 }
 
 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	if (atomic_dec_and_test(&req->count)) {
+		if (unlikely(req->background)) {
+			/*
+			 * We get here in the unlikely case that a background
+			 * request was allocated but not sent
+			 */
+			spin_lock(&fc->lock);
+			if (!fc->blocked)
+				wake_up(&fc->blocked_waitq);
+			spin_unlock(&fc->lock);
+		}
+
 		if (req->waiting)
 			atomic_dec(&fc->num_waiting);
 
@@ -301,10 +374,15 @@ __releases(fc->lock)
 	list_del(&req->intr_entry);
 	req->state = FUSE_REQ_FINISHED;
 	if (req->background) {
-		if (fc->num_background == fc->max_background) {
+		req->background = 0;
+
+		if (fc->num_background == fc->max_background)
 			fc->blocked = 0;
-			wake_up_all(&fc->blocked_waitq);
-		}
+
+		/* Wake up next waiter, if any */
+		if (!fc->blocked && waitqueue_active(&fc->blocked_waitq))
+			wake_up(&fc->blocked_waitq);
+
 		if (fc->num_background == fc->congestion_threshold &&
 		    fc->connected && fc->bdi_initialized) {
 			clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
@@ -406,9 +484,9 @@ __acquires(fc->lock)
 	}
 }
 
-void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
-	req->isreply = 1;
+	BUG_ON(req->background);
 	spin_lock(&fc->lock);
 	if (!fc->connected)
 		req->out.h.error = -ENOTCONN;
@@ -425,12 +503,18 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 	}
 	spin_unlock(&fc->lock);
 }
+
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+{
+	req->isreply = 1;
+	__fuse_request_send(fc, req);
+}
 EXPORT_SYMBOL_GPL(fuse_request_send);
 
 static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
 					    struct fuse_req *req)
 {
-	req->background = 1;
+	BUG_ON(!req->background);
 	fc->num_background++;
 	if (fc->num_background == fc->max_background)
 		fc->blocked = 1;
@@ -491,6 +575,27 @@ void fuse_request_send_background_locked(struct fuse_conn *fc,
 	fuse_request_send_nowait_locked(fc, req);
 }
 
+void fuse_force_forget(struct file *file, u64 nodeid)
+{
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_forget_in inarg;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.nlookup = 1;
+	req = fuse_get_req_nofail_nopages(fc, file);
+	req->in.h.opcode = FUSE_FORGET;
+	req->in.h.nodeid = nodeid;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->isreply = 0;
+	__fuse_request_send(fc, req);
+	/* ignore errors */
+	fuse_put_request(fc, req);
+}
+
 /*
  * Lock the request.  Up to the next unlock_request() there mustn't be
  * anything that could cause a page-fault.  If the request was already
@@ -850,11 +955,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
 {
 	unsigned i;
 	struct fuse_req *req = cs->req;
-	unsigned offset = req->page_offset;
-	unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
 
 	for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
 		int err;
+		unsigned offset = req->page_descs[i].offset;
+		unsigned count = min(nbytes, req->page_descs[i].length);
 
 		err = fuse_copy_page(cs, &req->pages[i], offset, count,
 				     zeroing);
@@ -862,8 +967,6 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
 			return err;
 
 		nbytes -= count;
-		count = min(nbytes, (unsigned) PAGE_SIZE);
-		offset = 0;
 	}
 	return 0;
 }
@@ -1261,7 +1364,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
 		page_nr++;
 		ret += buf->len;
 
-		if (pipe->inode)
+		if (pipe->files)
 			do_wakeup = 1;
 	}
 
@@ -1536,29 +1639,34 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
 	unsigned int num;
 	unsigned int offset;
 	size_t total_len = 0;
+	int num_pages;
 
-	req = fuse_get_req(fc);
+	offset = outarg->offset & ~PAGE_CACHE_MASK;
+	file_size = i_size_read(inode);
+
+	num = outarg->size;
+	if (outarg->offset > file_size)
+		num = 0;
+	else if (outarg->offset + num > file_size)
+		num = file_size - outarg->offset;
+
+	num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ);
+
+	req = fuse_get_req(fc, num_pages);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	offset = outarg->offset & ~PAGE_CACHE_MASK;
-
 	req->in.h.opcode = FUSE_NOTIFY_REPLY;
 	req->in.h.nodeid = outarg->nodeid;
 	req->in.numargs = 2;
 	req->in.argpages = 1;
-	req->page_offset = offset;
+	req->page_descs[0].offset = offset;
 	req->end = fuse_retrieve_end;
 
 	index = outarg->offset >> PAGE_CACHE_SHIFT;
-	file_size = i_size_read(inode);
-	num = outarg->size;
-	if (outarg->offset > file_size)
-		num = 0;
-	else if (outarg->offset + num > file_size)
-		num = file_size - outarg->offset;
 
-	while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) {
+	while (num && req->num_pages < num_pages) {
 		struct page *page;
 		unsigned int this_num;
 
@@ -1568,6 +1676,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
 
 		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
 		req->pages[req->num_pages] = page;
+		req->page_descs[req->num_pages].length = this_num;
 		req->num_pages++;
 
 		offset = 0;
@@ -2007,6 +2116,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
 	if (fc->connected) {
 		fc->connected = 0;
 		fc->blocked = 0;
+		fc->initialized = 1;
 		end_io_requests(fc);
 		end_queued_requests(fc);
 		end_polls(fc);
@@ -2025,6 +2135,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
 		spin_lock(&fc->lock);
 		fc->connected = 0;
 		fc->blocked = 0;
+		fc->initialized = 1;
 		end_queued_requests(fc);
 		end_polls(fc);
 		wake_up_all(&fc->blocked_waitq);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b7c09f9eb40..0eda52738ec 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -14,6 +14,29 @@
 #include <linux/namei.h>
 #include <linux/slab.h>
 
+static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+{
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_inode *fi = get_fuse_inode(dir);
+
+	if (!fc->do_readdirplus)
+		return false;
+	if (!fc->readdirplus_auto)
+		return true;
+	if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
+		return true;
+	if (ctx->pos == 0)
+		return true;
+	return false;
+}
+
+static void fuse_advise_use_readdirplus(struct inode *dir)
+{
+	struct fuse_inode *fi = get_fuse_inode(dir);
+
+	set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
+}
+
 #if BITS_PER_LONG >= 64
 static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
 {
@@ -157,6 +180,8 @@ u64 fuse_get_attr_version(struct fuse_conn *fc)
 static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
 {
 	struct inode *inode;
+	struct dentry *parent;
+	struct fuse_conn *fc;
 
 	inode = ACCESS_ONCE(entry->d_inode);
 	if (inode && is_bad_inode(inode))
@@ -164,10 +189,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
 	else if (fuse_dentry_time(entry) < get_jiffies_64()) {
 		int err;
 		struct fuse_entry_out outarg;
-		struct fuse_conn *fc;
 		struct fuse_req *req;
 		struct fuse_forget_link *forget;
-		struct dentry *parent;
 		u64 attr_version;
 
 		/* For negative dentries, always do a fresh lookup */
@@ -178,7 +201,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
 			return -ECHILD;
 
 		fc = get_fuse_conn(inode);
-		req = fuse_get_req(fc);
+		req = fuse_get_req_nopages(fc);
 		if (IS_ERR(req))
 			return 0;
 
@@ -218,6 +241,13 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
 				       entry_attr_timeout(&outarg),
 				       attr_version);
 		fuse_change_entry_timeout(entry, &outarg);
+	} else if (inode) {
+		fc = get_fuse_conn(inode);
+		if (fc->readdirplus_auto) {
+			parent = dget_parent(entry);
+			fuse_advise_use_readdirplus(parent->d_inode);
+			dput(parent);
+		}
 	}
 	return 1;
 }
@@ -271,7 +301,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 	if (name->len > FUSE_NAME_MAX)
 		goto out;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	err = PTR_ERR(req);
 	if (IS_ERR(req))
 		goto out;
@@ -355,6 +385,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	else
 		fuse_invalidate_entry_cache(entry);
 
+	fuse_advise_use_readdirplus(dir);
 	return newent;
 
  out_iput:
@@ -391,7 +422,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	if (!forget)
 		goto out_err;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	err = PTR_ERR(req);
 	if (IS_ERR(req))
 		goto out_put_forget_req;
@@ -592,7 +623,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
 {
 	struct fuse_mknod_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -623,7 +654,7 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
 {
 	struct fuse_mkdir_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -647,7 +678,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
 {
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	unsigned len = strlen(link) + 1;
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -664,7 +695,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 {
 	int err;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -682,7 +713,14 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 
 		spin_lock(&fc->lock);
 		fi->attr_version = ++fc->attr_version;
-		drop_nlink(inode);
+		/*
+		 * If i_nlink == 0 then unlink doesn't make sense, yet this can
+		 * happen if userspace filesystem is careless.  It would be
+		 * difficult to enforce correct nlink usage so just ignore this
+		 * condition here
+		 */
+		if (inode->i_nlink > 0)
+			drop_nlink(inode);
 		spin_unlock(&fc->lock);
 		fuse_invalidate_attr(inode);
 		fuse_invalidate_attr(dir);
@@ -696,7 +734,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
 {
 	int err;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -723,7 +761,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 	int err;
 	struct fuse_rename_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(olddir);
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 
 	if (IS_ERR(req))
 		return PTR_ERR(req);
@@ -776,7 +814,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
 	struct fuse_link_in inarg;
 	struct inode *inode = entry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -848,7 +886,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
 	struct fuse_req *req;
 	u64 attr_version;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -985,7 +1023,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
 
 /*
  * Calling into a user-controlled filesystem gives the filesystem
- * daemon ptrace-like capabilities over the requester process.  This
+ * daemon ptrace-like capabilities over the current process.  This
  * means, that the filesystem daemon is able to record the exact
  * filesystem operations performed, and can also control the behavior
  * of the requester process in otherwise impossible ways.  For example
@@ -996,27 +1034,23 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
  * for which the owner of the mount has ptrace privilege.  This
  * excludes processes started by other users, suid or sgid processes.
  */
-int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
+int fuse_allow_current_process(struct fuse_conn *fc)
 {
 	const struct cred *cred;
-	int ret;
 
 	if (fc->flags & FUSE_ALLOW_OTHER)
 		return 1;
 
-	rcu_read_lock();
-	ret = 0;
-	cred = __task_cred(task);
+	cred = current_cred();
 	if (uid_eq(cred->euid, fc->user_id) &&
 	    uid_eq(cred->suid, fc->user_id) &&
 	    uid_eq(cred->uid,  fc->user_id) &&
 	    gid_eq(cred->egid, fc->group_id) &&
 	    gid_eq(cred->sgid, fc->group_id) &&
 	    gid_eq(cred->gid,  fc->group_id))
-		ret = 1;
-	rcu_read_unlock();
+		return 1;
 
-	return ret;
+	return 0;
 }
 
 static int fuse_access(struct inode *inode, int mask)
@@ -1029,7 +1063,7 @@ static int fuse_access(struct inode *inode, int mask)
 	if (fc->no_access)
 		return 0;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1077,7 +1111,7 @@ static int fuse_permission(struct inode *inode, int mask)
 	bool refreshed = false;
 	int err = 0;
 
-	if (!fuse_allow_task(fc, current))
+	if (!fuse_allow_current_process(fc))
 		return -EACCES;
 
 	/*
@@ -1131,43 +1165,178 @@ static int fuse_permission(struct inode *inode, int mask)
 }
 
 static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
-			 void *dstbuf, filldir_t filldir)
+			 struct dir_context *ctx)
 {
 	while (nbytes >= FUSE_NAME_OFFSET) {
 		struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
 		size_t reclen = FUSE_DIRENT_SIZE(dirent);
-		int over;
 		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
 			return -EIO;
 		if (reclen > nbytes)
 			break;
 
-		over = filldir(dstbuf, dirent->name, dirent->namelen,
-			       file->f_pos, dirent->ino, dirent->type);
-		if (over)
+		if (!dir_emit(ctx, dirent->name, dirent->namelen,
+			       dirent->ino, dirent->type))
 			break;
 
 		buf += reclen;
 		nbytes -= reclen;
-		file->f_pos = dirent->off;
+		ctx->pos = dirent->off;
 	}
 
 	return 0;
 }
 
-static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
+static int fuse_direntplus_link(struct file *file,
+				struct fuse_direntplus *direntplus,
+				u64 attr_version)
 {
 	int err;
+	struct fuse_entry_out *o = &direntplus->entry_out;
+	struct fuse_dirent *dirent = &direntplus->dirent;
+	struct dentry *parent = file->f_path.dentry;
+	struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
+	struct dentry *dentry;
+	struct dentry *alias;
+	struct inode *dir = parent->d_inode;
+	struct fuse_conn *fc;
+	struct inode *inode;
+
+	if (!o->nodeid) {
+		/*
+		 * Unlike in the case of fuse_lookup, zero nodeid does not mean
+		 * ENOENT. Instead, it only means the userspace filesystem did
+		 * not want to return attributes/handle for this entry.
+		 *
+		 * So do nothing.
+		 */
+		return 0;
+	}
+
+	if (name.name[0] == '.') {
+		/*
+		 * We could potentially refresh the attributes of the directory
+		 * and its parent?
+		 */
+		if (name.len == 1)
+			return 0;
+		if (name.name[1] == '.' && name.len == 2)
+			return 0;
+	}
+	fc = get_fuse_conn(dir);
+
+	name.hash = full_name_hash(name.name, name.len);
+	dentry = d_lookup(parent, &name);
+	if (dentry && dentry->d_inode) {
+		inode = dentry->d_inode;
+		if (get_node_id(inode) == o->nodeid) {
+			struct fuse_inode *fi;
+			fi = get_fuse_inode(inode);
+			spin_lock(&fc->lock);
+			fi->nlookup++;
+			spin_unlock(&fc->lock);
+
+			/*
+			 * The other branch to 'found' comes via fuse_iget()
+			 * which bumps nlookup inside
+			 */
+			goto found;
+		}
+		err = d_invalidate(dentry);
+		if (err)
+			goto out;
+		dput(dentry);
+		dentry = NULL;
+	}
+
+	dentry = d_alloc(parent, &name);
+	err = -ENOMEM;
+	if (!dentry)
+		goto out;
+
+	inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
+			  &o->attr, entry_attr_timeout(o), attr_version);
+	if (!inode)
+		goto out;
+
+	alias = d_materialise_unique(dentry, inode);
+	err = PTR_ERR(alias);
+	if (IS_ERR(alias))
+		goto out;
+	if (alias) {
+		dput(dentry);
+		dentry = alias;
+	}
+
+found:
+	fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o),
+			       attr_version);
+
+	fuse_change_entry_timeout(dentry, o);
+
+	err = 0;
+out:
+	if (dentry)
+		dput(dentry);
+	return err;
+}
+
+static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
+			     struct dir_context *ctx, u64 attr_version)
+{
+	struct fuse_direntplus *direntplus;
+	struct fuse_dirent *dirent;
+	size_t reclen;
+	int over = 0;
+	int ret;
+
+	while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
+		direntplus = (struct fuse_direntplus *) buf;
+		dirent = &direntplus->dirent;
+		reclen = FUSE_DIRENTPLUS_SIZE(direntplus);
+
+		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+			return -EIO;
+		if (reclen > nbytes)
+			break;
+
+		if (!over) {
+			/* We fill entries into dstbuf only as much as
+			   it can hold. But we still continue iterating
+			   over remaining entries to link them. If not,
+			   we need to send a FORGET for each of those
+			   which we did not link.
+			*/
+			over = !dir_emit(ctx, dirent->name, dirent->namelen,
+				       dirent->ino, dirent->type);
+			ctx->pos = dirent->off;
+		}
+
+		buf += reclen;
+		nbytes -= reclen;
+
+		ret = fuse_direntplus_link(file, direntplus, attr_version);
+		if (ret)
+			fuse_force_forget(file, direntplus->entry_out.nodeid);
+	}
+
+	return 0;
+}
+
+static int fuse_readdir(struct file *file, struct dir_context *ctx)
+{
+	int plus, err;
 	size_t nbytes;
 	struct page *page;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_req *req;
+	u64 attr_version = 0;
 
 	if (is_bad_inode(inode))
 		return -EIO;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req(fc, 1);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1176,17 +1345,34 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 		fuse_put_request(fc, req);
 		return -ENOMEM;
 	}
+
+	plus = fuse_use_readdirplus(inode, ctx);
 	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
-	fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR);
+	req->page_descs[0].length = PAGE_SIZE;
+	if (plus) {
+		attr_version = fuse_get_attr_version(fc);
+		fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
+			       FUSE_READDIRPLUS);
+	} else {
+		fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
+			       FUSE_READDIR);
+	}
 	fuse_request_send(fc, req);
 	nbytes = req->out.args[0].size;
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
-	if (!err)
-		err = parse_dirfile(page_address(page), nbytes, file, dstbuf,
-				    filldir);
+	if (!err) {
+		if (plus) {
+			err = parse_dirplusfile(page_address(page), nbytes,
+						file, ctx,
+						attr_version);
+		} else {
+			err = parse_dirfile(page_address(page), nbytes, file,
+					    ctx);
+		}
+	}
 
 	__free_page(page);
 	fuse_invalidate_attr(inode); /* atime changed */
@@ -1197,7 +1383,7 @@ static char *read_link(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 	char *link;
 
 	if (IS_ERR(req))
@@ -1379,10 +1565,9 @@ void fuse_release_nowrite(struct inode *inode)
  * vmtruncate() doesn't allow for this case, so do the rlimit checking
  * and the actual truncation by hand.
  */
-static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
-			   struct file *file)
+int fuse_do_setattr(struct inode *inode, struct iattr *attr,
+		    struct file *file)
 {
-	struct inode *inode = entry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_req *req;
 	struct fuse_setattr_in inarg;
@@ -1391,9 +1576,6 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
 	loff_t oldsize;
 	int err;
 
-	if (!fuse_allow_task(fc, current))
-		return -EACCES;
-
 	if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
 		attr->ia_valid |= ATTR_FORCE;
 
@@ -1410,7 +1592,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
 	if (attr->ia_valid & ATTR_SIZE)
 		is_truncate = true;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1488,10 +1670,15 @@ error:
 
 static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 {
+	struct inode *inode = entry->d_inode;
+
+	if (!fuse_allow_current_process(get_fuse_conn(inode)))
+		return -EACCES;
+
 	if (attr->ia_valid & ATTR_FILE)
-		return fuse_do_setattr(entry, attr, attr->ia_file);
+		return fuse_do_setattr(inode, attr, attr->ia_file);
 	else
-		return fuse_do_setattr(entry, attr, NULL);
+		return fuse_do_setattr(inode, attr, NULL);
 }
 
 static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
@@ -1500,7 +1687,7 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
 	struct inode *inode = entry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
-	if (!fuse_allow_task(fc, current))
+	if (!fuse_allow_current_process(fc))
 		return -EACCES;
 
 	return fuse_update_attributes(inode, stat, NULL, NULL);
@@ -1518,7 +1705,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
 	if (fc->no_setxattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1557,7 +1744,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
 	if (fc->no_getxattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1603,13 +1790,13 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
 	struct fuse_getxattr_out outarg;
 	ssize_t ret;
 
-	if (!fuse_allow_task(fc, current))
+	if (!fuse_allow_current_process(fc))
 		return -EACCES;
 
 	if (fc->no_listxattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1654,7 +1841,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
 	if (fc->no_removexattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1696,7 +1883,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
 static const struct file_operations fuse_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= fuse_readdir,
+	.iterate	= fuse_readdir,
 	.open		= fuse_dir_open,
 	.release	= fuse_dir_release,
 	.fsync		= fuse_dir_fsync,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f3ab824fa30..5c121fe19c5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -15,6 +15,8 @@
 #include <linux/module.h>
 #include <linux/compat.h>
 #include <linux/swap.h>
+#include <linux/aio.h>
+#include <linux/falloc.h>
 
 static const struct file_operations fuse_direct_io_file_operations;
 
@@ -25,7 +27,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 	struct fuse_req *req;
 	int err;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -57,7 +59,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 		return NULL;
 
 	ff->fc = fc;
-	ff->reserved_req = fuse_request_alloc();
+	ff->reserved_req = fuse_request_alloc(0);
 	if (unlikely(!ff->reserved_req)) {
 		kfree(ff);
 		return NULL;
@@ -126,11 +128,13 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
 		struct fuse_req *req = ff->reserved_req;
 
 		if (sync) {
+			req->background = 0;
 			fuse_request_send(ff->fc, req);
 			path_put(&req->misc.release.path);
 			fuse_put_request(ff->fc, req);
 		} else {
 			req->end = fuse_release_end;
+			req->background = 1;
 			fuse_request_send_background(ff->fc, req);
 		}
 		kfree(ff);
@@ -282,6 +286,7 @@ void fuse_sync_release(struct fuse_file *ff, int flags)
 	WARN_ON(atomic_read(&ff->count) > 1);
 	fuse_prepare_release(ff, flags, FUSE_RELEASE);
 	ff->reserved_req->force = 1;
+	ff->reserved_req->background = 0;
 	fuse_request_send(ff->fc, ff->reserved_req);
 	fuse_put_request(ff->fc, ff->reserved_req);
 	kfree(ff);
@@ -355,7 +360,7 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
 
 static int fuse_flush(struct file *file, fl_owner_t id)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_file *ff = file->private_data;
 	struct fuse_req *req;
@@ -368,7 +373,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
 	if (fc->no_flush)
 		return 0;
 
-	req = fuse_get_req_nofail(fc, file);
+	req = fuse_get_req_nofail_nopages(fc, file);
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
 	inarg.lock_owner = fuse_lock_owner_id(fc, id);
@@ -436,7 +441,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
 
 	fuse_sync_writes(inode);
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		goto out;
@@ -491,9 +496,114 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
 	req->out.args[0].size = count;
 }
 
-static size_t fuse_send_read(struct fuse_req *req, struct file *file,
+static void fuse_release_user_pages(struct fuse_req *req, int write)
+{
+	unsigned i;
+
+	for (i = 0; i < req->num_pages; i++) {
+		struct page *page = req->pages[i];
+		if (write)
+			set_page_dirty_lock(page);
+		put_page(page);
+	}
+}
+
+/**
+ * In case of short read, the caller sets 'pos' to the position of
+ * actual end of fuse request in IO request. Otherwise, if bytes_requested
+ * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
+ *
+ * An example:
+ * User requested DIO read of 64K. It was splitted into two 32K fuse requests,
+ * both submitted asynchronously. The first of them was ACKed by userspace as
+ * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
+ * second request was ACKed as short, e.g. only 1K was read, resulting in
+ * pos == 33K.
+ *
+ * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
+ * will be equal to the length of the longest contiguous fragment of
+ * transferred data starting from the beginning of IO request.
+ */
+static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
+{
+	int left;
+
+	spin_lock(&io->lock);
+	if (err)
+		io->err = io->err ? : err;
+	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
+		io->bytes = pos;
+
+	left = --io->reqs;
+	spin_unlock(&io->lock);
+
+	if (!left) {
+		long res;
+
+		if (io->err)
+			res = io->err;
+		else if (io->bytes >= 0 && io->write)
+			res = -EIO;
+		else {
+			res = io->bytes < 0 ? io->size : io->bytes;
+
+			if (!is_sync_kiocb(io->iocb)) {
+				struct inode *inode = file_inode(io->iocb->ki_filp);
+				struct fuse_conn *fc = get_fuse_conn(inode);
+				struct fuse_inode *fi = get_fuse_inode(inode);
+
+				spin_lock(&fc->lock);
+				fi->attr_version = ++fc->attr_version;
+				spin_unlock(&fc->lock);
+			}
+		}
+
+		aio_complete(io->iocb, res, 0);
+		kfree(io);
+	}
+}
+
+static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct fuse_io_priv *io = req->io;
+	ssize_t pos = -1;
+
+	fuse_release_user_pages(req, !io->write);
+
+	if (io->write) {
+		if (req->misc.write.in.size != req->misc.write.out.size)
+			pos = req->misc.write.in.offset - io->offset +
+				req->misc.write.out.size;
+	} else {
+		if (req->misc.read.in.size != req->out.args[0].size)
+			pos = req->misc.read.in.offset - io->offset +
+				req->out.args[0].size;
+	}
+
+	fuse_aio_complete(io, req->out.h.error, pos);
+}
+
+static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
+		size_t num_bytes, struct fuse_io_priv *io)
+{
+	spin_lock(&io->lock);
+	io->size += num_bytes;
+	io->reqs++;
+	spin_unlock(&io->lock);
+
+	req->io = io;
+	req->end = fuse_aio_complete_req;
+
+	__fuse_get_request(req);
+	fuse_request_send_background(fc, req);
+
+	return num_bytes;
+}
+
+static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,
 			     loff_t pos, size_t count, fl_owner_t owner)
 {
+	struct file *file = io->file;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
 
@@ -504,6 +614,10 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file,
 		inarg->read_flags |= FUSE_READ_LOCKOWNER;
 		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
 	}
+
+	if (io->async)
+		return fuse_async_req_send(fc, req, count, io);
+
 	fuse_request_send(fc, req);
 	return req->out.args[0].size;
 }
@@ -524,6 +638,7 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
 
 static int fuse_readpage(struct file *file, struct page *page)
 {
+	struct fuse_io_priv io = { .async = 0, .file = file };
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_req *req;
@@ -544,7 +659,7 @@ static int fuse_readpage(struct file *file, struct page *page)
 	 */
 	fuse_wait_on_page_writeback(inode, page->index);
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req(fc, 1);
 	err = PTR_ERR(req);
 	if (IS_ERR(req))
 		goto out;
@@ -555,7 +670,8 @@ static int fuse_readpage(struct file *file, struct page *page)
 	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
-	num_read = fuse_send_read(req, file, pos, count, NULL);
+	req->page_descs[0].length = count;
+	num_read = fuse_send_read(req, &io, pos, count, NULL);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 
@@ -641,6 +757,7 @@ struct fuse_fill_data {
 	struct fuse_req *req;
 	struct file *file;
 	struct inode *inode;
+	unsigned nr_pages;
 };
 
 static int fuse_readpages_fill(void *_data, struct page *page)
@@ -656,16 +773,31 @@ static int fuse_readpages_fill(void *_data, struct page *page)
 	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
 	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
 	     req->pages[req->num_pages - 1]->index + 1 != page->index)) {
+		int nr_alloc = min_t(unsigned, data->nr_pages,
+				     FUSE_MAX_PAGES_PER_REQ);
 		fuse_send_readpages(req, data->file);
-		data->req = req = fuse_get_req(fc);
+		if (fc->async_read)
+			req = fuse_get_req_for_background(fc, nr_alloc);
+		else
+			req = fuse_get_req(fc, nr_alloc);
+
+		data->req = req;
 		if (IS_ERR(req)) {
 			unlock_page(page);
 			return PTR_ERR(req);
 		}
 	}
+
+	if (WARN_ON(req->num_pages >= req->max_pages)) {
+		fuse_put_request(fc, req);
+		return -EIO;
+	}
+
 	page_cache_get(page);
 	req->pages[req->num_pages] = page;
+	req->page_descs[req->num_pages].length = PAGE_SIZE;
 	req->num_pages++;
+	data->nr_pages--;
 	return 0;
 }
 
@@ -676,6 +808,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_fill_data data;
 	int err;
+	int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ);
 
 	err = -EIO;
 	if (is_bad_inode(inode))
@@ -683,7 +816,11 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
 
 	data.file = file;
 	data.inode = inode;
-	data.req = fuse_get_req(fc);
+	if (fc->async_read)
+		data.req = fuse_get_req_for_background(fc, nr_alloc);
+	else
+		data.req = fuse_get_req(fc, nr_alloc);
+	data.nr_pages = nr_pages;
 	err = PTR_ERR(data.req);
 	if (IS_ERR(data.req))
 		goto out;
@@ -744,9 +881,10 @@ static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
 	req->out.args[0].value = outarg;
 }
 
-static size_t fuse_send_write(struct fuse_req *req, struct file *file,
+static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
 			      loff_t pos, size_t count, fl_owner_t owner)
 {
+	struct file *file = io->file;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
 	struct fuse_write_in *inarg = &req->misc.write.in;
@@ -757,6 +895,10 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
 		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
 		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
 	}
+
+	if (io->async)
+		return fuse_async_req_send(fc, req, count, io);
+
 	fuse_request_send(fc, req);
 	return req->misc.write.out.size;
 }
@@ -780,13 +922,14 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
 	size_t res;
 	unsigned offset;
 	unsigned i;
+	struct fuse_io_priv io = { .async = 0, .file = file };
 
 	for (i = 0; i < req->num_pages; i++)
 		fuse_wait_on_page_writeback(inode, req->pages[i]->index);
 
-	res = fuse_send_write(req, file, pos, count, NULL);
+	res = fuse_send_write(req, &io, pos, count, NULL);
 
-	offset = req->page_offset;
+	offset = req->page_descs[0].offset;
 	count = res;
 	for (i = 0; i < req->num_pages; i++) {
 		struct page *page = req->pages[i];
@@ -817,7 +960,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 	int err;
 
 	req->in.argpages = 1;
-	req->page_offset = offset;
+	req->page_descs[0].offset = offset;
 
 	do {
 		size_t tmp;
@@ -857,6 +1000,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 
 		err = 0;
 		req->pages[req->num_pages] = page;
+		req->page_descs[req->num_pages].length = tmp;
 		req->num_pages++;
 
 		iov_iter_advance(ii, tmp);
@@ -869,11 +1013,19 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 		if (!fc->big_writes)
 			break;
 	} while (iov_iter_count(ii) && count < fc->max_write &&
-		 req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0);
+		 req->num_pages < req->max_pages && offset == 0);
 
 	return count > 0 ? count : err;
 }
 
+static inline unsigned fuse_wr_pages(loff_t pos, size_t len)
+{
+	return min_t(unsigned,
+		     ((pos + len - 1) >> PAGE_CACHE_SHIFT) -
+		     (pos >> PAGE_CACHE_SHIFT) + 1,
+		     FUSE_MAX_PAGES_PER_REQ);
+}
+
 static ssize_t fuse_perform_write(struct file *file,
 				  struct address_space *mapping,
 				  struct iov_iter *ii, loff_t pos)
@@ -889,8 +1041,9 @@ static ssize_t fuse_perform_write(struct file *file,
 	do {
 		struct fuse_req *req;
 		ssize_t count;
+		unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii));
 
-		req = fuse_get_req(fc);
+		req = fuse_get_req(fc, nr_pages);
 		if (IS_ERR(req)) {
 			err = PTR_ERR(req);
 			break;
@@ -947,7 +1100,6 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		return err;
 
 	count = ocount;
-	sb_start_write(inode->i_sb);
 	mutex_lock(&inode->i_mutex);
 
 	/* We can write back this queue in page reclaim */
@@ -1006,73 +1158,130 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 out:
 	current->backing_dev_info = NULL;
 	mutex_unlock(&inode->i_mutex);
-	sb_end_write(inode->i_sb);
 
 	return written ? written : err;
 }
 
-static void fuse_release_user_pages(struct fuse_req *req, int write)
+static inline void fuse_page_descs_length_init(struct fuse_req *req,
+		unsigned index, unsigned nr_pages)
 {
-	unsigned i;
+	int i;
 
-	for (i = 0; i < req->num_pages; i++) {
-		struct page *page = req->pages[i];
-		if (write)
-			set_page_dirty_lock(page);
-		put_page(page);
-	}
+	for (i = index; i < index + nr_pages; i++)
+		req->page_descs[i].length = PAGE_SIZE -
+			req->page_descs[i].offset;
 }
 
-static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
+static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
+{
+	return (unsigned long)ii->iov->iov_base + ii->iov_offset;
+}
+
+static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
+					size_t max_size)
+{
+	return min(iov_iter_single_seg_count(ii), max_size);
+}
+
+static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
 			       size_t *nbytesp, int write)
 {
-	size_t nbytes = *nbytesp;
-	unsigned long user_addr = (unsigned long) buf;
-	unsigned offset = user_addr & ~PAGE_MASK;
-	int npages;
+	size_t nbytes = 0;  /* # bytes already packed in req */
 
 	/* Special case for kernel I/O: can copy directly into the buffer */
 	if (segment_eq(get_fs(), KERNEL_DS)) {
+		unsigned long user_addr = fuse_get_user_addr(ii);
+		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
+
 		if (write)
 			req->in.args[1].value = (void *) user_addr;
 		else
 			req->out.args[0].value = (void *) user_addr;
 
+		iov_iter_advance(ii, frag_size);
+		*nbytesp = frag_size;
 		return 0;
 	}
 
-	nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
-	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
-	npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
-	if (npages < 0)
-		return npages;
+	while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
+		unsigned npages;
+		unsigned long user_addr = fuse_get_user_addr(ii);
+		unsigned offset = user_addr & ~PAGE_MASK;
+		size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes);
+		int ret;
+
+		unsigned n = req->max_pages - req->num_pages;
+		frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT);
 
-	req->num_pages = npages;
-	req->page_offset = offset;
+		npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		npages = clamp(npages, 1U, n);
+
+		ret = get_user_pages_fast(user_addr, npages, !write,
+					  &req->pages[req->num_pages]);
+		if (ret < 0)
+			return ret;
+
+		npages = ret;
+		frag_size = min_t(size_t, frag_size,
+				  (npages << PAGE_SHIFT) - offset);
+		iov_iter_advance(ii, frag_size);
+
+		req->page_descs[req->num_pages].offset = offset;
+		fuse_page_descs_length_init(req, req->num_pages, npages);
+
+		req->num_pages += npages;
+		req->page_descs[req->num_pages - 1].length -=
+			(npages << PAGE_SHIFT) - offset - frag_size;
+
+		nbytes += frag_size;
+	}
 
 	if (write)
 		req->in.argpages = 1;
 	else
 		req->out.argpages = 1;
 
-	nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
-	*nbytesp = min(*nbytesp, nbytes);
+	*nbytesp = nbytes;
 
 	return 0;
 }
 
-ssize_t fuse_direct_io(struct file *file, const char __user *buf,
-		       size_t count, loff_t *ppos, int write)
+static inline int fuse_iter_npages(const struct iov_iter *ii_p)
 {
+	struct iov_iter ii = *ii_p;
+	int npages = 0;
+
+	while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) {
+		unsigned long user_addr = fuse_get_user_addr(&ii);
+		unsigned offset = user_addr & ~PAGE_MASK;
+		size_t frag_size = iov_iter_single_seg_count(&ii);
+
+		npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		iov_iter_advance(&ii, frag_size);
+	}
+
+	return min(npages, FUSE_MAX_PAGES_PER_REQ);
+}
+
+ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
+		       unsigned long nr_segs, size_t count, loff_t *ppos,
+		       int write)
+{
+	struct file *file = io->file;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
 	size_t nmax = write ? fc->max_write : fc->max_read;
 	loff_t pos = *ppos;
 	ssize_t res = 0;
 	struct fuse_req *req;
+	struct iov_iter ii;
+
+	iov_iter_init(&ii, iov, nr_segs, count, 0);
 
-	req = fuse_get_req(fc);
+	if (io->async)
+		req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii));
+	else
+		req = fuse_get_req(fc, fuse_iter_npages(&ii));
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1080,18 +1289,19 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 		size_t nres;
 		fl_owner_t owner = current->files;
 		size_t nbytes = min(count, nmax);
-		int err = fuse_get_user_pages(req, buf, &nbytes, write);
+		int err = fuse_get_user_pages(req, &ii, &nbytes, write);
 		if (err) {
 			res = err;
 			break;
 		}
 
 		if (write)
-			nres = fuse_send_write(req, file, pos, nbytes, owner);
+			nres = fuse_send_write(req, io, pos, nbytes, owner);
 		else
-			nres = fuse_send_read(req, file, pos, nbytes, owner);
+			nres = fuse_send_read(req, io, pos, nbytes, owner);
 
-		fuse_release_user_pages(req, !write);
+		if (!io->async)
+			fuse_release_user_pages(req, !write);
 		if (req->out.h.error) {
 			if (!res)
 				res = req->out.h.error;
@@ -1103,12 +1313,15 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 		count -= nres;
 		res += nres;
 		pos += nres;
-		buf += nres;
 		if (nres != nbytes)
 			break;
 		if (count) {
 			fuse_put_request(fc, req);
-			req = fuse_get_req(fc);
+			if (io->async)
+				req = fuse_get_req_for_background(fc,
+					fuse_iter_npages(&ii));
+			else
+				req = fuse_get_req(fc, fuse_iter_npages(&ii));
 			if (IS_ERR(req))
 				break;
 		}
@@ -1122,34 +1335,45 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 }
 EXPORT_SYMBOL_GPL(fuse_direct_io);
 
-static ssize_t fuse_direct_read(struct file *file, char __user *buf,
-				     size_t count, loff_t *ppos)
+static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
+				  const struct iovec *iov,
+				  unsigned long nr_segs, loff_t *ppos,
+				  size_t count)
 {
 	ssize_t res;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct file *file = io->file;
+	struct inode *inode = file_inode(file);
 
 	if (is_bad_inode(inode))
 		return -EIO;
 
-	res = fuse_direct_io(file, buf, count, ppos, 0);
+	res = fuse_direct_io(io, iov, nr_segs, count, ppos, 0);
 
 	fuse_invalidate_attr(inode);
 
 	return res;
 }
 
-static ssize_t __fuse_direct_write(struct file *file, const char __user *buf,
-				   size_t count, loff_t *ppos)
+static ssize_t fuse_direct_read(struct file *file, char __user *buf,
+				     size_t count, loff_t *ppos)
+{
+	struct fuse_io_priv io = { .async = 0, .file = file };
+	struct iovec iov = { .iov_base = buf, .iov_len = count };
+	return __fuse_direct_read(&io, &iov, 1, ppos, count);
+}
+
+static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
+				   const struct iovec *iov,
+				   unsigned long nr_segs, loff_t *ppos)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct file *file = io->file;
+	struct inode *inode = file_inode(file);
+	size_t count = iov_length(iov, nr_segs);
 	ssize_t res;
 
 	res = generic_write_checks(file, ppos, &count, 0);
-	if (!res) {
-		res = fuse_direct_io(file, buf, count, ppos, 1);
-		if (res > 0)
-			fuse_write_update_size(inode, *ppos);
-	}
+	if (!res)
+		res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1);
 
 	fuse_invalidate_attr(inode);
 
@@ -1159,15 +1383,19 @@ static ssize_t __fuse_direct_write(struct file *file, const char __user *buf,
 static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
 				 size_t count, loff_t *ppos)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
+	struct inode *inode = file_inode(file);
 	ssize_t res;
+	struct fuse_io_priv io = { .async = 0, .file = file };
 
 	if (is_bad_inode(inode))
 		return -EIO;
 
 	/* Don't allow parallel writes to the same file */
 	mutex_lock(&inode->i_mutex);
-	res = __fuse_direct_write(file, buf, count, ppos);
+	res = __fuse_direct_write(&io, &iov, 1, ppos);
+	if (res > 0)
+		fuse_write_update_size(inode, *ppos);
 	mutex_unlock(&inode->i_mutex);
 
 	return res;
@@ -1272,10 +1500,11 @@ static int fuse_writepage_locked(struct page *page)
 
 	set_page_writeback(page);
 
-	req = fuse_request_alloc_nofs();
+	req = fuse_request_alloc_nofs(1);
 	if (!req)
 		goto err;
 
+	req->background = 1; /* writeback always goes to bg_queue */
 	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 	if (!tmp_page)
 		goto err_free;
@@ -1293,7 +1522,8 @@ static int fuse_writepage_locked(struct page *page)
 	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = tmp_page;
-	req->page_offset = 0;
+	req->page_descs[0].offset = 0;
+	req->page_descs[0].length = PAGE_SIZE;
 	req->end = fuse_writepage_end;
 	req->inode = inode;
 
@@ -1385,7 +1615,7 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
-		struct inode *inode = file->f_dentry->d_inode;
+		struct inode *inode = file_inode(file);
 		struct fuse_conn *fc = get_fuse_conn(inode);
 		struct fuse_inode *fi = get_fuse_inode(inode);
 		struct fuse_file *ff = file->private_data;
@@ -1443,7 +1673,7 @@ static void fuse_lk_fill(struct fuse_req *req, struct file *file,
 			 const struct file_lock *fl, int opcode, pid_t pid,
 			 int flock)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_file *ff = file->private_data;
 	struct fuse_lk_in *arg = &req->misc.lk_in;
@@ -1465,13 +1695,13 @@ static void fuse_lk_fill(struct fuse_req *req, struct file *file,
 
 static int fuse_getlk(struct file *file, struct file_lock *fl)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_req *req;
 	struct fuse_lk_out outarg;
 	int err;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1490,7 +1720,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
 
 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_req *req;
 	int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
@@ -1506,7 +1736,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
 	if (fl->fl_flags & FL_CLOSE)
 		return 0;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1522,7 +1752,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
 
 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	int err;
 
@@ -1545,7 +1775,7 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
 
 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	int err;
 
@@ -1575,7 +1805,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
 	if (!inode->i_sb->s_bdev || fc->no_bmap)
 		return 0;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return 0;
 
@@ -1602,7 +1832,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
 {
 	loff_t retval;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 
 	/* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
 	if (whence == SEEK_CUR || whence == SEEK_SET)
@@ -1873,7 +2103,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 		num_pages++;
 	}
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req(fc, num_pages);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		req = NULL;
@@ -1881,6 +2111,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 	}
 	memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
 	req->num_pages = num_pages;
+	fuse_page_descs_length_init(req, 0, req->num_pages);
 
 	/* okay, let's send it to the client */
 	req->in.h.opcode = FUSE_IOCTL;
@@ -1978,10 +2209,10 @@ EXPORT_SYMBOL_GPL(fuse_do_ioctl);
 long fuse_ioctl_common(struct file *file, unsigned int cmd,
 		       unsigned long arg, unsigned int flags)
 {
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
-	if (!fuse_allow_task(fc, current))
+	if (!fuse_allow_current_process(fc))
 		return -EACCES;
 
 	if (is_bad_inode(inode))
@@ -2066,6 +2297,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait)
 		return DEFAULT_POLLMASK;
 
 	poll_wait(file, &ff->poll_wait, wait);
+	inarg.events = (__u32)poll_requested_events(wait);
 
 	/*
 	 * Ask for notification iff there's someone waiting for it.
@@ -2076,7 +2308,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait)
 		fuse_register_polled_file(fc, ff);
 	}
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return POLLERR;
 
@@ -2126,53 +2358,99 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc,
 	return 0;
 }
 
-static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov,
-			     unsigned long nr_segs, loff_t *ppos, int rw)
+static void fuse_do_truncate(struct file *file)
 {
-	const struct iovec *vector = iov;
-	ssize_t ret = 0;
-
-	while (nr_segs > 0) {
-		void __user *base;
-		size_t len;
-		ssize_t nr;
-
-		base = vector->iov_base;
-		len = vector->iov_len;
-		vector++;
-		nr_segs--;
+	struct inode *inode = file->f_mapping->host;
+	struct iattr attr;
 
-		if (rw == WRITE)
-			nr = __fuse_direct_write(filp, base, len, ppos);
-		else
-			nr = fuse_direct_read(filp, base, len, ppos);
+	attr.ia_valid = ATTR_SIZE;
+	attr.ia_size = i_size_read(inode);
 
-		if (nr < 0) {
-			if (!ret)
-				ret = nr;
-			break;
-		}
-		ret += nr;
-		if (nr != len)
-			break;
-	}
+	attr.ia_file = file;
+	attr.ia_valid |= ATTR_FILE;
 
-	return ret;
+	fuse_do_setattr(inode, &attr, file);
 }
 
+static inline loff_t fuse_round_up(loff_t off)
+{
+	return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+}
 
 static ssize_t
 fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs)
 {
 	ssize_t ret = 0;
-	struct file *file = NULL;
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
+	bool async_dio = ff->fc->async_dio;
 	loff_t pos = 0;
+	struct inode *inode;
+	loff_t i_size;
+	size_t count = iov_length(iov, nr_segs);
+	struct fuse_io_priv *io;
 
-	file = iocb->ki_filp;
 	pos = offset;
+	inode = file->f_mapping->host;
+	i_size = i_size_read(inode);
+
+	/* optimization for short read */
+	if (async_dio && rw != WRITE && offset + count > i_size) {
+		if (offset >= i_size)
+			return 0;
+		count = min_t(loff_t, count, fuse_round_up(i_size - offset));
+	}
+
+	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
+	if (!io)
+		return -ENOMEM;
+	spin_lock_init(&io->lock);
+	io->reqs = 1;
+	io->bytes = -1;
+	io->size = 0;
+	io->offset = offset;
+	io->write = (rw == WRITE);
+	io->err = 0;
+	io->file = file;
+	/*
+	 * By default, we want to optimize all I/Os with async request
+	 * submission to the client filesystem if supported.
+	 */
+	io->async = async_dio;
+	io->iocb = iocb;
 
-	ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw);
+	/*
+	 * We cannot asynchronously extend the size of a file. We have no method
+	 * to wait on real async I/O requests, so we must submit this request
+	 * synchronously.
+	 */
+	if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE)
+		io->async = false;
+
+	if (rw == WRITE)
+		ret = __fuse_direct_write(io, iov, nr_segs, &pos);
+	else
+		ret = __fuse_direct_read(io, iov, nr_segs, &pos, count);
+
+	if (io->async) {
+		fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
+
+		/* we have a non-extending, async request, so return */
+		if (!is_sync_kiocb(iocb))
+			return -EIOCBQUEUED;
+
+		ret = wait_on_sync_kiocb(iocb);
+	} else {
+		kfree(io);
+	}
+
+	if (rw == WRITE) {
+		if (ret > 0)
+			fuse_write_update_size(inode, pos);
+		else if (ret < 0 && offset + count > i_size)
+			fuse_do_truncate(file);
+	}
 
 	return ret;
 }
@@ -2181,6 +2459,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 				loff_t length)
 {
 	struct fuse_file *ff = file->private_data;
+	struct inode *inode = file->f_inode;
 	struct fuse_conn *fc = ff->fc;
 	struct fuse_req *req;
 	struct fuse_fallocate_in inarg = {
@@ -2190,13 +2469,23 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 		.mode = mode
 	};
 	int err;
+	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
+			   (mode & FALLOC_FL_PUNCH_HOLE);
 
 	if (fc->no_fallocate)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_req(fc);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	if (lock_inode) {
+		mutex_lock(&inode->i_mutex);
+		if (mode & FALLOC_FL_PUNCH_HOLE)
+			fuse_set_nowrite(inode);
+	}
+
+	req = fuse_get_req_nopages(fc);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
 
 	req->in.h.opcode = FUSE_FALLOCATE;
 	req->in.h.nodeid = ff->nodeid;
@@ -2211,6 +2500,25 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	}
 	fuse_put_request(fc, req);
 
+	if (err)
+		goto out;
+
+	/* we could have extended the file */
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		fuse_write_update_size(inode, offset + length);
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		truncate_pagecache_range(inode, offset, offset + length - 1);
+
+	fuse_invalidate_attr(inode);
+
+out:
+	if (lock_inode) {
+		if (mode & FALLOC_FL_PUNCH_HOLE)
+			fuse_release_nowrite(inode);
+		mutex_unlock(&inode->i_mutex);
+	}
+
 	return err;
 }
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e105a53fc72..fde7249a3a9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -44,6 +44,9 @@
     doing the mount will be allowed to access the filesystem */
 #define FUSE_ALLOW_OTHER         (1 << 1)
 
+/** Number of page pointers embedded in fuse_req */
+#define FUSE_REQ_INLINE_PAGES 1
+
 /** List of active connections */
 extern struct list_head fuse_conn_list;
 
@@ -103,6 +106,15 @@ struct fuse_inode {
 
 	/** List of writepage requestst (pending or sent) */
 	struct list_head writepages;
+
+	/** Miscellaneous bits describing inode state */
+	unsigned long state;
+};
+
+/** FUSE inode state bits */
+enum {
+	/** Advise readdirplus  */
+	FUSE_I_ADVISE_RDPLUS,
 };
 
 struct fuse_conn;
@@ -200,6 +212,12 @@ struct fuse_out {
 	struct fuse_arg args[3];
 };
 
+/** FUSE page descriptor */
+struct fuse_page_desc {
+	unsigned int length;
+	unsigned int offset;
+};
+
 /** The request state */
 enum fuse_req_state {
 	FUSE_REQ_INIT = 0,
@@ -210,6 +228,20 @@ enum fuse_req_state {
 	FUSE_REQ_FINISHED
 };
 
+/** The request IO state (for asynchronous processing) */
+struct fuse_io_priv {
+	int async;
+	spinlock_t lock;
+	unsigned reqs;
+	ssize_t bytes;
+	size_t size;
+	__u64 offset;
+	bool write;
+	int err;
+	struct kiocb *iocb;
+	struct file *file;
+};
+
 /**
  * A request to the client
  */
@@ -291,20 +323,32 @@ struct fuse_req {
 	} misc;
 
 	/** page vector */
-	struct page *pages[FUSE_MAX_PAGES_PER_REQ];
+	struct page **pages;
+
+	/** page-descriptor vector */
+	struct fuse_page_desc *page_descs;
+
+	/** size of the 'pages' array */
+	unsigned max_pages;
+
+	/** inline page vector */
+	struct page *inline_pages[FUSE_REQ_INLINE_PAGES];
+
+	/** inline page-descriptor vector */
+	struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES];
 
 	/** number of pages in vector */
 	unsigned num_pages;
 
-	/** offset of data on first page */
-	unsigned page_offset;
-
 	/** File used in the request (or NULL) */
 	struct fuse_file *ff;
 
 	/** Inode used in the request or NULL */
 	struct inode *inode;
 
+	/** AIO control block */
+	struct fuse_io_priv *io;
+
 	/** Link on fi->writepages */
 	struct list_head writepages_entry;
 
@@ -390,6 +434,10 @@ struct fuse_conn {
 	/** Batching of FORGET requests (positive indicates FORGET batch) */
 	int forget_batch;
 
+	/** Flag indicating that INIT reply has been received. Allocating
+	 * any fuse request will be suspended until the flag is set */
+	int initialized;
+
 	/** Flag indicating if connection is blocked.  This will be
 	    the case before the INIT reply is received, and if there
 	    are too many outstading backgrounds requests */
@@ -487,6 +535,15 @@ struct fuse_conn {
 	/** Use enhanced/automatic page cache invalidation. */
 	unsigned auto_inval_data:1;
 
+	/** Does the filesystem support readdirplus? */
+	unsigned do_readdirplus:1;
+
+	/** Does the filesystem want adaptive readdirplus? */
+	unsigned readdirplus_auto:1;
+
+	/** Does the filesystem support asynchronous direct-IO submission? */
+	unsigned async_dio:1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
@@ -578,6 +635,9 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
 
 struct fuse_forget_link *fuse_alloc_forget(void);
 
+/* Used by READDIRPLUS */
+void fuse_force_forget(struct file *file, u64 nodeid);
+
 /**
  * Initialize READ or READDIR request
  */
@@ -658,9 +718,9 @@ void fuse_ctl_cleanup(void);
 /**
  * Allocate a request
  */
-struct fuse_req *fuse_request_alloc(void);
+struct fuse_req *fuse_request_alloc(unsigned npages);
 
-struct fuse_req *fuse_request_alloc_nofs(void);
+struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
 
 /**
  * Free a request
@@ -668,14 +728,32 @@ struct fuse_req *fuse_request_alloc_nofs(void);
 void fuse_request_free(struct fuse_req *req);
 
 /**
- * Get a request, may fail with -ENOMEM
+ * Get a request, may fail with -ENOMEM,
+ * caller should specify # elements in req->pages[] explicitly
+ */
+struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages);
+struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
+					     unsigned npages);
+
+/*
+ * Increment reference count on request
+ */
+void __fuse_get_request(struct fuse_req *req);
+
+/**
+ * Get a request, may fail with -ENOMEM,
+ * useful for callers who doesn't use req->pages[]
  */
-struct fuse_req *fuse_get_req(struct fuse_conn *fc);
+static inline struct fuse_req *fuse_get_req_nopages(struct fuse_conn *fc)
+{
+	return fuse_get_req(fc, 0);
+}
 
 /**
  * Gets a requests for a file operation, always succeeds
  */
-struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file);
+struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
+					     struct file *file);
 
 /**
  * Decrement reference count of a request.  If count goes to zero free
@@ -739,9 +817,9 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc);
 int fuse_valid_type(int m);
 
 /**
- * Is task allowed to perform filesystem operation?
+ * Is current process allowed to perform filesystem operation?
  */
-int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task);
+int fuse_allow_current_process(struct fuse_conn *fc);
 
 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
 
@@ -776,8 +854,9 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
 
 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		 bool isdir);
-ssize_t fuse_direct_io(struct file *file, const char __user *buf,
-		       size_t count, loff_t *ppos, int write);
+ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
+		       unsigned long nr_segs, size_t count, loff_t *ppos,
+		       int write);
 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 		   unsigned int flags);
 long fuse_ioctl_common(struct file *file, unsigned int cmd,
@@ -787,4 +866,7 @@ int fuse_dev_release(struct inode *inode, struct file *file);
 
 void fuse_write_update_size(struct inode *inode, loff_t pos);
 
+int fuse_do_setattr(struct inode *inode, struct iattr *attr,
+		    struct file *file);
+
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 73ca6b72bea..0b578598c6a 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -92,6 +92,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	fi->attr_version = 0;
 	fi->writectr = 0;
 	fi->orig_ino = 0;
+	fi->state = 0;
 	INIT_LIST_HEAD(&fi->write_files);
 	INIT_LIST_HEAD(&fi->queued_writes);
 	INIT_LIST_HEAD(&fi->writepages);
@@ -345,6 +346,7 @@ static void fuse_send_destroy(struct fuse_conn *fc)
 		fc->destroy_req = NULL;
 		req->in.h.opcode = FUSE_DESTROY;
 		req->force = 1;
+		req->background = 0;
 		fuse_request_send(fc, req);
 		fuse_put_request(fc, req);
 	}
@@ -361,6 +363,7 @@ void fuse_conn_kill(struct fuse_conn *fc)
 	spin_lock(&fc->lock);
 	fc->connected = 0;
 	fc->blocked = 0;
+	fc->initialized = 1;
 	spin_unlock(&fc->lock);
 	/* Flush all readers on this fs */
 	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -408,12 +411,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct fuse_statfs_out outarg;
 	int err;
 
-	if (!fuse_allow_task(fc, current)) {
+	if (!fuse_allow_current_process(fc)) {
 		buf->f_type = FUSE_SUPER_MAGIC;
 		return 0;
 	}
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -580,7 +583,8 @@ void fuse_conn_init(struct fuse_conn *fc)
 	fc->khctr = 0;
 	fc->polled_files = RB_ROOT;
 	fc->reqctr = 0;
-	fc->blocked = 1;
+	fc->blocked = 0;
+	fc->initialized = 0;
 	fc->attr_version = 1;
 	get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
 }
@@ -678,7 +682,7 @@ static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len,
 
 	if (*max_len < len) {
 		*max_len = len;
-		return  255;
+		return  FILEID_INVALID;
 	}
 
 	nodeid = get_fuse_inode(inode)->nodeid;
@@ -781,7 +785,7 @@ static const struct super_operations fuse_super_operations = {
 static void sanitize_global_limit(unsigned *limit)
 {
 	if (*limit == 0)
-		*limit = ((num_physpages << PAGE_SHIFT) >> 13) /
+		*limit = ((totalram_pages << PAGE_SHIFT) >> 13) /
 			 sizeof(struct fuse_req);
 
 	if (*limit >= 1 << 16)
@@ -863,6 +867,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 				fc->dont_mask = 1;
 			if (arg->flags & FUSE_AUTO_INVAL_DATA)
 				fc->auto_inval_data = 1;
+			if (arg->flags & FUSE_DO_READDIRPLUS) {
+				fc->do_readdirplus = 1;
+				if (arg->flags & FUSE_READDIRPLUS_AUTO)
+					fc->readdirplus_auto = 1;
+			}
+			if (arg->flags & FUSE_ASYNC_DIO)
+				fc->async_dio = 1;
 		} else {
 			ra_pages = fc->max_read / PAGE_CACHE_SIZE;
 			fc->no_lock = 1;
@@ -875,7 +886,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 		fc->max_write = max_t(unsigned, 4096, fc->max_write);
 		fc->conn_init = 1;
 	}
-	fc->blocked = 0;
+	fc->initialized = 1;
 	wake_up_all(&fc->blocked_waitq);
 }
 
@@ -889,7 +900,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
 		FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
 		FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
-		FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA;
+		FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
+		FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
@@ -1034,12 +1046,13 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	/* only now - we want root dentry with NULL ->d_op */
 	sb->s_d_op = &fuse_dentry_operations;
 
-	init_req = fuse_request_alloc();
+	init_req = fuse_request_alloc(0);
 	if (!init_req)
 		goto err_put_root;
+	init_req->background = 1;
 
 	if (is_bdev) {
-		fc->destroy_req = fuse_request_alloc();
+		fc->destroy_req = fuse_request_alloc(0);
 		if (!fc->destroy_req)
 			goto err_free_init_req;
 	}
@@ -1111,6 +1124,7 @@ static struct file_system_type fuse_fs_type = {
 	.mount		= fuse_mount,
 	.kill_sb	= fuse_kill_sb_anon,
 };
+MODULE_ALIAS_FS("fuse");
 
 #ifdef CONFIG_BLOCK
 static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
@@ -1140,6 +1154,7 @@ static struct file_system_type fuseblk_fs_type = {
 	.kill_sb	= fuse_kill_sb_blk,
 	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
 };
+MODULE_ALIAS_FS("fuseblk");
 
 static inline int register_fuseblk(void)
 {
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index eb08c9e43c2..90c6a8faaec 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -20,13 +20,12 @@ config GFS2_FS
 	  be found here: http://sources.redhat.com/cluster
 
 	  The "nolock" lock module is now built in to GFS2 by default. If
-	  you want to use the DLM, be sure to enable HOTPLUG and IPv4/6
-	  networking.
+	  you want to use the DLM, be sure to enable IPv4/6 networking.
 
 config GFS2_FS_LOCKING_DLM
 	bool "GFS2 DLM locking"
 	depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \
-		HOTPLUG && DLM && CONFIGFS_FS && SYSFS
+		CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS)
 	help
 	  Multiple node locking module for GFS2
 
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index f850020ad90..f69ac0af549 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -237,7 +237,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,
 		return -EINVAL;
 	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
 		return value ? -EACCES : 0;
-	if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+	if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_FOWNER))
 		return -EPERM;
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 30de4f2a2ea..ee48ad37d9c 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -20,6 +20,7 @@
 #include <linux/swap.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/backing-dev.h>
+#include <linux/aio.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -51,7 +52,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
 			continue;
 		if (gfs2_is_jdata(ip))
 			set_buffer_uptodate(bh);
-		gfs2_trans_add_bh(ip->i_gl, bh, 0);
+		gfs2_trans_add_data(ip->i_gl, bh);
 	}
 }
 
@@ -109,7 +110,7 @@ static int gfs2_writepage_common(struct page *page,
 	/* Is the page fully outside i_size? (truncate in progress) */
 	offset = i_size & (PAGE_CACHE_SIZE-1);
 	if (page->index > end_index || (page->index == end_index && !offset)) {
-		page->mapping->a_ops->invalidatepage(page, 0);
+		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		goto out;
 	}
 	return 1;
@@ -230,16 +231,14 @@ out_ignore:
 }
 
 /**
- * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk
+ * gfs2_writepages - Write a bunch of dirty pages back to disk
  * @mapping: The mapping to write
  * @wbc: Write-back control
  *
- * For the data=writeback case we can already ignore buffer heads
- * and write whole extents at once. This is a big reduction in the
- * number of I/O requests we send and the bmap calls we make in this case.
+ * Used for both ordered and writeback modes.
  */
-static int gfs2_writeback_writepages(struct address_space *mapping,
-				     struct writeback_control *wbc)
+static int gfs2_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
 {
 	return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
 }
@@ -300,7 +299,8 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
 
 		/* Is the page fully outside i_size? (truncate in progress) */
 		if (page->index > end_index || (page->index == end_index && !offset)) {
-			page->mapping->a_ops->invalidatepage(page, 0);
+			page->mapping->a_ops->invalidatepage(page, 0,
+							     PAGE_CACHE_SIZE);
 			unlock_page(page);
 			continue;
 		}
@@ -852,7 +852,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 		goto failed;
 	}
 
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_trans_add_meta(ip->i_gl, dibh);
 
 	if (gfs2_is_stuffed(ip))
 		return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
@@ -944,27 +944,33 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
 	unlock_buffer(bh);
 }
 
-static void gfs2_invalidatepage(struct page *page, unsigned long offset)
+static void gfs2_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+	unsigned int stop = offset + length;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	struct buffer_head *bh, *head;
 	unsigned long pos = 0;
 
 	BUG_ON(!PageLocked(page));
-	if (offset == 0)
+	if (!partial_page)
 		ClearPageChecked(page);
 	if (!page_has_buffers(page))
 		goto out;
 
 	bh = head = page_buffers(page);
 	do {
+		if (pos + bh->b_size > stop)
+			return;
+
 		if (offset <= pos)
 			gfs2_discard(sdp, bh);
 		pos += bh->b_size;
 		bh = bh->b_this_page;
 	} while (bh != head);
 out:
-	if (offset == 0)
+	if (!partial_page)
 		try_to_release_page(page, 0);
 }
 
@@ -1057,7 +1063,7 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 		if (atomic_read(&bh->b_count))
 			goto cannot_release;
 		bd = bh->b_private;
-		if (bd && bd->bd_ail)
+		if (bd && bd->bd_tr)
 			goto cannot_release;
 		if (buffer_pinned(bh) || buffer_dirty(bh))
 			goto not_possible;
@@ -1102,7 +1108,7 @@ cannot_release:
 
 static const struct address_space_operations gfs2_writeback_aops = {
 	.writepage = gfs2_writeback_writepage,
-	.writepages = gfs2_writeback_writepages,
+	.writepages = gfs2_writepages,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
 	.write_begin = gfs2_write_begin,
@@ -1118,6 +1124,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
 
 static const struct address_space_operations gfs2_ordered_aops = {
 	.writepage = gfs2_ordered_writepage,
+	.writepages = gfs2_writepages,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
 	.write_begin = gfs2_write_begin,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index a68e91bcef3..5e2f56fccf6 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -22,6 +22,7 @@
 #include "meta_io.h"
 #include "quota.h"
 #include "rgrp.h"
+#include "log.h"
 #include "super.h"
 #include "trans.h"
 #include "dir.h"
@@ -93,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 	if (!gfs2_is_jdata(ip))
 		mark_buffer_dirty(bh);
 	if (!gfs2_is_writeback(ip))
-		gfs2_trans_add_bh(ip->i_gl, bh, 0);
+		gfs2_trans_add_data(ip->i_gl, bh);
 
 	if (release) {
 		unlock_page(page);
@@ -153,7 +154,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 
 	/*  Set up the pointer to the new block  */
 
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_trans_add_meta(ip->i_gl, dibh);
 	di = (struct gfs2_dinode *)dibh->b_data;
 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 
@@ -405,7 +406,7 @@ static inline __be64 *gfs2_indirect_init(struct metapath *mp,
 	BUG_ON(i < 1);
 	BUG_ON(mp->mp_bh[i] != NULL);
 	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
-	gfs2_trans_add_bh(gl, mp->mp_bh[i], 1);
+	gfs2_trans_add_meta(gl, mp->mp_bh[i]);
 	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
 	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
 	ptr += offset;
@@ -468,7 +469,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
 	BUG_ON(sheight < 1);
 	BUG_ON(dibh == NULL);
 
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_trans_add_meta(ip->i_gl, dibh);
 
 	if (height == sheight) {
 		struct buffer_head *bh;
@@ -544,7 +545,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
 		/* Branching from existing tree */
 		case ALLOC_GROW_DEPTH:
 			if (i > 1 && i < height)
-				gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1);
+				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
 			for (; i < height && n > 0; i++, n--)
 				gfs2_indirect_init(mp, ip->i_gl, i,
 						   mp->mp_list[i-1], bn++);
@@ -556,7 +557,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
 		case ALLOC_DATA:
 			BUG_ON(n > dblks);
 			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
-			gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1);
+			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
 			dblks = n;
 			ptr = metapointer(end_of_metadata, mp);
 			dblock = bn;
@@ -786,7 +787,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 		goto out_rlist;
 
 	if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
-		gfs2_rs_deltree(ip, ip->i_res);
+		gfs2_rs_deltree(ip->i_res);
 
 	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
 				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
@@ -796,8 +797,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 
 	down_write(&ip->i_rw_mutex);
 
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	gfs2_trans_add_meta(ip->i_gl, dibh);
+	gfs2_trans_add_meta(ip->i_gl, bh);
 
 	bstart = 0;
 	blen = 0;
@@ -981,7 +982,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
 	}
 
 	if (!gfs2_is_writeback(ip))
-		gfs2_trans_add_bh(ip->i_gl, bh, 0);
+		gfs2_trans_add_data(ip->i_gl, bh);
 
 	zero_user(page, offset, length);
 	mark_buffer_dirty(bh);
@@ -1046,7 +1047,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
 	if (error)
 		goto out;
 
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_trans_add_meta(ip->i_gl, dibh);
 
 	if (gfs2_is_stuffed(ip)) {
 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
@@ -1098,7 +1099,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
 	if (error)
 		return error;
 
-	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
 	if (error)
 		return error;
 
@@ -1137,11 +1138,12 @@ static int trunc_end(struct gfs2_inode *ip)
 		ip->i_height = 0;
 		ip->i_goal = ip->i_no_addr;
 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+		gfs2_ordered_del_inode(ip);
 	}
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
 
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_trans_add_meta(ip->i_gl, dibh);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 
@@ -1230,7 +1232,9 @@ static int do_grow(struct inode *inode, u64 size)
 		unstuff = 1;
 	}
 
-	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
+	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
+				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
+				  0 : RES_QUOTA), 0);
 	if (error)
 		goto do_grow_release;
 
@@ -1246,7 +1250,7 @@ static int do_grow(struct inode *inode, u64 size)
 
 	i_size_write(inode, size);
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_trans_add_meta(ip->i_gl, dibh);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 
@@ -1284,13 +1288,26 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
 	if (ret)
 		return ret;
 
+	ret = get_write_access(inode);
+	if (ret)
+		return ret;
+
 	inode_dio_wait(inode);
 
+	ret = gfs2_rs_alloc(GFS2_I(inode));
+	if (ret)
+		goto out;
+
 	oldsize = inode->i_size;
-	if (newsize >= oldsize)
-		return do_grow(inode, newsize);
+	if (newsize >= oldsize) {
+		ret = do_grow(inode, newsize);
+		goto out;
+	}
 
-	return do_shrink(inode, oldsize, newsize);
+	ret = do_shrink(inode, oldsize, newsize);
+out:
+	put_write_access(inode);
+	return ret;
 }
 
 int gfs2_truncatei_resume(struct gfs2_inode *ip)
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4fddb3c22d2..f2448ab2aac 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -109,8 +109,7 @@ fail:
 	return 0;
 }
 
-static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *str)
+static int gfs2_dhash(const struct dentry *dentry, struct qstr *str)
 {
 	str->hash = gfs2_disk_hash(str->name, str->len);
 	return 0;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 9a35670fdc3..0cb4c1557f2 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -93,7 +93,7 @@ int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
 	struct buffer_head *bh;
 
 	bh = gfs2_meta_new(ip->i_gl, block);
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	gfs2_trans_add_meta(ip->i_gl, bh);
 	gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
 	gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
 	*bhp = bh;
@@ -127,7 +127,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
 	if (error)
 		return error;
 
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_trans_add_meta(ip->i_gl, dibh);
 	memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
 	if (ip->i_inode.i_size < offset + size)
 		i_size_write(&ip->i_inode, offset + size);
@@ -209,7 +209,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
 		if (error)
 			goto fail;
 
-		gfs2_trans_add_bh(ip->i_gl, bh, 1);
+		gfs2_trans_add_meta(ip->i_gl, bh);
 		memcpy(bh->b_data + o, buf, amount);
 		brelse(bh);
 
@@ -231,7 +231,7 @@ out:
 		i_size_write(&ip->i_inode, offset + copied);
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_trans_add_meta(ip->i_gl, dibh);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 
@@ -354,22 +354,31 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
 		return ERR_PTR(-EIO);
 	}
 
-	hc = kmalloc(hsize, GFP_NOFS);
-	ret = -ENOMEM;
+	hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN);
+	if (hc == NULL)
+		hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL);
+
 	if (hc == NULL)
 		return ERR_PTR(-ENOMEM);
 
 	ret = gfs2_dir_read_data(ip, hc, hsize);
 	if (ret < 0) {
-		kfree(hc);
+		if (is_vmalloc_addr(hc))
+			vfree(hc);
+		else
+			kfree(hc);
 		return ERR_PTR(ret);
 	}
 
 	spin_lock(&inode->i_lock);
-	if (ip->i_hash_cache)
-		kfree(hc);
-	else
+	if (ip->i_hash_cache) {
+		if (is_vmalloc_addr(hc))
+			vfree(hc);
+		else
+			kfree(hc);
+	} else {
 		ip->i_hash_cache = hc;
+	}
 	spin_unlock(&inode->i_lock);
 
 	return ip->i_hash_cache;
@@ -385,7 +394,10 @@ void gfs2_dir_hash_inval(struct gfs2_inode *ip)
 {
 	__be64 *hc = ip->i_hash_cache;
 	ip->i_hash_cache = NULL;
-	kfree(hc);
+	if (is_vmalloc_addr(hc))
+		vfree(hc);
+	else
+		kfree(hc);
 }
 
 static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
@@ -647,7 +659,7 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
 		return;
 	}
 
-	gfs2_trans_add_bh(dip->i_gl, bh, 1);
+	gfs2_trans_add_meta(dip->i_gl, bh);
 
 	/* If there is no prev entry, this is the first entry in the block.
 	   The de_rec_len is already as big as it needs to be.  Just zero
@@ -690,7 +702,7 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
 		offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
 	totlen = be16_to_cpu(dent->de_rec_len);
 	BUG_ON(offset + name->len > totlen);
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	gfs2_trans_add_meta(ip->i_gl, bh);
 	ndent = (struct gfs2_dirent *)((char *)dent + offset);
 	dent->de_rec_len = cpu_to_be16(offset);
 	gfs2_qstr2dirent(name, totlen - offset, ndent);
@@ -831,7 +843,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
 		return NULL;
 
 	gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	gfs2_trans_add_meta(ip->i_gl, bh);
 	gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
 	leaf = (struct gfs2_leaf *)bh->b_data;
 	leaf->lf_depth = cpu_to_be16(depth);
@@ -916,7 +928,7 @@ static int dir_make_exhash(struct inode *inode)
 	/*  We're done with the new leaf block, now setup the new
 	    hash table.  */
 
-	gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+	gfs2_trans_add_meta(dip->i_gl, dibh);
 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 
 	lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
@@ -976,7 +988,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 		return 1; /* can't split */
 	}
 
-	gfs2_trans_add_bh(dip->i_gl, obh, 1);
+	gfs2_trans_add_meta(dip->i_gl, obh);
 
 	nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
 	if (!nleaf) {
@@ -1069,7 +1081,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 
 	error = gfs2_meta_inode_buffer(dip, &dibh);
 	if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
-		gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+		gfs2_trans_add_meta(dip->i_gl, dibh);
 		gfs2_add_inode_blocks(&dip->i_inode, 1);
 		gfs2_dinode_out(dip, dibh->b_data);
 		brelse(dibh);
@@ -1113,10 +1125,14 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 	if (IS_ERR(hc))
 		return PTR_ERR(hc);
 
-	h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS);
+	hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN);
+	if (hc2 == NULL)
+		hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL);
+
 	if (!hc2)
 		return -ENOMEM;
 
+	h = hc2;
 	error = gfs2_meta_inode_buffer(dip, &dibh);
 	if (error)
 		goto out_kfree;
@@ -1145,7 +1161,10 @@ fail:
 	gfs2_dinode_out(dip, dibh->b_data);
 	brelse(dibh);
 out_kfree:
-	kfree(hc2);
+	if (is_vmalloc_addr(hc2))
+		vfree(hc2);
+	else
+		kfree(hc2);
 	return error;
 }
 
@@ -1194,9 +1213,7 @@ static int compare_dents(const void *a, const void *b)
 /**
  * do_filldir_main - read out directory entries
  * @dip: The GFS2 inode
- * @offset: The offset in the file to read from
- * @opaque: opaque data to pass to filldir
- * @filldir: The function to pass entries to
+ * @ctx: what to feed the entries to
  * @darr: an array of struct gfs2_dirent pointers to read
  * @entries: the number of entries in darr
  * @copied: pointer to int that's non-zero if a entry has been copied out
@@ -1206,11 +1223,10 @@ static int compare_dents(const void *a, const void *b)
  * the possibility that they will fall into different readdir buffers or
  * that someone will want to seek to that location.
  *
- * Returns: errno, >0 on exception from filldir
+ * Returns: errno, >0 if the actor tells you to stop
  */
 
-static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
-			   void *opaque, filldir_t filldir,
+static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
 			   const struct gfs2_dirent **darr, u32 entries,
 			   int *copied)
 {
@@ -1218,7 +1234,6 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
 	u64 off, off_next;
 	unsigned int x, y;
 	int run = 0;
-	int error = 0;
 
 	sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
 
@@ -1235,9 +1250,9 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
 			off_next = be32_to_cpu(dent_next->de_hash);
 			off_next = gfs2_disk_hash2offset(off_next);
 
-			if (off < *offset)
+			if (off < ctx->pos)
 				continue;
-			*offset = off;
+			ctx->pos = off;
 
 			if (off_next == off) {
 				if (*copied && !run)
@@ -1246,26 +1261,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
 			} else
 				run = 0;
 		} else {
-			if (off < *offset)
+			if (off < ctx->pos)
 				continue;
-			*offset = off;
+			ctx->pos = off;
 		}
 
-		error = filldir(opaque, (const char *)(dent + 1),
+		if (!dir_emit(ctx, (const char *)(dent + 1),
 				be16_to_cpu(dent->de_name_len),
-				off, be64_to_cpu(dent->de_inum.no_addr),
-				be16_to_cpu(dent->de_type));
-		if (error)
+				be64_to_cpu(dent->de_inum.no_addr),
+				be16_to_cpu(dent->de_type)))
 			return 1;
 
 		*copied = 1;
 	}
 
-	/* Increment the *offset by one, so the next time we come into the
+	/* Increment the ctx->pos by one, so the next time we come into the
 	   do_filldir fxn, we get the next entry instead of the last one in the
 	   current leaf */
 
-	(*offset)++;
+	ctx->pos++;
 
 	return 0;
 }
@@ -1289,8 +1303,8 @@ static void gfs2_free_sort_buffer(void *ptr)
 		kfree(ptr);
 }
 
-static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
-			      filldir_t filldir, int *copied, unsigned *depth,
+static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
+			      int *copied, unsigned *depth,
 			      u64 leaf_no)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -1368,8 +1382,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
 	} while(lfn);
 
 	BUG_ON(entries2 != entries);
-	error = do_filldir_main(ip, offset, opaque, filldir, darr,
-				entries, copied);
+	error = do_filldir_main(ip, ctx, darr, entries, copied);
 out_free:
 	for(i = 0; i < leaf; i++)
 		brelse(larr[i]);
@@ -1428,15 +1441,13 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
 /**
  * dir_e_read - Reads the entries from a directory into a filldir buffer
  * @dip: dinode pointer
- * @offset: the hash of the last entry read shifted to the right once
- * @opaque: buffer for the filldir function to fill
- * @filldir: points to the filldir function to use
+ * @ctx: actor to feed the entries to
  *
  * Returns: errno
  */
 
-static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
-		      filldir_t filldir, struct file_ra_state *f_ra)
+static int dir_e_read(struct inode *inode, struct dir_context *ctx,
+		      struct file_ra_state *f_ra)
 {
 	struct gfs2_inode *dip = GFS2_I(inode);
 	u32 hsize, len = 0;
@@ -1447,7 +1458,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
 	unsigned depth = 0;
 
 	hsize = 1 << dip->i_depth;
-	hash = gfs2_dir_offset2hash(*offset);
+	hash = gfs2_dir_offset2hash(ctx->pos);
 	index = hash >> (32 - dip->i_depth);
 
 	if (dip->i_hash_cache == NULL)
@@ -1459,7 +1470,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
 	gfs2_dir_readahead(inode, hsize, index, f_ra);
 
 	while (index < hsize) {
-		error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
+		error = gfs2_dir_read_leaf(inode, ctx,
 					   &copied, &depth,
 					   be64_to_cpu(lp[index]));
 		if (error)
@@ -1474,8 +1485,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
 	return error;
 }
 
-int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
-		  filldir_t filldir, struct file_ra_state *f_ra)
+int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
+		  struct file_ra_state *f_ra)
 {
 	struct gfs2_inode *dip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1489,7 +1500,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 		return 0;
 
 	if (dip->i_diskflags & GFS2_DIF_EXHASH)
-		return dir_e_read(inode, offset, opaque, filldir, f_ra);
+		return dir_e_read(inode, ctx, f_ra);
 
 	if (!gfs2_is_stuffed(dip)) {
 		gfs2_consist_inode(dip);
@@ -1521,7 +1532,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 			error = -EIO;
 			goto out;
 		}
-		error = do_filldir_main(dip, offset, opaque, filldir, darr,
+		error = do_filldir_main(dip, ctx, darr,
 					dip->i_entries, &copied);
 out:
 		kfree(darr);
@@ -1537,9 +1548,9 @@ out:
 
 /**
  * gfs2_dir_search - Search a directory
- * @dip: The GFS2 inode
- * @filename:
- * @inode:
+ * @dip: The GFS2 dir inode
+ * @name: The name we are looking up
+ * @fail_on_exist: Fail if the name exists rather than looking it up
  *
  * This routine searches a directory for a file or another directory.
  * Assumes a glock is held on dip.
@@ -1547,22 +1558,25 @@ out:
  * Returns: errno
  */
 
-struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
+struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
+			      bool fail_on_exist)
 {
 	struct buffer_head *bh;
 	struct gfs2_dirent *dent;
-	struct inode *inode;
+	u64 addr, formal_ino;
+	u16 dtype;
 
 	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
 	if (dent) {
 		if (IS_ERR(dent))
 			return ERR_CAST(dent);
-		inode = gfs2_inode_lookup(dir->i_sb, 
-				be16_to_cpu(dent->de_type),
-				be64_to_cpu(dent->de_inum.no_addr),
-				be64_to_cpu(dent->de_inum.no_formal_ino), 0);
+		dtype = be16_to_cpu(dent->de_type);
+		addr = be64_to_cpu(dent->de_inum.no_addr);
+		formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
 		brelse(bh);
-		return inode;
+		if (fail_on_exist)
+			return ERR_PTR(-EEXIST);
+		return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
 	}
 	return ERR_PTR(-ENOENT);
 }
@@ -1622,7 +1636,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
 			return error;
 	} while(1);
 
-	gfs2_trans_add_bh(ip->i_gl, obh, 1);
+	gfs2_trans_add_meta(ip->i_gl, obh);
 
 	leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
 	if (!leaf) {
@@ -1636,7 +1650,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
 	error = gfs2_meta_inode_buffer(ip, &bh);
 	if (error)
 		return error;
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	gfs2_trans_add_meta(ip->i_gl, bh);
 	gfs2_add_inode_blocks(&ip->i_inode, 1);
 	gfs2_dinode_out(ip, bh->b_data);
 	brelse(bh);
@@ -1795,7 +1809,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
 	if (IS_ERR(dent))
 		return PTR_ERR(dent);
 
-	gfs2_trans_add_bh(dip->i_gl, bh, 1);
+	gfs2_trans_add_meta(dip->i_gl, bh);
 	gfs2_inum_out(nip, dent);
 	dent->de_type = cpu_to_be16(new_type);
 
@@ -1804,7 +1818,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
 		error = gfs2_meta_inode_buffer(dip, &bh);
 		if (error)
 			return error;
-		gfs2_trans_add_bh(dip->i_gl, bh, 1);
+		gfs2_trans_add_meta(dip->i_gl, bh);
 	}
 
 	dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
@@ -1846,10 +1860,12 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
 
 	ht = kzalloc(size, GFP_NOFS);
+	if (ht == NULL)
+		ht = vzalloc(size);
 	if (!ht)
 		return -ENOMEM;
 
-	error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	error = gfs2_quota_hold(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
 	if (error)
 		goto out;
 
@@ -1917,7 +1933,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 	if (error)
 		goto out_end_trans;
 
-	gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+	gfs2_trans_add_meta(dip->i_gl, dibh);
 	/* On the last dealloc, make this a regular file in case we crash.
 	   (We don't want to free these blocks a second time.)  */
 	if (last_dealloc)
@@ -1933,7 +1949,10 @@ out_rlist:
 	gfs2_rlist_free(&rlist);
 	gfs2_quota_unhold(dip);
 out:
-	kfree(ht);
+	if (is_vmalloc_addr(ht))
+		vfree(ht);
+	else
+		kfree(ht);
 	return error;
 }
 
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 98c960beab3..4f03bbd1873 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -18,14 +18,15 @@ struct gfs2_inode;
 struct gfs2_inum;
 
 extern struct inode *gfs2_dir_search(struct inode *dir,
-				     const struct qstr *filename);
+				     const struct qstr *filename,
+				     bool fail_on_exist);
 extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
 			  const struct gfs2_inode *ip);
 extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
 			const struct gfs2_inode *ip);
 extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
-extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
-			 filldir_t filldir, struct file_ra_state *f_ra);
+extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
+			 struct file_ra_state *f_ra);
 extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
 			  const struct gfs2_inode *nip, unsigned int new_type);
 
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 4767774a5f3..8b9b3775e2e 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -37,10 +37,10 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,
 
 	if (parent && (*len < GFS2_LARGE_FH_SIZE)) {
 		*len = GFS2_LARGE_FH_SIZE;
-		return 255;
+		return FILEID_INVALID;
 	} else if (*len < GFS2_SMALL_FH_SIZE) {
 		*len = GFS2_SMALL_FH_SIZE;
-		return 255;
+		return FILEID_INVALID;
 	}
 
 	fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
@@ -64,6 +64,7 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,
 }
 
 struct get_name_filldir {
+	struct dir_context ctx;
 	struct gfs2_inum_host inum;
 	char *name;
 };
@@ -88,9 +89,11 @@ static int gfs2_get_name(struct dentry *parent, char *name,
 	struct inode *dir = parent->d_inode;
 	struct inode *inode = child->d_inode;
 	struct gfs2_inode *dip, *ip;
-	struct get_name_filldir gnfd;
+	struct get_name_filldir gnfd = {
+		.ctx.actor = get_name_filldir,
+		.name = name
+	};
 	struct gfs2_holder gh;
-	u64 offset = 0;
 	int error;
 	struct file_ra_state f_ra = { .start = 0 };
 
@@ -106,13 +109,12 @@ static int gfs2_get_name(struct dentry *parent, char *name,
 	*name = 0;
 	gnfd.inum.no_addr = ip->i_no_addr;
 	gnfd.inum.no_formal_ino = ip->i_no_formal_ino;
-	gnfd.name = name;
 
 	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
 	if (error)
 		return error;
 
-	error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra);
+	error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra);
 
 	gfs2_glock_dq_uninit(&gh);
 
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 991ab2d484d..72c3866a732 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -25,6 +25,7 @@
 #include <asm/uaccess.h>
 #include <linux/dlm.h>
 #include <linux/dlm_plock.h>
+#include <linux/aio.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -81,35 +82,28 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence)
 }
 
 /**
- * gfs2_readdir - Read directory entries from a directory
+ * gfs2_readdir - Iterator for a directory
  * @file: The directory to read from
- * @dirent: Buffer for dirents
- * @filldir: Function used to do the copying
+ * @ctx: What to feed directory entries to
  *
  * Returns: errno
  */
 
-static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int gfs2_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *dir = file->f_mapping->host;
 	struct gfs2_inode *dip = GFS2_I(dir);
 	struct gfs2_holder d_gh;
-	u64 offset = file->f_pos;
 	int error;
 
-	gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
-	error = gfs2_glock_nq(&d_gh);
-	if (error) {
-		gfs2_holder_uninit(&d_gh);
+	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+	if (error)
 		return error;
-	}
 
-	error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra);
+	error = gfs2_dir_read(dir, ctx, &file->f_ra);
 
 	gfs2_glock_dq_uninit(&d_gh);
 
-	file->f_pos = offset;
-
 	return error;
 }
 
@@ -157,7 +151,7 @@ static const u32 gfs2_to_fsflags[32] = {
 
 static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
 	int error;
@@ -217,7 +211,7 @@ void gfs2_set_inode_flags(struct inode *inode)
  */
 static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct buffer_head *bh;
@@ -276,7 +270,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	error = gfs2_meta_inode_buffer(ip, &bh);
 	if (error)
 		goto out_trans_end;
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	gfs2_trans_add_meta(ip->i_gl, bh);
 	ip->i_diskflags = new_flags;
 	gfs2_dinode_out(ip, bh->b_data);
 	brelse(bh);
@@ -293,7 +287,7 @@ out_drop_write:
 
 static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	u32 fsflags, gfsflags;
 
 	if (get_user(fsflags, ptr))
@@ -336,7 +330,7 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size)
 {
-	struct inode *inode = filep->f_dentry->d_inode;
+	struct inode *inode = file_inode(filep);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift;
@@ -386,7 +380,7 @@ static int gfs2_allocate_page_backing(struct page *page)
 static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(vma->vm_file);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	unsigned long last_index;
@@ -401,16 +395,20 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	/* Update file times before taking page lock */
 	file_update_time(vma->vm_file);
 
+	ret = get_write_access(inode);
+	if (ret)
+		goto out;
+
 	ret = gfs2_rs_alloc(ip);
 	if (ret)
-		return ret;
+		goto out_write_access;
 
 	gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
 	if (ret)
-		goto out;
+		goto out_uninit;
 
 	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
 	set_bit(GIF_SW_PAGED, &ip->i_flags);
@@ -479,12 +477,15 @@ out_quota_unlock:
 	gfs2_quota_unlock(ip);
 out_unlock:
 	gfs2_glock_dq(&gh);
-out:
+out_uninit:
 	gfs2_holder_uninit(&gh);
 	if (ret == 0) {
 		set_page_dirty(page);
-		wait_on_page_writeback(page);
+		wait_for_stable_page(page);
 	}
+out_write_access:
+	put_write_access(inode);
+out:
 	sb_end_pagefault(inode->i_sb);
 	return block_page_mkwrite_return(ret);
 }
@@ -530,21 +531,30 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 }
 
 /**
- * gfs2_open - open a file
- * @inode: the inode to open
- * @file: the struct file for this opening
+ * gfs2_open_common - This is common to open and atomic_open
+ * @inode: The inode being opened
+ * @file: The file being opened
  *
- * Returns: errno
+ * This maybe called under a glock or not depending upon how it has
+ * been called. We must always be called under a glock for regular
+ * files, however. For other file types, it does not matter whether
+ * we hold the glock or not.
+ *
+ * Returns: Error code or 0 for success
  */
 
-static int gfs2_open(struct inode *inode, struct file *file)
+int gfs2_open_common(struct inode *inode, struct file *file)
 {
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_holder i_gh;
 	struct gfs2_file *fp;
-	int error;
+	int ret;
+
+	if (S_ISREG(inode->i_mode)) {
+		ret = generic_file_open(inode, file);
+		if (ret)
+			return ret;
+	}
 
-	fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
+	fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS);
 	if (!fp)
 		return -ENOMEM;
 
@@ -552,29 +562,43 @@ static int gfs2_open(struct inode *inode, struct file *file)
 
 	gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
 	file->private_data = fp;
+	return 0;
+}
+
+/**
+ * gfs2_open - open a file
+ * @inode: the inode to open
+ * @file: the struct file for this opening
+ *
+ * After atomic_open, this function is only used for opening files
+ * which are already cached. We must still get the glock for regular
+ * files to ensure that we have the file size uptodate for the large
+ * file check which is in the common code. That is only an issue for
+ * regular files though.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_open(struct inode *inode, struct file *file)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder i_gh;
+	int error;
+	bool need_unlock = false;
 
 	if (S_ISREG(ip->i_inode.i_mode)) {
 		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
 					   &i_gh);
 		if (error)
-			goto fail;
+			return error;
+		need_unlock = true;
+	}
 
-		if (!(file->f_flags & O_LARGEFILE) &&
-		    i_size_read(inode) > MAX_NON_LFS) {
-			error = -EOVERFLOW;
-			goto fail_gunlock;
-		}
+	error = gfs2_open_common(inode, file);
 
+	if (need_unlock)
 		gfs2_glock_dq_uninit(&i_gh);
-	}
-
-	return 0;
 
-fail_gunlock:
-	gfs2_glock_dq_uninit(&i_gh);
-fail:
-	file->private_data = NULL;
-	kfree(fp);
 	return error;
 }
 
@@ -593,10 +617,10 @@ static int gfs2_release(struct inode *inode, struct file *file)
 	kfree(file->private_data);
 	file->private_data = NULL;
 
-	if ((file->f_mode & FMODE_WRITE) &&
-	    (atomic_read(&inode->i_writecount) == 1))
-		gfs2_rs_delete(ip);
+	if (!(file->f_mode & FMODE_WRITE))
+		return 0;
 
+	gfs2_rs_delete(ip);
 	return 0;
 }
 
@@ -673,8 +697,7 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *file = iocb->ki_filp;
 	size_t writesize = iov_length(iov, nr_segs);
-	struct dentry *dentry = file->f_dentry;
-	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct gfs2_inode *ip = GFS2_I(file_inode(file));
 	int ret;
 
 	ret = gfs2_rs_alloc(ip);
@@ -709,7 +732,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
 	if (unlikely(error))
 		return error;
 
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_trans_add_meta(ip->i_gl, dibh);
 
 	if (gfs2_is_stuffed(ip)) {
 		error = gfs2_unstuff_dinode(ip, NULL);
@@ -772,7 +795,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
 static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 			   loff_t len)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
@@ -889,7 +912,7 @@ out_uninit:
  * cluster; until we do, disable leases (by just returning -EINVAL),
  * unless the administrator has requested purely local locking.
  *
- * Locking: called under lock_flocks
+ * Locking: called under i_lock
  *
  * Returns: errno
  */
@@ -924,8 +947,11 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 		cmd = F_SETLK;
 		fl->fl_type = F_UNLCK;
 	}
-	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+		if (fl->fl_type == F_UNLCK)
+			posix_lock_file_wait(file, fl);
 		return -EIO;
+	}
 	if (IS_GETLK(cmd))
 		return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
 	else if (fl->fl_type == F_UNLCK)
@@ -938,7 +964,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
 {
 	struct gfs2_file *fp = file->private_data;
 	struct gfs2_holder *fl_gh = &fp->f_fl_gh;
-	struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode);
+	struct gfs2_inode *ip = GFS2_I(file_inode(file));
 	struct gfs2_glock *gl;
 	unsigned int state;
 	int flags;
@@ -1038,7 +1064,7 @@ const struct file_operations gfs2_file_fops = {
 };
 
 const struct file_operations gfs2_dir_fops = {
-	.readdir	= gfs2_readdir,
+	.iterate	= gfs2_readdir,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.open		= gfs2_open,
 	.release	= gfs2_release,
@@ -1068,7 +1094,7 @@ const struct file_operations gfs2_file_fops_nolock = {
 };
 
 const struct file_operations gfs2_dir_fops_nolock = {
-	.readdir	= gfs2_readdir,
+	.iterate	= gfs2_readdir,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.open		= gfs2_open,
 	.release	= gfs2_release,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 992c5c0cb50..9435384562a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -30,6 +30,7 @@
 #include <linux/rculist_bl.h>
 #include <linux/bit_spinlock.h>
 #include <linux/percpu.h>
+#include <linux/list_sort.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -911,7 +912,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
  */
 
 static void handle_callback(struct gfs2_glock *gl, unsigned int state,
-			    unsigned long delay)
+			    unsigned long delay, bool remote)
 {
 	int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
 
@@ -924,8 +925,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
 		gl->gl_demote_state = LM_ST_UNLOCKED;
 	}
 	if (gl->gl_ops->go_callback)
-		gl->gl_ops->go_callback(gl);
-	trace_gfs2_demote_rq(gl);
+		gl->gl_ops->go_callback(gl, remote);
+	trace_gfs2_demote_rq(gl, remote);
 }
 
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
@@ -1016,11 +1017,11 @@ do_cancel:
 	return;
 
 trap_recursive:
-	print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip);
+	printk(KERN_ERR "original: %pSR\n", (void *)gh2->gh_ip);
 	printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
 	printk(KERN_ERR "lock type: %d req lock state : %d\n",
 	       gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
-	print_symbol(KERN_ERR "new: %s\n", gh->gh_ip);
+	printk(KERN_ERR "new: %pSR\n", (void *)gh->gh_ip);
 	printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
 	printk(KERN_ERR "lock type: %d req lock state : %d\n",
 	       gh->gh_gl->gl_name.ln_type, gh->gh_state);
@@ -1090,7 +1091,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 
 	spin_lock(&gl->gl_spin);
 	if (gh->gh_flags & GL_NOCACHE)
-		handle_callback(gl, LM_ST_UNLOCKED, 0);
+		handle_callback(gl, LM_ST_UNLOCKED, 0, false);
 
 	list_del_init(&gh->gh_list);
 	if (find_first_holder(gl) == NULL) {
@@ -1278,19 +1279,6 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
 		gfs2_glock_dq(&ghs[num_gh]);
 }
 
-/**
- * gfs2_glock_dq_uninit_m - release multiple glocks
- * @num_gh: the number of structures
- * @ghs: an array of struct gfs2_holder structures
- *
- */
-
-void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
-{
-	while (num_gh--)
-		gfs2_glock_dq_uninit(&ghs[num_gh]);
-}
-
 void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 {
 	unsigned long delay = 0;
@@ -1308,7 +1296,7 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 	}
 
 	spin_lock(&gl->gl_spin);
-	handle_callback(gl, state, delay);
+	handle_callback(gl, state, delay, true);
 	spin_unlock(&gl->gl_spin);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
 		gfs2_glock_put(gl);
@@ -1376,56 +1364,105 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 		gfs2_glock_put(gl);
 }
 
+static int glock_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct gfs2_glock *gla, *glb;
 
-static int gfs2_shrink_glock_memory(struct shrinker *shrink,
-				    struct shrink_control *sc)
+	gla = list_entry(a, struct gfs2_glock, gl_lru);
+	glb = list_entry(b, struct gfs2_glock, gl_lru);
+
+	if (gla->gl_name.ln_number > glb->gl_name.ln_number)
+		return 1;
+	if (gla->gl_name.ln_number < glb->gl_name.ln_number)
+		return -1;
+
+	return 0;
+}
+
+/**
+ * gfs2_dispose_glock_lru - Demote a list of glocks
+ * @list: The list to dispose of
+ *
+ * Disposing of glocks may involve disk accesses, so that here we sort
+ * the glocks by number (i.e. disk location of the inodes) so that if
+ * there are any such accesses, they'll be sent in order (mostly).
+ *
+ * Must be called under the lru_lock, but may drop and retake this
+ * lock. While the lru_lock is dropped, entries may vanish from the
+ * list, but no new entries will appear on the list (since it is
+ * private)
+ */
+
+static void gfs2_dispose_glock_lru(struct list_head *list)
+__releases(&lru_lock)
+__acquires(&lru_lock)
 {
 	struct gfs2_glock *gl;
-	int may_demote;
-	int nr_skipped = 0;
-	int nr = sc->nr_to_scan;
-	gfp_t gfp_mask = sc->gfp_mask;
-	LIST_HEAD(skipped);
 
-	if (nr == 0)
-		goto out;
+	list_sort(NULL, list, glock_cmp);
 
-	if (!(gfp_mask & __GFP_FS))
-		return -1;
+	while(!list_empty(list)) {
+		gl = list_entry(list->next, struct gfs2_glock, gl_lru);
+		list_del_init(&gl->gl_lru);
+		clear_bit(GLF_LRU, &gl->gl_flags);
+		gfs2_glock_hold(gl);
+		spin_unlock(&lru_lock);
+		spin_lock(&gl->gl_spin);
+		if (demote_ok(gl))
+			handle_callback(gl, LM_ST_UNLOCKED, 0, false);
+		WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
+		smp_mb__after_clear_bit();
+		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+			gfs2_glock_put_nolock(gl);
+		spin_unlock(&gl->gl_spin);
+		spin_lock(&lru_lock);
+	}
+}
+
+/**
+ * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote
+ * @nr: The number of entries to scan
+ *
+ * This function selects the entries on the LRU which are able to
+ * be demoted, and then kicks off the process by calling
+ * gfs2_dispose_glock_lru() above.
+ */
+
+static void gfs2_scan_glock_lru(int nr)
+{
+	struct gfs2_glock *gl;
+	LIST_HEAD(skipped);
+	LIST_HEAD(dispose);
 
 	spin_lock(&lru_lock);
 	while(nr && !list_empty(&lru_list)) {
 		gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
-		list_del_init(&gl->gl_lru);
-		clear_bit(GLF_LRU, &gl->gl_flags);
-		atomic_dec(&lru_count);
 
 		/* Test for being demotable */
 		if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
-			gfs2_glock_hold(gl);
-			spin_unlock(&lru_lock);
-			spin_lock(&gl->gl_spin);
-			may_demote = demote_ok(gl);
-			if (may_demote) {
-				handle_callback(gl, LM_ST_UNLOCKED, 0);
-				nr--;
-			}
-			clear_bit(GLF_LOCK, &gl->gl_flags);
-			smp_mb__after_clear_bit();
-			if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-				gfs2_glock_put_nolock(gl);
-			spin_unlock(&gl->gl_spin);
-			spin_lock(&lru_lock);
+			list_move(&gl->gl_lru, &dispose);
+			atomic_dec(&lru_count);
+			nr--;
 			continue;
 		}
-		nr_skipped++;
-		list_add(&gl->gl_lru, &skipped);
-		set_bit(GLF_LRU, &gl->gl_flags);
+
+		list_move(&gl->gl_lru, &skipped);
 	}
 	list_splice(&skipped, &lru_list);
-	atomic_add(nr_skipped, &lru_count);
+	if (!list_empty(&dispose))
+		gfs2_dispose_glock_lru(&dispose);
 	spin_unlock(&lru_lock);
-out:
+}
+
+static int gfs2_shrink_glock_memory(struct shrinker *shrink,
+				    struct shrink_control *sc)
+{
+	if (sc->nr_to_scan) {
+		if (!(sc->gfp_mask & __GFP_FS))
+			return -1;
+		gfs2_scan_glock_lru(sc->nr_to_scan);
+	}
+
 	return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
 }
 
@@ -1497,7 +1534,7 @@ static void clear_glock(struct gfs2_glock *gl)
 
 	spin_lock(&gl->gl_spin);
 	if (gl->gl_state != LM_ST_UNLOCKED)
-		handle_callback(gl, LM_ST_UNLOCKED, 0);
+		handle_callback(gl, LM_ST_UNLOCKED, 0, false);
 	spin_unlock(&gl->gl_spin);
 	gfs2_glock_hold(gl);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
@@ -1540,6 +1577,7 @@ static void dump_glock_func(struct gfs2_glock *gl)
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
 {
 	set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
+	flush_workqueue(glock_workqueue);
 	glock_hash_walk(clear_glock, sdp);
 	flush_workqueue(glock_workqueue);
 	wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index fd580b7861d..69f66e3d22b 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -201,7 +201,6 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
 			     struct gfs2_holder *gh);
 extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
-extern void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
 extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
 #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
 extern __printf(2, 3)
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 78d4184ffc7..5f2e5224c51 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -54,7 +54,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 	struct gfs2_bufdata *bd, *tmp;
 	struct buffer_head *bh;
 	const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock);
-	sector_t blocknr;
 
 	gfs2_log_lock(sdp);
 	spin_lock(&sdp->sd_ail_lock);
@@ -65,13 +64,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 				continue;
 			gfs2_ail_error(gl, bh);
 		}
-		blocknr = bh->b_blocknr;
-		bh->b_private = NULL;
-		gfs2_remove_from_ail(bd); /* drops ref on bh */
-
-		bd->bd_bh = NULL;
-		bd->bd_blkno = blocknr;
-
 		gfs2_trans_add_revoke(sdp, bd);
 	}
 	GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count));
@@ -322,8 +314,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 		break;
 	};
 
-	ip->i_inode.i_uid = be32_to_cpu(str->di_uid);
-	ip->i_inode.i_gid = be32_to_cpu(str->di_gid);
+	i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid));
+	i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid));
 	gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
 	i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
 	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
@@ -515,12 +507,12 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
  *
  * gl_spin lock is held while calling this
  */
-static void iopen_go_callback(struct gfs2_glock *gl)
+static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
 {
 	struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 
-	if (sdp->sd_vfs->s_flags & MS_RDONLY)
+	if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY))
 		return;
 
 	if (gl->gl_demote_state == LM_ST_UNLOCKED &&
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index c373a24fedd..26aabd7caba 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -31,7 +31,6 @@ struct gfs2_holder;
 struct gfs2_glock;
 struct gfs2_quota_data;
 struct gfs2_trans;
-struct gfs2_ail;
 struct gfs2_jdesc;
 struct gfs2_sbd;
 struct lm_lockops;
@@ -52,9 +51,8 @@ struct gfs2_log_header_host {
  */
 
 struct gfs2_log_operations {
-	void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
 	void (*lo_before_commit) (struct gfs2_sbd *sdp);
-	void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
+	void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
 	void (*lo_before_scan) (struct gfs2_jdesc *jd,
 				struct gfs2_log_header_host *head, int pass);
 	int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
@@ -140,7 +138,7 @@ struct gfs2_bufdata {
 	struct list_head bd_list;
 	const struct gfs2_log_operations *bd_ops;
 
-	struct gfs2_ail *bd_ail;
+	struct gfs2_trans *bd_tr;
 	struct list_head bd_ail_st_list;
 	struct list_head bd_ail_gl_list;
 };
@@ -212,7 +210,7 @@ struct gfs2_glock_operations {
 	int (*go_lock) (struct gfs2_holder *gh);
 	void (*go_unlock) (struct gfs2_holder *gh);
 	int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
-	void (*go_callback) (struct gfs2_glock *gl);
+	void (*go_callback)(struct gfs2_glock *gl, bool remote);
 	const int go_type;
 	const unsigned long go_flags;
 #define GLOF_ASPACE 1
@@ -341,6 +339,7 @@ enum {
 	GIF_QD_LOCKED		= 1,
 	GIF_ALLOC_FAILED	= 2,
 	GIF_SW_PAGED		= 3,
+	GIF_ORDERED		= 4,
 };
 
 struct gfs2_inode {
@@ -357,6 +356,7 @@ struct gfs2_inode {
 	struct gfs2_rgrpd *i_rgd;
 	u64 i_goal;	/* goal block for allocations */
 	struct rw_semaphore i_rw_mutex;
+	struct list_head i_ordered;
 	struct list_head i_trunc_list;
 	__be64 *i_hash_cache;
 	u32 i_entries;
@@ -391,7 +391,6 @@ struct gfs2_revoke_replay {
 };
 
 enum {
-	QDF_USER		= 0,
 	QDF_CHANGE		= 1,
 	QDF_LOCKED		= 2,
 	QDF_REFRESH		= 3,
@@ -403,7 +402,7 @@ struct gfs2_quota_data {
 
 	atomic_t qd_count;
 
-	u32 qd_id;
+	struct kqid qd_id;
 	unsigned long qd_flags;		/* QDF_... */
 
 	s64 qd_change;
@@ -433,6 +432,7 @@ struct gfs2_trans {
 	struct gfs2_holder tr_t_gh;
 
 	int tr_touched;
+	int tr_attached;
 
 	unsigned int tr_num_buf_new;
 	unsigned int tr_num_databuf_new;
@@ -440,14 +440,12 @@ struct gfs2_trans {
 	unsigned int tr_num_databuf_rm;
 	unsigned int tr_num_revoke;
 	unsigned int tr_num_revoke_rm;
-};
 
-struct gfs2_ail {
-	struct list_head ai_list;
+	struct list_head tr_list;
 
-	unsigned int ai_first;
-	struct list_head ai_ail1_list;
-	struct list_head ai_ail2_list;
+	unsigned int tr_first;
+	struct list_head tr_ail1_list;
+	struct list_head tr_ail2_list;
 };
 
 struct gfs2_journal_extent {
@@ -588,6 +586,7 @@ struct lm_lockstruct {
 	struct dlm_lksb ls_control_lksb; /* control_lock */
 	char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */
 	struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */
+	char *ls_lvb_bits;
 
 	spinlock_t ls_recover_spin; /* protects following fields */
 	unsigned long ls_recover_flags; /* DFL_ */
@@ -641,6 +640,7 @@ struct gfs2_sbd {
 	wait_queue_head_t sd_glock_wait;
 	atomic_t sd_glock_disposal;
 	struct completion sd_locking_init;
+	struct completion sd_wdack;
 	struct delayed_work sd_control_work;
 
 	/* Inode Stuff */
@@ -708,6 +708,7 @@ struct gfs2_sbd {
 
 	spinlock_t sd_log_lock;
 
+	struct gfs2_trans *sd_log_tr;
 	unsigned int sd_log_blks_reserved;
 	unsigned int sd_log_commited_buf;
 	unsigned int sd_log_commited_databuf;
@@ -723,6 +724,7 @@ struct gfs2_sbd {
 	struct list_head sd_log_le_revoke;
 	struct list_head sd_log_le_databuf;
 	struct list_head sd_log_le_ordered;
+	spinlock_t sd_ordered_lock;
 
 	atomic_t sd_log_thresh1;
 	atomic_t sd_log_thresh2;
@@ -758,10 +760,7 @@ struct gfs2_sbd {
 	unsigned int sd_replayed_blocks;
 
 	/* For quiescing the filesystem */
-
 	struct gfs2_holder sd_freeze_gh;
-	struct mutex sd_freeze_lock;
-	unsigned int sd_freeze_count;
 
 	char sd_fsname[GFS2_FSNAME_LEN];
 	char sd_table_name[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2b6f5698ef1..bbb2715171c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -189,6 +189,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 	return inode;
 
 fail_refresh:
+	ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 	ip->i_iopen_gh.gh_gl->gl_object = NULL;
 	gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 fail_iopen:
@@ -312,7 +313,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 			goto out;
 	}
 
-	inode = gfs2_dir_search(dir, name);
+	inode = gfs2_dir_search(dir, name, false);
 	if (IS_ERR(inode))
 		error = PTR_ERR(inode);
 out:
@@ -345,17 +346,6 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 	if (!dip->i_inode.i_nlink)
 		return -ENOENT;
 
-	error = gfs2_dir_check(&dip->i_inode, name, NULL);
-	switch (error) {
-	case -ENOENT:
-		error = 0;
-		break;
-	case 0:
-		return -EEXIST;
-	default:
-		return error;
-	}
-
 	if (dip->i_entries == (u32)-1)
 		return -EFBIG;
 	if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
@@ -368,10 +358,11 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip,
 			       struct inode *inode)
 {
 	if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
-	    (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) {
+	    (dip->i_inode.i_mode & S_ISUID) &&
+	    !uid_eq(dip->i_inode.i_uid, GLOBAL_ROOT_UID)) {
 		if (S_ISDIR(inode->i_mode))
 			inode->i_mode |= S_ISUID;
-		else if (dip->i_inode.i_uid != current_fsuid())
+		else if (!uid_eq(dip->i_inode.i_uid, current_fsuid()))
 			inode->i_mode &= ~07111;
 		inode->i_uid = dip->i_inode.i_uid;
 	} else
@@ -391,11 +382,15 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
 	int error;
 	int dblocks = 1;
 
-	error = gfs2_inplace_reserve(ip, RES_DINODE, flags);
+	error = gfs2_quota_lock_check(ip);
 	if (error)
 		goto out;
 
-	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0);
+	error = gfs2_inplace_reserve(ip, RES_DINODE, flags);
+	if (error)
+		goto out_quota;
+
+	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 0);
 	if (error)
 		goto out_ipreserv;
 
@@ -408,6 +403,8 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
 
 out_ipreserv:
 	gfs2_inplace_release(ip);
+out_quota:
+	gfs2_quota_unlock(ip);
 out:
 	return error;
 }
@@ -439,59 +436,27 @@ static void gfs2_init_dir(struct buffer_head *dibh,
  */
 
 static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
-			const char *symname, struct buffer_head **bhp)
+			const char *symname)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_dinode *di;
 	struct buffer_head *dibh;
-	struct timespec tv = CURRENT_TIME;
 
 	dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr);
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
-	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+	gfs2_trans_add_meta(ip->i_gl, dibh);
 	di = (struct gfs2_dinode *)dibh->b_data;
+	gfs2_dinode_out(ip, di);
 
-	di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
-	di->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
-	di->di_mode = cpu_to_be32(ip->i_inode.i_mode);
-	di->di_uid = cpu_to_be32(ip->i_inode.i_uid);
-	di->di_gid = cpu_to_be32(ip->i_inode.i_gid);
-	di->di_nlink = 0;
-	di->di_size = cpu_to_be64(ip->i_inode.i_size);
-	di->di_blocks = cpu_to_be64(1);
-	di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec);
 	di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev));
 	di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev));
-	di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_no_addr);
-	di->di_generation = cpu_to_be64(ip->i_generation);
-	di->di_flags = 0;
 	di->__pad1 = 0;
-	di->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) ? GFS2_FORMAT_DE : 0);
-	di->di_height = 0;
 	di->__pad2 = 0;
 	di->__pad3 = 0;
-	di->di_depth = 0;
-	di->di_entries = 0;
 	memset(&di->__pad4, 0, sizeof(di->__pad4));
-	di->di_eattr = 0;
-	di->di_atime_nsec = cpu_to_be32(tv.tv_nsec);
-	di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec);
-	di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec);
 	memset(&di->di_reserved, 0, sizeof(di->di_reserved));
+	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 
 	switch(ip->i_inode.i_mode & S_IFMT) {
-	case S_IFREG:
-		if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
-		    gfs2_tune_get(sdp, gt_new_files_jdata))
-			di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
-		break;
 	case S_IFDIR:
-		di->di_flags |= cpu_to_be32(dip->i_diskflags &
-					    GFS2_DIF_INHERIT_JDATA);
-		di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
-		di->di_size = cpu_to_be64(sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
-		di->di_entries = cpu_to_be32(2);
 		gfs2_init_dir(dibh, dip);
 		break;
 	case S_IFLNK:
@@ -500,63 +465,17 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
 	}
 
 	set_buffer_uptodate(dibh);
-
-	*bhp = dibh;
-}
-
-static int make_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
-		       const char *symname, struct buffer_head **bhp)
-{
-	struct inode *inode = &ip->i_inode;
-	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
-	int error;
-
-	error = gfs2_rindex_update(sdp);
-	if (error)
-		return error;
-
-	error = gfs2_quota_lock(dip, inode->i_uid, inode->i_gid);
-	if (error)
-		return error;
-
-	error = gfs2_quota_check(dip, inode->i_uid, inode->i_gid);
-	if (error)
-		goto out_quota;
-
-	error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0);
-	if (error)
-		goto out_quota;
-
-	init_dinode(dip, ip, symname, bhp);
-	gfs2_quota_change(dip, +1, inode->i_uid, inode->i_gid);
-	gfs2_trans_end(sdp);
-
-out_quota:
-	gfs2_quota_unlock(dip);
-	return error;
+	brelse(dibh);
 }
 
 static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
-		       struct gfs2_inode *ip)
+		       struct gfs2_inode *ip, int arq)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
-	int alloc_required;
-	struct buffer_head *dibh;
 	int error;
 
-	error = gfs2_rindex_update(sdp);
-	if (error)
-		return error;
-
-	error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
-	if (error)
-		goto fail;
-
-	error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
-	if (alloc_required < 0)
-		goto fail_quota_locks;
-	if (alloc_required) {
-		error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid);
+	if (arq) {
+		error = gfs2_quota_lock_check(dip);
 		if (error)
 			goto fail_quota_locks;
 
@@ -580,26 +499,12 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 	if (error)
 		goto fail_end_trans;
 
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		goto fail_end_trans;
-	set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1);
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-	gfs2_dinode_out(ip, dibh->b_data);
-	brelse(dibh);
-	return 0;
-
 fail_end_trans:
 	gfs2_trans_end(sdp);
-
 fail_ipreserv:
-	if (alloc_required)
-		gfs2_inplace_release(dip);
-
+	gfs2_inplace_release(dip);
 fail_quota_locks:
 	gfs2_quota_unlock(dip);
-
-fail:
 	return error;
 }
 
@@ -630,6 +535,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
  * gfs2_create_inode - Create a new inode
  * @dir: The parent directory
  * @dentry: The new dentry
+ * @file: If non-NULL, the file which is being opened
  * @mode: The permissions on the new inode
  * @dev: For device nodes, this is the device number
  * @symname: For symlinks, this is the link destination
@@ -639,8 +545,9 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
  */
 
 static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
+			     struct file *file,
 			     umode_t mode, dev_t dev, const char *symname,
-			     unsigned int size, int excl)
+			     unsigned int size, int excl, int *opened)
 {
 	const struct qstr *name = &dentry->d_name;
 	struct gfs2_holder ghs[2];
@@ -648,9 +555,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	struct gfs2_inode *dip = GFS2_I(dir), *ip;
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_glock *io_gl;
+	struct dentry *d;
 	int error;
-	struct buffer_head *bh = NULL;
 	u32 aflags = 0;
+	int arq;
 
 	if (!name->len || name->len > GFS2_FNAMESIZE)
 		return -ENAMETOOLONG;
@@ -659,36 +567,81 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	if (error)
 		return error;
 
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
 	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
 	if (error)
 		goto fail;
 
 	error = create_ok(dip, name, mode);
-	if ((error == -EEXIST) && S_ISREG(mode) && !excl) {
-		inode = gfs2_lookupi(dir, &dentry->d_name, 0);
-		gfs2_glock_dq_uninit(ghs);
-		d_instantiate(dentry, inode);
-		return IS_ERR(inode) ? PTR_ERR(inode) : 0;
-	}
 	if (error)
 		goto fail_gunlock;
 
-	inode = new_inode(sdp->sd_vfs);
-	if (!inode) {
+	inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
+	error = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		d = d_splice_alias(inode, dentry);
+		error = 0;
+		if (file && !IS_ERR(d)) {
+			if (d == NULL)
+				d = dentry;
+			if (S_ISREG(inode->i_mode))
+				error = finish_open(file, d, gfs2_open_common, opened);
+			else
+				error = finish_no_open(file, d);
+		}
 		gfs2_glock_dq_uninit(ghs);
-		return -ENOMEM;
+		if (IS_ERR(d))
+			return PTR_RET(d);
+		return error;
+	} else if (error != -ENOENT) {
+		goto fail_gunlock;
 	}
+
+	arq = error = gfs2_diradd_alloc_required(dir, name);
+	if (error < 0)
+		goto fail_gunlock;
+
+	inode = new_inode(sdp->sd_vfs);
+	error = -ENOMEM;
+	if (!inode)
+		goto fail_gunlock;
+
 	ip = GFS2_I(inode);
 	error = gfs2_rs_alloc(ip);
 	if (error)
 		goto fail_free_inode;
 
-	set_bit(GIF_INVALID, &ip->i_flags);
 	inode->i_mode = mode;
+	set_nlink(inode, S_ISDIR(mode) ? 2 : 1);
 	inode->i_rdev = dev;
 	inode->i_size = size;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	gfs2_set_inode_blocks(inode, 1);
 	munge_mode_uid_gid(dip, inode);
 	ip->i_goal = dip->i_goal;
+	ip->i_diskflags = 0;
+	ip->i_eattr = 0;
+	ip->i_height = 0;
+	ip->i_depth = 0;
+	ip->i_entries = 0;
+
+	switch(mode & S_IFMT) {
+	case S_IFREG:
+		if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
+		    gfs2_tune_get(sdp, gt_new_files_jdata))
+			ip->i_diskflags |= GFS2_DIF_JDATA;
+		gfs2_set_aops(inode);
+		break;
+	case S_IFDIR:
+		ip->i_diskflags |= (dip->i_diskflags & GFS2_DIF_INHERIT_JDATA);
+		ip->i_diskflags |= GFS2_DIF_JDATA;
+		ip->i_entries = 2;
+		break;
+	}
+	gfs2_set_inode_flags(inode);
 
 	if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) ||
 	    (dip->i_diskflags & GFS2_DIF_TOPDIR))
@@ -707,10 +660,13 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	if (error)
 		goto fail_free_inode;
 
-	error = make_dinode(dip, ip, symname, &bh);
+	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
 	if (error)
 		goto fail_gunlock2;
 
+	init_dinode(dip, ip, symname);
+	gfs2_trans_end(sdp);
+
 	error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
 	if (error)
 		goto fail_gunlock2;
@@ -724,10 +680,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	gfs2_set_iop(inode);
 	insert_inode_hash(inode);
 
-	error = gfs2_inode_refresh(ip);
-	if (error)
-		goto fail_gunlock3;
-
 	error = gfs2_acl_create(dip, inode);
 	if (error)
 		goto fail_gunlock3;
@@ -736,20 +688,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	if (error)
 		goto fail_gunlock3;
 
-	error = link_dinode(dip, name, ip);
+	error = link_dinode(dip, name, ip, arq);
 	if (error)
 		goto fail_gunlock3;
 
-	if (bh)
-		brelse(bh);
-
-	gfs2_trans_end(sdp);
-	gfs2_inplace_release(dip);
-	gfs2_quota_unlock(dip);
 	mark_inode_dirty(inode);
-	gfs2_glock_dq_uninit_m(2, ghs);
 	d_instantiate(dentry, inode);
-	return 0;
+	if (file)
+		error = finish_open(file, dentry, gfs2_open_common, opened);
+	gfs2_glock_dq_uninit(ghs);
+	gfs2_glock_dq_uninit(ghs + 1);
+	return error;
 
 fail_gunlock3:
 	gfs2_glock_dq_uninit(ghs + 1);
@@ -768,12 +717,12 @@ fail_free_inode:
 fail_gunlock:
 	gfs2_glock_dq_uninit(ghs);
 	if (inode && !IS_ERR(inode)) {
+		clear_nlink(inode);
+		mark_inode_dirty(inode);
 		set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags);
 		iput(inode);
 	}
 fail:
-	if (bh)
-		brelse(bh);
 	return error;
 }
 
@@ -789,36 +738,56 @@ fail:
 static int gfs2_create(struct inode *dir, struct dentry *dentry,
 		       umode_t mode, bool excl)
 {
-	return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl);
+	return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL);
 }
 
 /**
- * gfs2_lookup - Look up a filename in a directory and return its inode
+ * __gfs2_lookup - Look up a filename in a directory and return its inode
  * @dir: The directory inode
  * @dentry: The dentry of the new inode
- * @nd: passed from Linux VFS, ignored by us
+ * @file: File to be opened
+ * @opened: atomic_open flags
  *
- * Called by the VFS layer. Lock dir and call gfs2_lookupi()
  *
  * Returns: errno
  */
 
-static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
-				  unsigned int flags)
+static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
+				    struct file *file, int *opened)
 {
-	struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0);
-	if (inode && !IS_ERR(inode)) {
-		struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
-		struct gfs2_holder gh;
-		int error;
-		error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
-		if (error) {
-			iput(inode);
-			return ERR_PTR(error);
-		}
-		gfs2_glock_dq_uninit(&gh);
+	struct inode *inode;
+	struct dentry *d;
+	struct gfs2_holder gh;
+	struct gfs2_glock *gl;
+	int error;
+
+	inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+	if (!inode)
+		return NULL;
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	gl = GFS2_I(inode)->i_gl;
+	error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+	if (error) {
+		iput(inode);
+		return ERR_PTR(error);
 	}
-	return d_splice_alias(inode, dentry);
+
+	d = d_splice_alias(inode, dentry);
+	if (file && S_ISREG(inode->i_mode))
+		error = finish_open(file, dentry, gfs2_open_common, opened);
+
+	gfs2_glock_dq_uninit(&gh);
+	if (error)
+		return ERR_PTR(error);
+	return d;
+}
+
+static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
+				  unsigned flags)
+{
+	return __gfs2_lookup(dir, dentry, NULL, NULL);
 }
 
 /**
@@ -931,7 +900,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	if (error)
 		goto out_brelse;
 
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_trans_add_meta(ip->i_gl, dibh);
 	inc_nlink(&ip->i_inode);
 	ip->i_inode.i_ctime = CURRENT_TIME;
 	ihold(inode);
@@ -978,8 +947,8 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 		return -EPERM;
 
 	if ((dip->i_inode.i_mode & S_ISVTX) &&
-	    dip->i_inode.i_uid != current_fsuid() &&
-	    ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
+	    !uid_eq(dip->i_inode.i_uid, current_fsuid()) &&
+	    !uid_eq(ip->i_inode.i_uid, current_fsuid()) && !capable(CAP_FOWNER))
 		return -EPERM;
 
 	if (IS_APPEND(&dip->i_inode))
@@ -1136,7 +1105,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
 		return -ENAMETOOLONG;
 
-	return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0);
+	return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL);
 }
 
 /**
@@ -1150,7 +1119,9 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 
 static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-	return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0, 0);
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+	return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL);
 }
 
 /**
@@ -1165,7 +1136,43 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 		      dev_t dev)
 {
-	return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0);
+	return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL);
+}
+
+/**
+ * gfs2_atomic_open - Atomically open a file
+ * @dir: The directory
+ * @dentry: The proposed new entry
+ * @file: The proposed new struct file
+ * @flags: open flags
+ * @mode: File mode
+ * @opened: Flag to say whether the file has been opened or not
+ *
+ * Returns: error code or 0 for success
+ */
+
+static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
+                            struct file *file, unsigned flags,
+                            umode_t mode, int *opened)
+{
+	struct dentry *d;
+	bool excl = !!(flags & O_EXCL);
+
+	d = __gfs2_lookup(dir, dentry, file, opened);
+	if (IS_ERR(d))
+		return PTR_ERR(d);
+	if (d == NULL)
+		d = dentry;
+	if (d->d_inode) {
+		if (!(*opened & FILE_OPENED))
+			return finish_no_open(file, d);
+		return 0;
+	}
+
+	if (!(flags & O_CREAT))
+		return -ENOENT;
+
+	return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened);
 }
 
 /*
@@ -1412,7 +1419,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		if (error)
 			goto out_end_trans;
 		ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_trans_add_meta(ip->i_gl, dibh);
 		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
@@ -1580,7 +1587,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	u32 ouid, ogid, nuid, ngid;
+	kuid_t ouid, nuid;
+	kgid_t ogid, ngid;
 	int error;
 
 	ouid = inode->i_uid;
@@ -1588,16 +1596,17 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	nuid = attr->ia_uid;
 	ngid = attr->ia_gid;
 
-	if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
-		ouid = nuid = NO_QUOTA_CHANGE;
-	if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
-		ogid = ngid = NO_QUOTA_CHANGE;
+	if (!(attr->ia_valid & ATTR_UID) || uid_eq(ouid, nuid))
+		ouid = nuid = NO_UID_QUOTA_CHANGE;
+	if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
+		ogid = ngid = NO_GID_QUOTA_CHANGE;
 
 	error = gfs2_quota_lock(ip, nuid, ngid);
 	if (error)
 		return error;
 
-	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
+	if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
+	    !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
 		error = gfs2_quota_check(ip, nuid, ngid);
 		if (error)
 			goto out_gunlock_q;
@@ -1611,7 +1620,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	if (error)
 		goto out_end_trans;
 
-	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
+	if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
+	    !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
 		u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
 		gfs2_quota_change(ip, -blocks, ouid, ogid);
 		gfs2_quota_change(ip, blocks, nuid, ngid);
@@ -1842,6 +1852,7 @@ const struct inode_operations gfs2_dir_iops = {
 	.removexattr = gfs2_removexattr,
 	.fiemap = gfs2_fiemap,
 	.get_acl = gfs2_get_acl,
+	.atomic_open = gfs2_atomic_open,
 };
 
 const struct inode_operations gfs2_symlink_iops = {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c53c7477f6d..ba4d9492d42 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -109,6 +109,7 @@ extern int gfs2_permission(struct inode *inode, int mask);
 extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
+extern int gfs2_open_common(struct inode *inode, struct file *file);
 
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 9802de0f85e..c8423d6de6c 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -483,12 +483,8 @@ static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,
 
 static int all_jid_bits_clear(char *lvb)
 {
-	int i;
-	for (i = JID_BITMAP_OFFSET; i < GDLM_LVB_SIZE; i++) {
-		if (lvb[i])
-			return 0;
-	}
-	return 1;
+	return !memchr_inv(lvb + JID_BITMAP_OFFSET, 0,
+			GDLM_LVB_SIZE - JID_BITMAP_OFFSET);
 }
 
 static void sync_wait_cb(void *arg)
@@ -580,7 +576,6 @@ static void gfs2_control_func(struct work_struct *work)
 {
 	struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work);
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	char lvb_bits[GDLM_LVB_SIZE];
 	uint32_t block_gen, start_gen, lvb_gen, flags;
 	int recover_set = 0;
 	int write_lvb = 0;
@@ -634,7 +629,7 @@ static void gfs2_control_func(struct work_struct *work)
 		return;
 	}
 
-	control_lvb_read(ls, &lvb_gen, lvb_bits);
+	control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits);
 
 	spin_lock(&ls->ls_recover_spin);
 	if (block_gen != ls->ls_recover_block ||
@@ -664,10 +659,10 @@ static void gfs2_control_func(struct work_struct *work)
 
 			ls->ls_recover_result[i] = 0;
 
-			if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET))
+			if (!test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET))
 				continue;
 
-			__clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET);
+			__clear_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET);
 			write_lvb = 1;
 		}
 	}
@@ -691,7 +686,7 @@ static void gfs2_control_func(struct work_struct *work)
 				continue;
 			if (ls->ls_recover_submit[i] < start_gen) {
 				ls->ls_recover_submit[i] = 0;
-				__set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET);
+				__set_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET);
 			}
 		}
 		/* even if there are no bits to set, we need to write the
@@ -705,7 +700,7 @@ static void gfs2_control_func(struct work_struct *work)
 	spin_unlock(&ls->ls_recover_spin);
 
 	if (write_lvb) {
-		control_lvb_write(ls, start_gen, lvb_bits);
+		control_lvb_write(ls, start_gen, ls->ls_lvb_bits);
 		flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK;
 	} else {
 		flags = DLM_LKF_CONVERT;
@@ -725,7 +720,7 @@ static void gfs2_control_func(struct work_struct *work)
 	 */
 
 	for (i = 0; i < recover_size; i++) {
-		if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) {
+		if (test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) {
 			fs_info(sdp, "recover generation %u jid %d\n",
 				start_gen, i);
 			gfs2_recover_set(sdp, i);
@@ -758,7 +753,6 @@ static void gfs2_control_func(struct work_struct *work)
 static int control_mount(struct gfs2_sbd *sdp)
 {
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	char lvb_bits[GDLM_LVB_SIZE];
 	uint32_t start_gen, block_gen, mount_gen, lvb_gen;
 	int mounted_mode;
 	int retries = 0;
@@ -857,7 +851,7 @@ locks_done:
 	 * lvb_gen will be non-zero.
 	 */
 
-	control_lvb_read(ls, &lvb_gen, lvb_bits);
+	control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits);
 
 	if (lvb_gen == 0xFFFFFFFF) {
 		/* special value to force mount attempts to fail */
@@ -887,7 +881,7 @@ locks_done:
 	 * and all lvb bits to be clear (no pending journal recoveries.)
 	 */
 
-	if (!all_jid_bits_clear(lvb_bits)) {
+	if (!all_jid_bits_clear(ls->ls_lvb_bits)) {
 		/* journals need recovery, wait until all are clear */
 		fs_info(sdp, "control_mount wait for journal recovery\n");
 		goto restart;
@@ -949,7 +943,6 @@ static int dlm_recovery_wait(void *word)
 static int control_first_done(struct gfs2_sbd *sdp)
 {
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	char lvb_bits[GDLM_LVB_SIZE];
 	uint32_t start_gen, block_gen;
 	int error;
 
@@ -991,8 +984,8 @@ restart:
 	memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
 	spin_unlock(&ls->ls_recover_spin);
 
-	memset(lvb_bits, 0, sizeof(lvb_bits));
-	control_lvb_write(ls, start_gen, lvb_bits);
+	memset(ls->ls_lvb_bits, 0, GDLM_LVB_SIZE);
+	control_lvb_write(ls, start_gen, ls->ls_lvb_bits);
 
 	error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT);
 	if (error)
@@ -1022,6 +1015,12 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots,
 	uint32_t old_size, new_size;
 	int i, max_jid;
 
+	if (!ls->ls_lvb_bits) {
+		ls->ls_lvb_bits = kzalloc(GDLM_LVB_SIZE, GFP_NOFS);
+		if (!ls->ls_lvb_bits)
+			return -ENOMEM;
+	}
+
 	max_jid = 0;
 	for (i = 0; i < num_slots; i++) {
 		if (max_jid < slots[i].slot - 1)
@@ -1057,6 +1056,7 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots,
 
 static void free_recover_size(struct lm_lockstruct *ls)
 {
+	kfree(ls->ls_lvb_bits);
 	kfree(ls->ls_recover_submit);
 	kfree(ls->ls_recover_result);
 	ls->ls_recover_submit = NULL;
@@ -1205,6 +1205,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
 	ls->ls_recover_size = 0;
 	ls->ls_recover_submit = NULL;
 	ls->ls_recover_result = NULL;
+	ls->ls_lvb_bits = NULL;
 
 	error = set_recover_size(sdp, NULL, 0);
 	if (error)
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index f4beeb9c81c..610613fb65b 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -73,7 +73,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
 
 void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
 {
-	bd->bd_ail = NULL;
+	bd->bd_tr = NULL;
 	list_del_init(&bd->bd_ail_st_list);
 	list_del_init(&bd->bd_ail_gl_list);
 	atomic_dec(&bd->bd_gl->gl_ail_count);
@@ -90,7 +90,7 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
 
 static int gfs2_ail1_start_one(struct gfs2_sbd *sdp,
 			       struct writeback_control *wbc,
-			       struct gfs2_ail *ai)
+			       struct gfs2_trans *tr)
 __releases(&sdp->sd_ail_lock)
 __acquires(&sdp->sd_ail_lock)
 {
@@ -99,15 +99,15 @@ __acquires(&sdp->sd_ail_lock)
 	struct gfs2_bufdata *bd, *s;
 	struct buffer_head *bh;
 
-	list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, bd_ail_st_list) {
+	list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) {
 		bh = bd->bd_bh;
 
-		gfs2_assert(sdp, bd->bd_ail == ai);
+		gfs2_assert(sdp, bd->bd_tr == tr);
 
 		if (!buffer_busy(bh)) {
 			if (!buffer_uptodate(bh))
 				gfs2_io_error_bh(sdp, bh);
-			list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
+			list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list);
 			continue;
 		}
 
@@ -116,7 +116,7 @@ __acquires(&sdp->sd_ail_lock)
 		if (gl == bd->bd_gl)
 			continue;
 		gl = bd->bd_gl;
-		list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+		list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list);
 		mapping = bh->b_page->mapping;
 		if (!mapping)
 			continue;
@@ -144,15 +144,15 @@ __acquires(&sdp->sd_ail_lock)
 void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
 {
 	struct list_head *head = &sdp->sd_ail1_list;
-	struct gfs2_ail *ai;
+	struct gfs2_trans *tr;
 
 	trace_gfs2_ail_flush(sdp, wbc, 1);
 	spin_lock(&sdp->sd_ail_lock);
 restart:
-	list_for_each_entry_reverse(ai, head, ai_list) {
+	list_for_each_entry_reverse(tr, head, tr_list) {
 		if (wbc->nr_to_write <= 0)
 			break;
-		if (gfs2_ail1_start_one(sdp, wbc, ai))
+		if (gfs2_ail1_start_one(sdp, wbc, tr))
 			goto restart;
 	}
 	spin_unlock(&sdp->sd_ail_lock);
@@ -183,20 +183,20 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
  *
  */
 
-static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 {
 	struct gfs2_bufdata *bd, *s;
 	struct buffer_head *bh;
 
-	list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
+	list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list,
 					 bd_ail_st_list) {
 		bh = bd->bd_bh;
-		gfs2_assert(sdp, bd->bd_ail == ai);
+		gfs2_assert(sdp, bd->bd_tr == tr);
 		if (buffer_busy(bh))
 			continue;
 		if (!buffer_uptodate(bh))
 			gfs2_io_error_bh(sdp, bh);
-		list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
+		list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list);
 	}
 
 }
@@ -210,16 +210,17 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 
 static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
 {
-	struct gfs2_ail *ai, *s;
+	struct gfs2_trans *tr, *s;
+	int oldest_tr = 1;
 	int ret;
 
 	spin_lock(&sdp->sd_ail_lock);
-	list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
-		gfs2_ail1_empty_one(sdp, ai);
-		if (list_empty(&ai->ai_ail1_list))
-			list_move(&ai->ai_list, &sdp->sd_ail2_list);
+	list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
+		gfs2_ail1_empty_one(sdp, tr);
+		if (list_empty(&tr->tr_ail1_list) && oldest_tr)
+			list_move(&tr->tr_list, &sdp->sd_ail2_list);
 		else
-			break;
+			oldest_tr = 0;
 	}
 	ret = list_empty(&sdp->sd_ail1_list);
 	spin_unlock(&sdp->sd_ail_lock);
@@ -229,13 +230,13 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
 
 static void gfs2_ail1_wait(struct gfs2_sbd *sdp)
 {
-	struct gfs2_ail *ai;
+	struct gfs2_trans *tr;
 	struct gfs2_bufdata *bd;
 	struct buffer_head *bh;
 
 	spin_lock(&sdp->sd_ail_lock);
-	list_for_each_entry_reverse(ai, &sdp->sd_ail1_list, ai_list) {
-		list_for_each_entry(bd, &ai->ai_ail1_list, bd_ail_st_list) {
+	list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
+		list_for_each_entry(bd, &tr->tr_ail1_list, bd_ail_st_list) {
 			bh = bd->bd_bh;
 			if (!buffer_locked(bh))
 				continue;
@@ -256,40 +257,40 @@ static void gfs2_ail1_wait(struct gfs2_sbd *sdp)
  *
  */
 
-static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 {
-	struct list_head *head = &ai->ai_ail2_list;
+	struct list_head *head = &tr->tr_ail2_list;
 	struct gfs2_bufdata *bd;
 
 	while (!list_empty(head)) {
 		bd = list_entry(head->prev, struct gfs2_bufdata,
 				bd_ail_st_list);
-		gfs2_assert(sdp, bd->bd_ail == ai);
+		gfs2_assert(sdp, bd->bd_tr == tr);
 		gfs2_remove_from_ail(bd);
 	}
 }
 
 static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
 {
-	struct gfs2_ail *ai, *safe;
+	struct gfs2_trans *tr, *safe;
 	unsigned int old_tail = sdp->sd_log_tail;
 	int wrap = (new_tail < old_tail);
 	int a, b, rm;
 
 	spin_lock(&sdp->sd_ail_lock);
 
-	list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
-		a = (old_tail <= ai->ai_first);
-		b = (ai->ai_first < new_tail);
+	list_for_each_entry_safe(tr, safe, &sdp->sd_ail2_list, tr_list) {
+		a = (old_tail <= tr->tr_first);
+		b = (tr->tr_first < new_tail);
 		rm = (wrap) ? (a || b) : (a && b);
 		if (!rm)
 			continue;
 
-		gfs2_ail2_empty_one(sdp, ai);
-		list_del(&ai->ai_list);
-		gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
-		gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
-		kfree(ai);
+		gfs2_ail2_empty_one(sdp, tr);
+		list_del(&tr->tr_list);
+		gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list));
+		gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list));
+		kfree(tr);
 	}
 
 	spin_unlock(&sdp->sd_ail_lock);
@@ -317,7 +318,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
 
 int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
 {
-	unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
+	unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize);
 	unsigned wanted = blks + reserved_blks;
 	DEFINE_WAIT(wait);
 	int did_wait = 0;
@@ -435,7 +436,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp)
 
 static unsigned int current_tail(struct gfs2_sbd *sdp)
 {
-	struct gfs2_ail *ai;
+	struct gfs2_trans *tr;
 	unsigned int tail;
 
 	spin_lock(&sdp->sd_ail_lock);
@@ -443,8 +444,9 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
 	if (list_empty(&sdp->sd_ail1_list)) {
 		tail = sdp->sd_log_head;
 	} else {
-		ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list);
-		tail = ai->ai_first;
+		tr = list_entry(sdp->sd_ail1_list.prev, struct gfs2_trans,
+				tr_list);
+		tail = tr->tr_first;
 	}
 
 	spin_unlock(&sdp->sd_ail_lock);
@@ -482,70 +484,136 @@ static void log_flush_wait(struct gfs2_sbd *sdp)
 	}
 }
 
-static int bd_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int ip_cmp(void *priv, struct list_head *a, struct list_head *b)
 {
-	struct gfs2_bufdata *bda, *bdb;
+	struct gfs2_inode *ipa, *ipb;
 
-	bda = list_entry(a, struct gfs2_bufdata, bd_list);
-	bdb = list_entry(b, struct gfs2_bufdata, bd_list);
+	ipa = list_entry(a, struct gfs2_inode, i_ordered);
+	ipb = list_entry(b, struct gfs2_inode, i_ordered);
 
-	if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
+	if (ipa->i_no_addr < ipb->i_no_addr)
 		return -1;
-	if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
+	if (ipa->i_no_addr > ipb->i_no_addr)
 		return 1;
 	return 0;
 }
 
 static void gfs2_ordered_write(struct gfs2_sbd *sdp)
 {
-	struct gfs2_bufdata *bd;
-	struct buffer_head *bh;
+	struct gfs2_inode *ip;
 	LIST_HEAD(written);
 
-	gfs2_log_lock(sdp);
-	list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp);
+	spin_lock(&sdp->sd_ordered_lock);
+	list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp);
 	while (!list_empty(&sdp->sd_log_le_ordered)) {
-		bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_list);
-		list_move(&bd->bd_list, &written);
-		bh = bd->bd_bh;
-		if (!buffer_dirty(bh))
+		ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered);
+		list_move(&ip->i_ordered, &written);
+		if (ip->i_inode.i_mapping->nrpages == 0)
 			continue;
-		get_bh(bh);
-		gfs2_log_unlock(sdp);
-		lock_buffer(bh);
-		if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
-			bh->b_end_io = end_buffer_write_sync;
-			submit_bh(WRITE_SYNC, bh);
-		} else {
-			unlock_buffer(bh);
-			brelse(bh);
-		}
-		gfs2_log_lock(sdp);
+		spin_unlock(&sdp->sd_ordered_lock);
+		filemap_fdatawrite(ip->i_inode.i_mapping);
+		spin_lock(&sdp->sd_ordered_lock);
 	}
 	list_splice(&written, &sdp->sd_log_le_ordered);
-	gfs2_log_unlock(sdp);
+	spin_unlock(&sdp->sd_ordered_lock);
 }
 
 static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
 {
-	struct gfs2_bufdata *bd;
-	struct buffer_head *bh;
+	struct gfs2_inode *ip;
 
-	gfs2_log_lock(sdp);
+	spin_lock(&sdp->sd_ordered_lock);
 	while (!list_empty(&sdp->sd_log_le_ordered)) {
-		bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_list);
-		bh = bd->bd_bh;
-		if (buffer_locked(bh)) {
-			get_bh(bh);
-			gfs2_log_unlock(sdp);
-			wait_on_buffer(bh);
-			brelse(bh);
-			gfs2_log_lock(sdp);
+		ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered);
+		list_del(&ip->i_ordered);
+		WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags));
+		if (ip->i_inode.i_mapping->nrpages == 0)
 			continue;
+		spin_unlock(&sdp->sd_ordered_lock);
+		filemap_fdatawait(ip->i_inode.i_mapping);
+		spin_lock(&sdp->sd_ordered_lock);
+	}
+	spin_unlock(&sdp->sd_ordered_lock);
+}
+
+void gfs2_ordered_del_inode(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+	spin_lock(&sdp->sd_ordered_lock);
+	if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags))
+		list_del(&ip->i_ordered);
+	spin_unlock(&sdp->sd_ordered_lock);
+}
+
+void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
+{
+	struct buffer_head *bh = bd->bd_bh;
+	struct gfs2_glock *gl = bd->bd_gl;
+
+	gfs2_remove_from_ail(bd);
+	bd->bd_bh = NULL;
+	bh->b_private = NULL;
+	bd->bd_blkno = bh->b_blocknr;
+	bd->bd_ops = &gfs2_revoke_lops;
+	sdp->sd_log_num_revoke++;
+	atomic_inc(&gl->gl_revokes);
+	set_bit(GLF_LFLUSH, &gl->gl_flags);
+	list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
+}
+
+void gfs2_write_revokes(struct gfs2_sbd *sdp)
+{
+	struct gfs2_trans *tr;
+	struct gfs2_bufdata *bd, *tmp;
+	int have_revokes = 0;
+	int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
+
+	gfs2_ail1_empty(sdp);
+	spin_lock(&sdp->sd_ail_lock);
+	list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+		list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) {
+			if (list_empty(&bd->bd_list)) {
+				have_revokes = 1;
+				goto done;
+			}
+		}
+	}
+done:
+	spin_unlock(&sdp->sd_ail_lock);
+	if (have_revokes == 0)
+		return;
+	while (sdp->sd_log_num_revoke > max_revokes)
+		max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
+	max_revokes -= sdp->sd_log_num_revoke;
+	if (!sdp->sd_log_num_revoke) {
+		atomic_dec(&sdp->sd_log_blks_free);
+		/* If no blocks have been reserved, we need to also
+		 * reserve a block for the header */
+		if (!sdp->sd_log_blks_reserved)
+			atomic_dec(&sdp->sd_log_blks_free);
+	}
+	gfs2_log_lock(sdp);
+	spin_lock(&sdp->sd_ail_lock);
+	list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+		list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) {
+			if (max_revokes == 0)
+				goto out_of_blocks;
+			if (!list_empty(&bd->bd_list))
+				continue;
+			gfs2_add_revoke(sdp, bd);
+			max_revokes--;
 		}
-		list_del_init(&bd->bd_list);
 	}
+out_of_blocks:
+	spin_unlock(&sdp->sd_ail_lock);
 	gfs2_log_unlock(sdp);
+
+	if (!sdp->sd_log_num_revoke) {
+		atomic_inc(&sdp->sd_log_blks_free);
+		if (!sdp->sd_log_blks_reserved)
+			atomic_inc(&sdp->sd_log_blks_free);
+	}
 }
 
 /**
@@ -565,7 +633,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
 	lh = page_address(page);
 	clear_page(lh);
 
-	gfs2_ail1_empty(sdp);
 	tail = current_tail(sdp);
 
 	lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -604,7 +671,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
 
 void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 {
-	struct gfs2_ail *ai;
+	struct gfs2_trans *tr;
 
 	down_write(&sdp->sd_log_flush_lock);
 
@@ -615,9 +682,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	}
 	trace_gfs2_log_flush(sdp, 1);
 
-	ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
-	INIT_LIST_HEAD(&ai->ai_ail1_list);
-	INIT_LIST_HEAD(&ai->ai_ail2_list);
+	tr = sdp->sd_log_tr;
+	if (tr) {
+		sdp->sd_log_tr = NULL;
+		INIT_LIST_HEAD(&tr->tr_ail1_list);
+		INIT_LIST_HEAD(&tr->tr_ail2_list);
+	}
 
 	if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) {
 		printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf,
@@ -634,7 +704,8 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 
 	sdp->sd_log_flush_head = sdp->sd_log_head;
 	sdp->sd_log_flush_wrapped = 0;
-	ai->ai_first = sdp->sd_log_flush_head;
+	if (tr)
+		tr->tr_first = sdp->sd_log_flush_head;
 
 	gfs2_ordered_write(sdp);
 	lops_before_commit(sdp);
@@ -647,7 +718,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 		trace_gfs2_log_blocks(sdp, -1);
 		log_write_header(sdp, 0);
 	}
-	lops_after_commit(sdp, ai);
+	lops_after_commit(sdp, tr);
 
 	gfs2_log_lock(sdp);
 	sdp->sd_log_head = sdp->sd_log_flush_head;
@@ -657,16 +728,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	sdp->sd_log_commited_revoke = 0;
 
 	spin_lock(&sdp->sd_ail_lock);
-	if (!list_empty(&ai->ai_ail1_list)) {
-		list_add(&ai->ai_list, &sdp->sd_ail1_list);
-		ai = NULL;
+	if (tr && !list_empty(&tr->tr_ail1_list)) {
+		list_add(&tr->tr_list, &sdp->sd_ail1_list);
+		tr = NULL;
 	}
 	spin_unlock(&sdp->sd_ail_lock);
 	gfs2_log_unlock(sdp);
 	trace_gfs2_log_flush(sdp, 0);
 	up_write(&sdp->sd_log_flush_lock);
 
-	kfree(ai);
+	kfree(tr);
 }
 
 static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
@@ -691,6 +762,12 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 			     sdp->sd_jdesc->jd_blocks);
 	sdp->sd_log_blks_reserved = reserved;
 
+	if (sdp->sd_log_tr == NULL &&
+	    (tr->tr_num_buf_new || tr->tr_num_databuf_new)) {
+		gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl);
+		sdp->sd_log_tr = tr;
+		tr->tr_attached = 1;
+	}
 	gfs2_log_unlock(sdp);
 }
 
@@ -712,7 +789,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 {
 	log_refund(sdp, tr);
-	up_read(&sdp->sd_log_flush_lock);
 
 	if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
 	    ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 3fd5215ea25..37216634f0a 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -48,6 +48,18 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
 	sdp->sd_log_head = sdp->sd_log_tail = value;
 }
 
+static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+	if (!test_bit(GIF_ORDERED, &ip->i_flags)) {
+		spin_lock(&sdp->sd_ordered_lock);
+		if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags))
+			list_add(&ip->i_ordered, &sdp->sd_log_le_ordered);
+		spin_unlock(&sdp->sd_ordered_lock);
+	}
+}
+extern void gfs2_ordered_del_inode(struct gfs2_inode *ip);
 extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
 			    unsigned int ssize);
 
@@ -60,5 +72,7 @@ extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
 extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
 extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
 extern int gfs2_logd(void *data);
+extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+extern void gfs2_write_revokes(struct gfs2_sbd *sdp);
 
 #endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 9ceccb1595a..17c5b5d7dc8 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -16,6 +16,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/bio.h>
 #include <linux/fs.h>
+#include <linux/list_sort.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -37,7 +38,7 @@
  *
  * The log lock must be held when calling this function
  */
-static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
+void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
 {
 	struct gfs2_bufdata *bd;
 
@@ -53,8 +54,8 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
 	 * to in-place disk block, remove it from the AIL.
 	 */
 	spin_lock(&sdp->sd_ail_lock);
-	if (bd->bd_ail)
-		list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
+	if (bd->bd_tr)
+		list_move(&bd->bd_ail_st_list, &bd->bd_tr->tr_ail2_list);
 	spin_unlock(&sdp->sd_ail_lock);
 	get_bh(bh);
 	atomic_inc(&sdp->sd_log_pinned);
@@ -94,7 +95,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
  */
 
 static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
-		       struct gfs2_ail *ai)
+		       struct gfs2_trans *tr)
 {
 	struct gfs2_bufdata *bd = bh->b_private;
 
@@ -109,7 +110,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 		maybe_release_space(bd);
 
 	spin_lock(&sdp->sd_ail_lock);
-	if (bd->bd_ail) {
+	if (bd->bd_tr) {
 		list_del(&bd->bd_ail_st_list);
 		brelse(bh);
 	} else {
@@ -117,8 +118,8 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 		list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
 		atomic_inc(&gl->gl_ail_count);
 	}
-	bd->bd_ail = ai;
-	list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+	bd->bd_tr = tr;
+	list_add(&bd->bd_ail_st_list, &tr->tr_ail1_list);
 	spin_unlock(&sdp->sd_ail_lock);
 
 	clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
@@ -212,7 +213,7 @@ static void gfs2_end_log_write(struct bio *bio, int error)
 		fs_err(sdp, "Error %d writing to log\n", error);
 	}
 
-	bio_for_each_segment(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, i) {
 		page = bvec->bv_page;
 		if (page_has_buffers(page))
 			gfs2_end_log_write_bh(sdp, bvec, error);
@@ -300,7 +301,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno)
 	u64 nblk;
 
 	if (bio) {
-		nblk = bio->bi_sector + bio_sectors(bio);
+		nblk = bio_end_sector(bio);
 		nblk >>= sdp->sd_fsb2bb_shift;
 		if (blkno == nblk)
 			return bio;
@@ -388,32 +389,6 @@ static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
 	return page;
 }
 
-static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
-{
-	struct gfs2_meta_header *mh;
-	struct gfs2_trans *tr;
-
-	tr = current->journal_info;
-	tr->tr_touched = 1;
-	if (!list_empty(&bd->bd_list))
-		return;
-	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
-	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
-	mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
-	if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
-		printk(KERN_ERR
-		       "Attempting to add uninitialised block to journal (inplace block=%lld)\n",
-		       (unsigned long long)bd->bd_bh->b_blocknr);
-		BUG();
-	}
-	gfs2_pin(sdp, bd->bd_bh);
-	mh->__pad0 = cpu_to_be64(0);
-	mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
-	sdp->sd_log_num_buf++;
-	list_add(&bd->bd_list, &sdp->sd_log_le_buf);
-	tr->tr_num_buf_new++;
-}
-
 static void gfs2_check_magic(struct buffer_head *bh)
 {
 	void *kaddr;
@@ -427,6 +402,20 @@ static void gfs2_check_magic(struct buffer_head *bh)
 	kunmap_atomic(kaddr);
 }
 
+static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct gfs2_bufdata *bda, *bdb;
+
+	bda = list_entry(a, struct gfs2_bufdata, bd_list);
+	bdb = list_entry(b, struct gfs2_bufdata, bd_list);
+
+	if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
+		return -1;
+	if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
+		return 1;
+	return 0;
+}
+
 static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
 				unsigned int total, struct list_head *blist,
 				bool is_databuf)
@@ -439,13 +428,16 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
 	__be64 *ptr;
 
 	gfs2_log_lock(sdp);
+	list_sort(NULL, blist, blocknr_cmp);
 	bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list);
 	while(total) {
 		num = total;
 		if (total > limit)
 			num = limit;
 		gfs2_log_unlock(sdp);
-		page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA, num + 1, num);
+		page = gfs2_get_log_desc(sdp,
+					 is_databuf ? GFS2_LOG_DESC_JDATA :
+					 GFS2_LOG_DESC_METADATA, num + 1, num);
 		ld = page_address(page);
 		gfs2_log_lock(sdp);
 		ptr = (__be64 *)(ld + 1);
@@ -506,17 +498,22 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 			   &sdp->sd_log_le_buf, 0);
 }
 
-static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 {
 	struct list_head *head = &sdp->sd_log_le_buf;
 	struct gfs2_bufdata *bd;
 
+	if (tr == NULL) {
+		gfs2_assert(sdp, list_empty(head));
+		return;
+	}
+
 	while (!list_empty(head)) {
 		bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
 		list_del_init(&bd->bd_list);
 		sdp->sd_log_num_buf--;
 
-		gfs2_unpin(sdp, bd->bd_bh, ai);
+		gfs2_unpin(sdp, bd->bd_bh, tr);
 	}
 	gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
 }
@@ -600,20 +597,6 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
 	        jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
 }
 
-static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
-{
-	struct gfs2_glock *gl = bd->bd_gl;
-	struct gfs2_trans *tr;
-
-	tr = current->journal_info;
-	tr->tr_touched = 1;
-	tr->tr_num_revoke++;
-	sdp->sd_log_num_revoke++;
-	atomic_inc(&gl->gl_revokes);
-	set_bit(GLF_LFLUSH, &gl->gl_flags);
-	list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
-}
-
 static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 {
 	struct gfs2_meta_header *mh;
@@ -623,6 +606,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 	struct page *page;
 	unsigned int length;
 
+	gfs2_write_revokes(sdp);
 	if (!sdp->sd_log_num_revoke)
 		return;
 
@@ -653,7 +637,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 	gfs2_log_write_page(sdp, page);
 }
 
-static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 {
 	struct list_head *head = &sdp->sd_log_le_revoke;
 	struct gfs2_bufdata *bd;
@@ -749,44 +733,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
 }
 
 /**
- * databuf_lo_add - Add a databuf to the transaction.
- *
- * This is used in two distinct cases:
- * i) In ordered write mode
- *    We put the data buffer on a list so that we can ensure that its
- *    synced to disk at the right time
- * ii) In journaled data mode
- *    We need to journal the data block in the same way as metadata in
- *    the functions above. The difference is that here we have a tag
- *    which is two __be64's being the block number (as per meta data)
- *    and a flag which says whether the data block needs escaping or
- *    not. This means we need a new log entry for each 251 or so data
- *    blocks, which isn't an enormous overhead but twice as much as
- *    for normal metadata blocks.
- */
-static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
-{
-	struct gfs2_trans *tr = current->journal_info;
-	struct address_space *mapping = bd->bd_bh->b_page->mapping;
-	struct gfs2_inode *ip = GFS2_I(mapping->host);
-
-	if (tr)
-		tr->tr_touched = 1;
-	if (!list_empty(&bd->bd_list))
-		return;
-	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
-	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
-	if (gfs2_is_jdata(ip)) {
-		gfs2_pin(sdp, bd->bd_bh);
-		tr->tr_num_databuf_new++;
-		sdp->sd_log_num_databuf++;
-		list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf);
-	} else {
-		list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered);
-	}
-}
-
-/**
  * databuf_lo_before_commit - Scan the data buffers, writing as we go
  *
  */
@@ -869,23 +815,27 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
 		jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
 }
 
-static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 {
 	struct list_head *head = &sdp->sd_log_le_databuf;
 	struct gfs2_bufdata *bd;
 
+	if (tr == NULL) {
+		gfs2_assert(sdp, list_empty(head));
+		return;
+	}
+
 	while (!list_empty(head)) {
 		bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
 		list_del_init(&bd->bd_list);
 		sdp->sd_log_num_databuf--;
-		gfs2_unpin(sdp, bd->bd_bh, ai);
+		gfs2_unpin(sdp, bd->bd_bh, tr);
 	}
 	gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
 }
 
 
 const struct gfs2_log_operations gfs2_buf_lops = {
-	.lo_add = buf_lo_add,
 	.lo_before_commit = buf_lo_before_commit,
 	.lo_after_commit = buf_lo_after_commit,
 	.lo_before_scan = buf_lo_before_scan,
@@ -895,7 +845,6 @@ const struct gfs2_log_operations gfs2_buf_lops = {
 };
 
 const struct gfs2_log_operations gfs2_revoke_lops = {
-	.lo_add = revoke_lo_add,
 	.lo_before_commit = revoke_lo_before_commit,
 	.lo_after_commit = revoke_lo_after_commit,
 	.lo_before_scan = revoke_lo_before_scan,
@@ -904,12 +853,7 @@ const struct gfs2_log_operations gfs2_revoke_lops = {
 	.lo_name = "revoke",
 };
 
-const struct gfs2_log_operations gfs2_rg_lops = {
-	.lo_name = "rg",
-};
-
 const struct gfs2_log_operations gfs2_databuf_lops = {
-	.lo_add = databuf_lo_add,
 	.lo_before_commit = databuf_lo_before_commit,
 	.lo_after_commit = databuf_lo_after_commit,
 	.lo_scan_elements = databuf_lo_scan_elements,
@@ -920,7 +864,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
 const struct gfs2_log_operations *gfs2_log_ops[] = {
 	&gfs2_databuf_lops,
 	&gfs2_buf_lops,
-	&gfs2_rg_lops,
 	&gfs2_revoke_lops,
 	NULL,
 };
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 954a330585f..9ca2e643841 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -23,12 +23,12 @@
 extern const struct gfs2_log_operations gfs2_glock_lops;
 extern const struct gfs2_log_operations gfs2_buf_lops;
 extern const struct gfs2_log_operations gfs2_revoke_lops;
-extern const struct gfs2_log_operations gfs2_rg_lops;
 extern const struct gfs2_log_operations gfs2_databuf_lops;
 
 extern const struct gfs2_log_operations *gfs2_log_ops[];
 extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page);
 extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw);
+extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
 
 static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
 {
@@ -46,19 +46,6 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp)
 	return limit;
 }
 
-static inline void lops_init_le(struct gfs2_bufdata *bd,
-				const struct gfs2_log_operations *lops)
-{
-	INIT_LIST_HEAD(&bd->bd_list);
-	bd->bd_ops = lops;
-}
-
-static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
-{
-	if (bd->bd_ops->lo_add)
-		bd->bd_ops->lo_add(sdp, bd);
-}
-
 static inline void lops_before_commit(struct gfs2_sbd *sdp)
 {
 	int x;
@@ -67,12 +54,13 @@ static inline void lops_before_commit(struct gfs2_sbd *sdp)
 			gfs2_log_ops[x]->lo_before_commit(sdp);
 }
 
-static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+static inline void lops_after_commit(struct gfs2_sbd *sdp,
+				     struct gfs2_trans *tr)
 {
 	int x;
 	for (x = 0; gfs2_log_ops[x]; x++)
 		if (gfs2_log_ops[x]->lo_after_commit)
-			gfs2_log_ops[x]->lo_after_commit(sdp, ai);
+			gfs2_log_ops[x]->lo_after_commit(sdp, tr);
 }
 
 static inline void lops_before_scan(struct gfs2_jdesc *jd,
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 22255d96b27..0da390686c0 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -271,41 +271,6 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
 	return 0;
 }
 
-/**
- * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
- * @gl: the glock the buffer belongs to
- * @bh: The buffer to be attached to
- * @meta: Flag to indicate whether its metadata or not
- */
-
-void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
-			 int meta)
-{
-	struct gfs2_bufdata *bd;
-
-	if (meta)
-		lock_page(bh->b_page);
-
-	if (bh->b_private) {
-		if (meta)
-			unlock_page(bh->b_page);
-		return;
-	}
-
-	bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
-	bd->bd_bh = bh;
-	bd->bd_gl = gl;
-
-	if (meta)
-		lops_init_le(bd, &gfs2_buf_lops);
-	else
-		lops_init_le(bd, &gfs2_databuf_lops);
-	bh->b_private = bd;
-
-	if (meta)
-		unlock_page(bh->b_page);
-}
-
 void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
 {
 	struct address_space *mapping = bh->b_page->mapping;
@@ -330,11 +295,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
 	}
 	if (bd) {
 		spin_lock(&sdp->sd_ail_lock);
-		if (bd->bd_ail) {
-			gfs2_remove_from_ail(bd);
-			bh->b_private = NULL;
-			bd->bd_bh = NULL;
-			bd->bd_blkno = bh->b_blocknr;
+		if (bd->bd_tr) {
 			gfs2_trans_add_revoke(sdp, bd);
 		}
 		spin_unlock(&sdp->sd_ail_lock);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index c30973b07a7..0d4c843b6f8 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -56,9 +56,6 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
 int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
 struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create);
 
-void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
-			 int meta);
-
 void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
 			      int meta);
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 0e3554edb8f..0262c190b6f 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -20,6 +20,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/quotaops.h>
 #include <linux/lockdep.h>
+#include <linux/module.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -81,6 +82,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	init_waitqueue_head(&sdp->sd_glock_wait);
 	atomic_set(&sdp->sd_glock_disposal, 0);
 	init_completion(&sdp->sd_locking_init);
+	init_completion(&sdp->sd_wdack);
 	spin_lock_init(&sdp->sd_statfs_spin);
 
 	spin_lock_init(&sdp->sd_rindex_spin);
@@ -102,6 +104,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
 	INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
 	INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
+	spin_lock_init(&sdp->sd_ordered_lock);
 
 	init_waitqueue_head(&sdp->sd_log_waitq);
 	init_waitqueue_head(&sdp->sd_logd_waitq);
@@ -115,8 +118,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 	INIT_LIST_HEAD(&sdp->sd_revoke_list);
 
-	mutex_init(&sdp->sd_freeze_lock);
-
 	return sdp;
 }
 
@@ -915,16 +916,16 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
 		goto fail_quotad;
 
 	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
-	error = IS_ERR(p);
-	if (error) {
+	if (IS_ERR(p)) {
+		error = PTR_ERR(p);
 		fs_err(sdp, "can't start logd thread: %d\n", error);
 		return error;
 	}
 	sdp->sd_logd_process = p;
 
 	p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
-	error = IS_ERR(p);
-	if (error) {
+	if (IS_ERR(p)) {
+		error = PTR_ERR(p);
 		fs_err(sdp, "can't start quotad thread: %d\n", error);
 		goto fail;
 	}
@@ -1425,6 +1426,7 @@ struct file_system_type gfs2_fs_type = {
 	.kill_sb = gfs2_kill_sb,
 	.owner = THIS_MODULE,
 };
+MODULE_ALIAS_FS("gfs2");
 
 struct file_system_type gfs2meta_fs_type = {
 	.name = "gfs2meta",
@@ -1432,4 +1434,4 @@ struct file_system_type gfs2meta_fs_type = {
 	.mount = gfs2_mount_meta,
 	.owner = THIS_MODULE,
 };
-
+MODULE_ALIAS_FS("gfs2meta");
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index ae55e248c3b..3768c2f40e4 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -65,13 +65,10 @@
 #include "inode.h"
 #include "util.h"
 
-#define QUOTA_USER 1
-#define QUOTA_GROUP 0
-
 struct gfs2_quota_change_host {
 	u64 qc_change;
 	u32 qc_flags; /* GFS2_QCF_... */
-	u32 qc_id;
+	struct kqid qc_id;
 };
 
 static LIST_HEAD(qd_lru_list);
@@ -120,17 +117,24 @@ out:
 	return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100;
 }
 
+static u64 qd2index(struct gfs2_quota_data *qd)
+{
+	struct kqid qid = qd->qd_id;
+	return (2 * (u64)from_kqid(&init_user_ns, qid)) +
+		((qid.type == USRQUOTA) ? 0 : 1);
+}
+
 static u64 qd2offset(struct gfs2_quota_data *qd)
 {
 	u64 offset;
 
-	offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags);
+	offset = qd2index(qd);
 	offset *= sizeof(struct gfs2_quota);
 
 	return offset;
 }
 
-static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
+static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid,
 		    struct gfs2_quota_data **qdp)
 {
 	struct gfs2_quota_data *qd;
@@ -141,13 +145,11 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
 		return -ENOMEM;
 
 	atomic_set(&qd->qd_count, 1);
-	qd->qd_id = id;
-	if (user)
-		set_bit(QDF_USER, &qd->qd_flags);
+	qd->qd_id = qid;
 	qd->qd_slot = -1;
 	INIT_LIST_HEAD(&qd->qd_reclaim);
 
-	error = gfs2_glock_get(sdp, 2 * (u64)id + !user,
+	error = gfs2_glock_get(sdp, qd2index(qd),
 			      &gfs2_quota_glops, CREATE, &qd->qd_gl);
 	if (error)
 		goto fail;
@@ -161,7 +163,7 @@ fail:
 	return error;
 }
 
-static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
+static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
 		  struct gfs2_quota_data **qdp)
 {
 	struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
@@ -173,8 +175,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
 		found = 0;
 		spin_lock(&qd_lru_lock);
 		list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
-			if (qd->qd_id == id &&
-			    !test_bit(QDF_USER, &qd->qd_flags) == !user) {
+			if (qid_eq(qd->qd_id, qid)) {
 				if (!atomic_read(&qd->qd_count) &&
 				    !list_empty(&qd->qd_reclaim)) {
 					/* Remove it from reclaim list */
@@ -208,7 +209,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id,
 			return 0;
 		}
 
-		error = qd_alloc(sdp, user, id, &new_qd);
+		error = qd_alloc(sdp, qid, &new_qd);
 		if (error)
 			return error;
 	}
@@ -458,12 +459,12 @@ static void qd_unlock(struct gfs2_quota_data *qd)
 	qd_put(qd);
 }
 
-static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id,
+static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid,
 		    struct gfs2_quota_data **qdp)
 {
 	int error;
 
-	error = qd_get(sdp, user, id, qdp);
+	error = qd_get(sdp, qid, qdp);
 	if (error)
 		return error;
 
@@ -491,7 +492,7 @@ static void qdsb_put(struct gfs2_quota_data *qd)
 	qd_put(qd);
 }
 
-int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
+int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_quota_data **qd;
@@ -512,28 +513,30 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
 	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
 		return 0;
 
-	error = qdsb_get(sdp, QUOTA_USER, ip->i_inode.i_uid, qd);
+	error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd);
 	if (error)
 		goto out;
 	ip->i_res->rs_qa_qd_num++;
 	qd++;
 
-	error = qdsb_get(sdp, QUOTA_GROUP, ip->i_inode.i_gid, qd);
+	error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd);
 	if (error)
 		goto out;
 	ip->i_res->rs_qa_qd_num++;
 	qd++;
 
-	if (uid != NO_QUOTA_CHANGE && uid != ip->i_inode.i_uid) {
-		error = qdsb_get(sdp, QUOTA_USER, uid, qd);
+	if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) &&
+	    !uid_eq(uid, ip->i_inode.i_uid)) {
+		error = qdsb_get(sdp, make_kqid_uid(uid), qd);
 		if (error)
 			goto out;
 		ip->i_res->rs_qa_qd_num++;
 		qd++;
 	}
 
-	if (gid != NO_QUOTA_CHANGE && gid != ip->i_inode.i_gid) {
-		error = qdsb_get(sdp, QUOTA_GROUP, gid, qd);
+	if (!gid_eq(gid, NO_GID_QUOTA_CHANGE) &&
+	    !gid_eq(gid, ip->i_inode.i_gid)) {
+		error = qdsb_get(sdp, make_kqid_gid(gid), qd);
 		if (error)
 			goto out;
 		ip->i_res->rs_qa_qd_num++;
@@ -567,18 +570,10 @@ static int sort_qd(const void *a, const void *b)
 	const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a;
 	const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b;
 
-	if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
-	    !test_bit(QDF_USER, &qd_b->qd_flags)) {
-		if (test_bit(QDF_USER, &qd_a->qd_flags))
-			return -1;
-		else
-			return 1;
-	}
-	if (qd_a->qd_id < qd_b->qd_id)
+	if (qid_lt(qd_a->qd_id, qd_b->qd_id))
 		return -1;
-	if (qd_a->qd_id > qd_b->qd_id)
+	if (qid_lt(qd_b->qd_id, qd_a->qd_id))
 		return 1;
-
 	return 0;
 }
 
@@ -590,14 +585,14 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
 	s64 x;
 
 	mutex_lock(&sdp->sd_quota_mutex);
-	gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);
+	gfs2_trans_add_meta(ip->i_gl, qd->qd_bh);
 
 	if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
 		qc->qc_change = 0;
 		qc->qc_flags = 0;
-		if (test_bit(QDF_USER, &qd->qd_flags))
+		if (qd->qd_id.type == USRQUOTA)
 			qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
-		qc->qc_id = cpu_to_be32(qd->qd_id);
+		qc->qc_id = cpu_to_be32(from_kqid(&init_user_ns, qd->qd_id));
 	}
 
 	x = be64_to_cpu(qc->qc_change) + change;
@@ -726,7 +721,7 @@ get_a_page:
 			goto unlock_out;
 	}
 
-	gfs2_trans_add_bh(ip->i_gl, bh, 0);
+	gfs2_trans_add_data(ip->i_gl, bh);
 
 	kaddr = kmap_atomic(page);
 	if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
@@ -925,7 +920,7 @@ fail:
 	return error;
 }
 
-int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
+int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_quota_data *qd;
@@ -1040,13 +1035,13 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
 
 	printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
 	       sdp->sd_fsname, type,
-	       (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
-	       qd->qd_id);
+	       (qd->qd_id.type == USRQUOTA) ? "user" : "group",
+	       from_kqid(&init_user_ns, qd->qd_id));
 
 	return 0;
 }
 
-int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
+int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_quota_data *qd;
@@ -1063,8 +1058,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
 	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
 		qd = ip->i_res->rs_qa_qd[x];
 
-		if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
-		      (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
+		if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
+		      qid_eq(qd->qd_id, make_kqid_gid(gid))))
 			continue;
 
 		value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
@@ -1074,10 +1069,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
 
 		if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
 			print_message(qd, "exceeded");
-			quota_send_warning(make_kqid(&init_user_ns,
-						     test_bit(QDF_USER, &qd->qd_flags) ?
-						     USRQUOTA : GRPQUOTA,
-						     qd->qd_id),
+			quota_send_warning(qd->qd_id,
 					   sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN);
 
 			error = -EDQUOT;
@@ -1087,10 +1079,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
 			   time_after_eq(jiffies, qd->qd_last_warn +
 					 gfs2_tune_get(sdp,
 						gt_quota_warn_period) * HZ)) {
-			quota_send_warning(make_kqid(&init_user_ns,
-						     test_bit(QDF_USER, &qd->qd_flags) ?
-						     USRQUOTA : GRPQUOTA,
-						     qd->qd_id),
+			quota_send_warning(qd->qd_id,
 					   sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
 			error = print_message(qd, "warning");
 			qd->qd_last_warn = jiffies;
@@ -1101,7 +1090,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
 }
 
 void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
-		       u32 uid, u32 gid)
+		       kuid_t uid, kgid_t gid)
 {
 	struct gfs2_quota_data *qd;
 	unsigned int x;
@@ -1114,8 +1103,8 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
 	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
 		qd = ip->i_res->rs_qa_qd[x];
 
-		if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
-		    (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
+		if (qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
+		    qid_eq(qd->qd_id, make_kqid_gid(gid))) {
 			do_qc(qd, change);
 		}
 	}
@@ -1165,18 +1154,13 @@ int gfs2_quota_sync(struct super_block *sb, int type)
 	return error;
 }
 
-static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
-{
-	return gfs2_quota_sync(sb, type);
-}
-
-int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
+int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
 {
 	struct gfs2_quota_data *qd;
 	struct gfs2_holder q_gh;
 	int error;
 
-	error = qd_get(sdp, user, id, &qd);
+	error = qd_get(sdp, qid, &qd);
 	if (error)
 		return error;
 
@@ -1194,7 +1178,9 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
 
 	qc->qc_change = be64_to_cpu(str->qc_change);
 	qc->qc_flags = be32_to_cpu(str->qc_flags);
-	qc->qc_id = be32_to_cpu(str->qc_id);
+	qc->qc_id = make_kqid(&init_user_ns,
+			      (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA,
+			      be32_to_cpu(str->qc_id));
 }
 
 int gfs2_quota_init(struct gfs2_sbd *sdp)
@@ -1257,8 +1243,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
 			if (!qc.qc_change)
 				continue;
 
-			error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
-					 qc.qc_id, &qd);
+			error = qd_alloc(sdp, qc.qc_id, &qd);
 			if (error) {
 				brelse(bh);
 				goto fail;
@@ -1424,7 +1409,7 @@ int gfs2_quotad(void *data)
 					   &tune->gt_statfs_quantum);
 
 		/* Update quota file */
-		quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t,
+		quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
 				   &quotad_timeo, &tune->gt_quota_quantum);
 
 		/* Check for & recover partially truncated inodes */
@@ -1485,21 +1470,17 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
 	struct gfs2_quota_data *qd;
 	struct gfs2_holder q_gh;
 	int error;
-	int type;
 
 	memset(fdq, 0, sizeof(struct fs_disk_quota));
 
 	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
 		return -ESRCH; /* Crazy XFS error code */
 
-	if (qid.type == USRQUOTA)
-		type = QUOTA_USER;
-	else if (qid.type == GRPQUOTA)
-		type = QUOTA_GROUP;
-	else
+	if ((qid.type != USRQUOTA) &&
+	    (qid.type != GRPQUOTA))
 		return -EINVAL;
 
-	error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd);
+	error = qd_get(sdp, qid, &qd);
 	if (error)
 		return error;
 	error = do_glock(qd, FORCE, &q_gh);
@@ -1508,8 +1489,8 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
 
 	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
 	fdq->d_version = FS_DQUOT_VERSION;
-	fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
-	fdq->d_id = from_kqid(&init_user_ns, qid);
+	fdq->d_flags = (qid.type == USRQUOTA) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
+	fdq->d_id = from_kqid_munged(current_user_ns(), qid);
 	fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
 	fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
 	fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
@@ -1535,32 +1516,18 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
 	int alloc_required;
 	loff_t offset;
 	int error;
-	int type;
 
 	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
 		return -ESRCH; /* Crazy XFS error code */
 
-	switch(qid.type) {
-	case USRQUOTA:
-		type = QUOTA_USER;
-		if (fdq->d_flags != FS_USER_QUOTA)
-			return -EINVAL;
-		break;
-	case GRPQUOTA:
-		type = QUOTA_GROUP;
-		if (fdq->d_flags != FS_GROUP_QUOTA)
-			return -EINVAL;
-		break;
-	default:
+	if ((qid.type != USRQUOTA) &&
+	    (qid.type != GRPQUOTA))
 		return -EINVAL;
-	}
 
 	if (fdq->d_fieldmask & ~GFS2_FIELDMASK)
 		return -EINVAL;
-	if (fdq->d_id != from_kqid(&init_user_ns, qid))
-		return -EINVAL;
 
-	error = qd_get(sdp, type, from_kqid(&init_user_ns, qid), &qd);
+	error = qd_get(sdp, qid, &qd);
 	if (error)
 		return error;
 
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index f25d98b8790..4f5e6e44ed8 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -14,20 +14,21 @@ struct gfs2_inode;
 struct gfs2_sbd;
 struct shrink_control;
 
-#define NO_QUOTA_CHANGE ((u32)-1)
+#define NO_UID_QUOTA_CHANGE INVALID_UID
+#define NO_GID_QUOTA_CHANGE INVALID_GID
 
-extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
 extern void gfs2_quota_unhold(struct gfs2_inode *ip);
 
-extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
 extern void gfs2_quota_unlock(struct gfs2_inode *ip);
 
-extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
 extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
-			      u32 uid, u32 gid);
+			      kuid_t uid, kgid_t gid);
 
 extern int gfs2_quota_sync(struct super_block *sb, int type);
-extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
+extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid);
 
 extern int gfs2_quota_init(struct gfs2_sbd *sdp);
 extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
@@ -41,7 +42,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 	int ret;
 	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
 		return 0;
-	ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
 	if (ret)
 		return ret;
 	if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index b7eff078fe9..69317435faa 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -576,7 +576,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
 	RB_CLEAR_NODE(&ip->i_res->rs_node);
 out:
 	up_write(&ip->i_rw_mutex);
-	return 0;
+	return error;
 }
 
 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
@@ -592,7 +592,7 @@ static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
  * @rs: The reservation to remove
  *
  */
-static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs)
+static void __rs_deltree(struct gfs2_blkreserv *rs)
 {
 	struct gfs2_rgrpd *rgd;
 
@@ -605,7 +605,7 @@ static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs)
 	RB_CLEAR_NODE(&rs->rs_node);
 
 	if (rs->rs_free) {
-		/* return reserved blocks to the rgrp and the ip */
+		/* return reserved blocks to the rgrp */
 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
 		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
 		rs->rs_free = 0;
@@ -619,14 +619,14 @@ static void __rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs)
  * @rs: The reservation to remove
  *
  */
-void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs)
+void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
 {
 	struct gfs2_rgrpd *rgd;
 
 	rgd = rs->rs_rbm.rgd;
 	if (rgd) {
 		spin_lock(&rgd->rd_rsspin);
-		__rs_deltree(ip, rs);
+		__rs_deltree(rs);
 		spin_unlock(&rgd->rd_rsspin);
 	}
 }
@@ -638,9 +638,11 @@ void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs)
  */
 void gfs2_rs_delete(struct gfs2_inode *ip)
 {
+	struct inode *inode = &ip->i_inode;
+
 	down_write(&ip->i_rw_mutex);
-	if (ip->i_res) {
-		gfs2_rs_deltree(ip, ip->i_res);
+	if (ip->i_res && atomic_read(&inode->i_writecount) <= 1) {
+		gfs2_rs_deltree(ip->i_res);
 		BUG_ON(ip->i_res->rs_free);
 		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
 		ip->i_res = NULL;
@@ -664,7 +666,7 @@ static void return_all_reservations(struct gfs2_rgrpd *rgd)
 	spin_lock(&rgd->rd_rsspin);
 	while ((n = rb_first(&rgd->rd_rstree))) {
 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
-		__rs_deltree(NULL, rs);
+		__rs_deltree(rs);
 	}
 	spin_unlock(&rgd->rd_rsspin);
 }
@@ -1181,12 +1183,9 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
 {
 	struct super_block *sb = sdp->sd_vfs;
-	struct block_device *bdev = sb->s_bdev;
-	const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize /
-					   bdev_logical_block_size(sb->s_bdev);
 	u64 blk;
 	sector_t start = 0;
-	sector_t nr_sects = 0;
+	sector_t nr_blks = 0;
 	int rv;
 	unsigned int x;
 	u32 trimmed = 0;
@@ -1206,35 +1205,34 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 		if (diff == 0)
 			continue;
 		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
-		blk *= sects_per_blk; /* convert to sectors */
 		while(diff) {
 			if (diff & 1) {
-				if (nr_sects == 0)
+				if (nr_blks == 0)
 					goto start_new_extent;
-				if ((start + nr_sects) != blk) {
-					if (nr_sects >= minlen) {
-						rv = blkdev_issue_discard(bdev,
-							start, nr_sects,
+				if ((start + nr_blks) != blk) {
+					if (nr_blks >= minlen) {
+						rv = sb_issue_discard(sb,
+							start, nr_blks,
 							GFP_NOFS, 0);
 						if (rv)
 							goto fail;
-						trimmed += nr_sects;
+						trimmed += nr_blks;
 					}
-					nr_sects = 0;
+					nr_blks = 0;
 start_new_extent:
 					start = blk;
 				}
-				nr_sects += sects_per_blk;
+				nr_blks++;
 			}
 			diff >>= 2;
-			blk += sects_per_blk;
+			blk++;
 		}
 	}
-	if (nr_sects >= minlen) {
-		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
+	if (nr_blks >= minlen) {
+		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
 		if (rv)
 			goto fail;
-		trimmed += nr_sects;
+		trimmed += nr_blks;
 	}
 	if (ptrimmed)
 		*ptrimmed = trimmed;
@@ -1257,7 +1255,7 @@ fail:
 
 int gfs2_fitrim(struct file *filp, void __user *argp)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
 	struct buffer_head *bh;
@@ -1290,13 +1288,15 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 	minlen = max_t(u64, r.minlen,
 		       q->limits.discard_granularity) >> bs_shift;
 
+	if (end <= start || minlen > sdp->sd_max_rg_data)
+		return -EINVAL;
+
 	rgd = gfs2_blk2rgrpd(sdp, start, 0);
-	rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0);
+	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
 
-	if (end <= start ||
-	    minlen > sdp->sd_max_rg_data ||
-	    start > rgd_end->rd_data0 + rgd_end->rd_data)
-		return -EINVAL;
+	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
+	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
+		return -EINVAL; /* start is beyond the end of the fs */
 
 	while (1) {
 
@@ -1323,7 +1323,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 			if (ret == 0) {
 				bh = rgd->rd_bits[0].bi_bh;
 				rgd->rd_flags |= GFS2_RGF_TRIMMED;
-				gfs2_trans_add_bh(rgd->rd_gl, bh, 1);
+				gfs2_trans_add_meta(rgd->rd_gl, bh);
 				gfs2_rgrp_out(rgd, bh->b_data);
 				gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data);
 				gfs2_trans_end(sdp);
@@ -1338,7 +1338,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 	}
 
 out:
-	r.len = trimmed << 9;
+	r.len = trimmed << bs_shift;
 	if (copy_to_user(argp, &r, sizeof(r)))
 		return -EFAULT;
 
@@ -1405,9 +1405,14 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
 	u32 extlen;
 	u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved;
 	int ret;
+	struct inode *inode = &ip->i_inode;
 
-	extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested);
-	extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
+	if (S_ISDIR(inode->i_mode))
+		extlen = 1;
+	else {
+		extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested);
+		extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
+	}
 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
 		return;
 
@@ -1878,7 +1883,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
 
 		/* Drop reservation, if we couldn't use reserved rgrp */
 		if (gfs2_rs_active(rs))
-			gfs2_rs_deltree(ip, rs);
+			gfs2_rs_deltree(rs);
 check_rgrp:
 		/* Check for unlinked inodes which can be reclaimed */
 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
@@ -1968,14 +1973,14 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
 
 	*n = 1;
 	block = gfs2_rbm_to_block(rbm);
-	gfs2_trans_add_bh(rbm->rgd->rd_gl, rbm->bi->bi_bh, 1);
+	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm->bi->bi_bh);
 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
 	block++;
 	while (*n < elen) {
 		ret = gfs2_rbm_from_block(&pos, block);
 		if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE)
 			break;
-		gfs2_trans_add_bh(pos.rgd->rd_gl, pos.bi->bi_bh, 1);
+		gfs2_trans_add_meta(pos.rgd->rd_gl, pos.bi->bi_bh);
 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
 		(*n)++;
 		block++;
@@ -2014,7 +2019,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 			       rbm.bi->bi_bh->b_data + rbm.bi->bi_offset,
 			       rbm.bi->bi_len);
 		}
-		gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.bi->bi_bh, 1);
+		gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.bi->bi_bh);
 		gfs2_setbit(&rbm, false, new_state);
 	}
 
@@ -2091,7 +2096,7 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip,
 			if (rs->rs_free && !ret)
 				goto out;
 		}
-		__rs_deltree(ip, rs);
+		__rs_deltree(rs);
 	}
 out:
 	spin_unlock(&rgd->rd_rsspin);
@@ -2157,7 +2162,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 		if (error == 0) {
 			struct gfs2_dinode *di =
 				(struct gfs2_dinode *)dibh->b_data;
-			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+			gfs2_trans_add_meta(ip->i_gl, dibh);
 			di->di_goal_meta = di->di_goal_data =
 				cpu_to_be64(ip->i_goal);
 			brelse(dibh);
@@ -2176,7 +2181,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 			*generation = rbm.rgd->rd_igeneration++;
 	}
 
-	gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh, 1);
+	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
 	gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data);
 
@@ -2184,13 +2189,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 	if (dinode)
 		gfs2_trans_add_unrevoke(sdp, block, 1);
 
-	/*
-	 * This needs reviewing to see why we cannot do the quota change
-	 * at this point in the dinode case.
-	 */
-	if (ndata)
-		gfs2_quota_change(ip, ndata, ip->i_inode.i_uid,
-				  ip->i_inode.i_gid);
+	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
 
 	rbm.rgd->rd_free_clone -= *nblocks;
 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
@@ -2223,7 +2222,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
 	rgd->rd_free += blen;
 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
-	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
 
@@ -2260,7 +2259,7 @@ void gfs2_unlink_di(struct inode *inode)
 	if (!rgd)
 		return;
 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
-	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
 	update_rgrp_lvb_unlinked(rgd, 1);
@@ -2281,7 +2280,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
 	rgd->rd_dinodes--;
 	rgd->rd_free++;
 
-	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
 	update_rgrp_lvb_unlinked(rgd, -1);
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 842185853f6..5b3f4a896e6 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -47,7 +47,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
 			     bool dinode, u64 *generation);
 
 extern int gfs2_rs_alloc(struct gfs2_inode *ip);
-extern void gfs2_rs_deltree(struct gfs2_inode *ip, struct gfs2_blkreserv *rs);
+extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
 extern void gfs2_rs_delete(struct gfs2_inode *ip);
 extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index d6488674d91..e5639dec66c 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -500,7 +500,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
 	if (error)
 		return;
 
-	gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+	gfs2_trans_add_meta(l_ip->i_gl, l_bh);
 
 	spin_lock(&sdp->sd_statfs_spin);
 	l_sc->sc_total += total;
@@ -528,7 +528,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
 	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
 	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
 
-	gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+	gfs2_trans_add_meta(l_ip->i_gl, l_bh);
 
 	spin_lock(&sdp->sd_statfs_spin);
 	m_sc->sc_total += l_sc->sc_total;
@@ -539,7 +539,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
 	       0, sizeof(struct gfs2_statfs_change));
 	spin_unlock(&sdp->sd_statfs_spin);
 
-	gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
+	gfs2_trans_add_meta(m_ip->i_gl, m_bh);
 	gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
 }
 
@@ -663,54 +663,6 @@ out:
 	return error;
 }
 
-/**
- * gfs2_freeze_fs - freezes the file system
- * @sdp: the file system
- *
- * This function flushes data and meta data for all machines by
- * acquiring the transaction log exclusively.  All journals are
- * ensured to be in a clean state as well.
- *
- * Returns: errno
- */
-
-int gfs2_freeze_fs(struct gfs2_sbd *sdp)
-{
-	int error = 0;
-
-	mutex_lock(&sdp->sd_freeze_lock);
-
-	if (!sdp->sd_freeze_count++) {
-		error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
-		if (error)
-			sdp->sd_freeze_count--;
-	}
-
-	mutex_unlock(&sdp->sd_freeze_lock);
-
-	return error;
-}
-
-/**
- * gfs2_unfreeze_fs - unfreezes the file system
- * @sdp: the file system
- *
- * This function allows the file system to proceed by unlocking
- * the exclusively held transaction lock.  Other GFS2 nodes are
- * now free to acquire the lock shared and go on with their lives.
- *
- */
-
-void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
-{
-	mutex_lock(&sdp->sd_freeze_lock);
-
-	if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
-		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
-
-	mutex_unlock(&sdp->sd_freeze_lock);
-}
-
 void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 {
 	struct gfs2_dinode *str = buf;
@@ -721,8 +673,8 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
 	str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
 	str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
-	str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
-	str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
+	str->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode));
+	str->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode));
 	str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
 	str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
 	str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -824,7 +776,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
 
 	ret = gfs2_meta_inode_buffer(ip, &bh);
 	if (ret == 0) {
-		gfs2_trans_add_bh(ip->i_gl, bh, 1);
+		gfs2_trans_add_meta(ip->i_gl, bh);
 		gfs2_dinode_out(ip, bh->b_data);
 		brelse(bh);
 	}
@@ -888,13 +840,6 @@ static void gfs2_put_super(struct super_block *sb)
 	int error;
 	struct gfs2_jdesc *jd;
 
-	/*  Unfreeze the filesystem, if we need to  */
-
-	mutex_lock(&sdp->sd_freeze_lock);
-	if (sdp->sd_freeze_count)
-		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
-	mutex_unlock(&sdp->sd_freeze_lock);
-
 	/* No more recovery requests */
 	set_bit(SDF_NORECOVERY, &sdp->sd_flags);
 	smp_mb();
@@ -985,7 +930,7 @@ static int gfs2_freeze(struct super_block *sb)
 		return -EINVAL;
 
 	for (;;) {
-		error = gfs2_freeze_fs(sdp);
+		error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
 		if (!error)
 			break;
 
@@ -1013,7 +958,9 @@ static int gfs2_freeze(struct super_block *sb)
 
 static int gfs2_unfreeze(struct super_block *sb)
 {
-	gfs2_unfreeze_fs(sb->s_fs_info);
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+
+	gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
 	return 0;
 }
 
@@ -1429,7 +1376,7 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
 	if (error)
 		return error;
 
-	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
 	if (error)
 		return error;
 
@@ -1497,6 +1444,7 @@ static void gfs2_evict_inode(struct inode *inode)
 	/* Must not read inode block until block type has been verified */
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
 	if (unlikely(error)) {
+		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 		goto out;
 	}
@@ -1565,10 +1513,12 @@ out_truncate:
 out_unlock:
 	/* Error path for case 1 */
 	if (gfs2_rs_active(ip->i_res))
-		gfs2_rs_deltree(ip, ip->i_res);
+		gfs2_rs_deltree(ip->i_res);
 
-	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
+	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
+		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 		gfs2_glock_dq(&ip->i_iopen_gh);
+	}
 	gfs2_holder_uninit(&ip->i_iopen_gh);
 	gfs2_glock_dq_uninit(&gh);
 	if (error && error != GLR_TRYFAILED && error != -EROFS)
@@ -1577,6 +1527,7 @@ out:
 	/* Case 3 starts here */
 	truncate_inode_pages(&inode->i_data, 0);
 	gfs2_rs_delete(ip);
+	gfs2_ordered_del_inode(ip);
 	clear_inode(inode);
 	gfs2_dir_hash_inval(ip);
 	ip->i_gl->gl_object = NULL;
@@ -1586,6 +1537,7 @@ out:
 	ip->i_gl = NULL;
 	if (ip->i_iopen_gh.gh_gl) {
 		ip->i_iopen_gh.gh_gl->gl_object = NULL;
+		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 	}
 }
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index a0464680af0..90e3322ffa1 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -46,9 +46,6 @@ extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
 			  struct buffer_head *l_bh);
 extern int gfs2_statfs_sync(struct super_block *sb, int type);
 
-extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
-extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
-
 extern struct file_system_type gfs2_fs_type;
 extern struct file_system_type gfs2meta_fs_type;
 extern const struct export_operations gfs2_export_ops;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 8056b7b7238..aa5c4804496 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -91,39 +91,37 @@ static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
 
 static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
 {
-	unsigned int count;
-
-	mutex_lock(&sdp->sd_freeze_lock);
-	count = sdp->sd_freeze_count;
-	mutex_unlock(&sdp->sd_freeze_lock);
+	struct super_block *sb = sdp->sd_vfs;
+	int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1;
 
-	return snprintf(buf, PAGE_SIZE, "%u\n", count);
+	return snprintf(buf, PAGE_SIZE, "%u\n", frozen);
 }
 
 static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 {
-	ssize_t ret = len;
-	int error = 0;
+	int error;
 	int n = simple_strtol(buf, NULL, 0);
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
+		return -EPERM;
 
 	switch (n) {
 	case 0:
-		gfs2_unfreeze_fs(sdp);
+		error = thaw_super(sdp->sd_vfs);
 		break;
 	case 1:
-		error = gfs2_freeze_fs(sdp);
+		error = freeze_super(sdp->sd_vfs);
 		break;
 	default:
-		ret = -EINVAL;
+		return -EINVAL;
 	}
 
-	if (error)
+	if (error) {
 		fs_warn(sdp, "freeze %d error %d", n, error);
+		return error;
+	}
 
-	return ret;
+	return len;
 }
 
 static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
@@ -135,7 +133,7 @@ static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
 static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 {
 	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
+		return -EPERM;
 
 	if (simple_strtol(buf, NULL, 0) != 1)
 		return -EINVAL;
@@ -150,7 +148,7 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
 				 size_t len)
 {
 	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
+		return -EPERM;
 
 	if (simple_strtol(buf, NULL, 0) != 1)
 		return -EINVAL;
@@ -163,7 +161,7 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
 				size_t len)
 {
 	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
+		return -EPERM;
 
 	if (simple_strtol(buf, NULL, 0) != 1)
 		return -EINVAL;
@@ -175,30 +173,40 @@ static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
 static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
 					size_t len)
 {
+	struct kqid qid;
 	int error;
 	u32 id;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
+		return -EPERM;
 
 	id = simple_strtoul(buf, NULL, 0);
 
-	error = gfs2_quota_refresh(sdp, 1, id);
+	qid = make_kqid(current_user_ns(), USRQUOTA, id);
+	if (!qid_valid(qid))
+		return -EINVAL;
+
+	error = gfs2_quota_refresh(sdp, qid);
 	return error ? error : len;
 }
 
 static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
 					 size_t len)
 {
+	struct kqid qid;
 	int error;
 	u32 id;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
+		return -EPERM;
 
 	id = simple_strtoul(buf, NULL, 0);
 
-	error = gfs2_quota_refresh(sdp, 0, id);
+	qid = make_kqid(current_user_ns(), GRPQUOTA, id);
+	if (!qid_valid(qid))
+		return -EINVAL;
+
+	error = gfs2_quota_refresh(sdp, qid);
 	return error ? error : len;
 }
 
@@ -213,7 +221,7 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
 	int rv;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
+		return -EPERM;
 
 	rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum,
 		    mode);
@@ -332,6 +340,28 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 	return ret;
 }
 
+static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf)
+{
+	int val = completion_done(&sdp->sd_wdack) ? 1 : 0;
+
+	return sprintf(buf, "%d\n", val);
+}
+
+static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	ssize_t ret = len;
+	int val;
+
+	val = simple_strtol(buf, NULL, 0);
+
+	if ((val == 1) &&
+	    !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
+		complete(&sdp->sd_wdack);
+	else
+		ret = -EINVAL;
+	return ret;
+}
+
 static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
 {
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -463,7 +493,7 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 
 GDLM_ATTR(proto_name,		0444, proto_name_show,		NULL);
 GDLM_ATTR(block,		0644, block_show,		block_store);
-GDLM_ATTR(withdraw,		0644, withdraw_show,		withdraw_store);
+GDLM_ATTR(withdraw,		0644, wdack_show,		wdack_store);
 GDLM_ATTR(jid,			0644, jid_show,			jid_store);
 GDLM_ATTR(first,		0644, lkfirst_show,		lkfirst_store);
 GDLM_ATTR(first_done,		0444, first_done_show,		NULL);
@@ -502,7 +532,7 @@ static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
 	unsigned int x, y;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
+		return -EPERM;
 
 	if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
 		return -EINVAL;
@@ -521,7 +551,7 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
 	unsigned int x;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
+		return -EPERM;
 
 	x = simple_strtoul(buf, NULL, 0);
 
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 2ee13e841e9..20c007d747a 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -159,9 +159,9 @@ TRACE_EVENT(gfs2_glock_put,
 /* Callback (local or remote) requesting lock demotion */
 TRACE_EVENT(gfs2_demote_rq,
 
-	TP_PROTO(const struct gfs2_glock *gl),
+	TP_PROTO(const struct gfs2_glock *gl, bool remote),
 
-	TP_ARGS(gl),
+	TP_ARGS(gl, remote),
 
 	TP_STRUCT__entry(
 		__field(        dev_t,  dev                     )
@@ -170,6 +170,7 @@ TRACE_EVENT(gfs2_demote_rq,
 		__field(	u8,	cur_state		)
 		__field(	u8,	dmt_state		)
 		__field(	unsigned long,	flags		)
+		__field(	bool,	remote			)
 	),
 
 	TP_fast_assign(
@@ -179,14 +180,16 @@ TRACE_EVENT(gfs2_demote_rq,
 		__entry->cur_state	= glock_trace_state(gl->gl_state);
 		__entry->dmt_state	= glock_trace_state(gl->gl_demote_state);
 		__entry->flags		= gl->gl_flags  | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0);
+		__entry->remote		= remote;
 	),
 
-	TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s",
+	TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s %s",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
 		  (unsigned long long)__entry->glnum,
                   glock_trace_name(__entry->cur_state),
                   glock_trace_name(__entry->dmt_state),
-		  show_glock_flags(__entry->flags))
+		  show_glock_flags(__entry->flags),
+		  __entry->remote ? "remote" : "local")
 
 );
 
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 413627072f3..2b20d7046bf 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -18,6 +18,7 @@
 #include "gfs2.h"
 #include "incore.h"
 #include "glock.h"
+#include "inode.h"
 #include "log.h"
 #include "lops.h"
 #include "meta_io.h"
@@ -95,7 +96,8 @@ static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
 
 static void gfs2_print_trans(const struct gfs2_trans *tr)
 {
-	print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
+	printk(KERN_WARNING "GFS2: Transaction created at: %pSR\n",
+	       (void *)tr->tr_ip);
 	printk(KERN_WARNING "GFS2: blocks=%u revokes=%u reserved=%u touched=%d\n",
 	       tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched);
 	printk(KERN_WARNING "GFS2: Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
@@ -134,52 +136,146 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 	if (tr->tr_t_gh.gh_gl) {
 		gfs2_glock_dq(&tr->tr_t_gh);
 		gfs2_holder_uninit(&tr->tr_t_gh);
-		kfree(tr);
+		if (!tr->tr_attached)
+			kfree(tr);
 	}
+	up_read(&sdp->sd_log_flush_lock);
 
 	if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
 		gfs2_log_flush(sdp, NULL);
 	sb_end_intwrite(sdp->sd_vfs);
 }
 
+static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
+					       struct buffer_head *bh,
+					       const struct gfs2_log_operations *lops)
+{
+	struct gfs2_bufdata *bd;
+
+	bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
+	bd->bd_bh = bh;
+	bd->bd_gl = gl;
+	bd->bd_ops = lops;
+	INIT_LIST_HEAD(&bd->bd_list);
+	bh->b_private = bd;
+	return bd;
+}
+
 /**
- * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
- * @gl: the glock the buffer belongs to
+ * gfs2_trans_add_data - Add a databuf to the transaction.
+ * @gl: The inode glock associated with the buffer
  * @bh: The buffer to add
- * @meta: True in the case of adding metadata
  *
+ * This is used in two distinct cases:
+ * i) In ordered write mode
+ *    We put the data buffer on a list so that we can ensure that its
+ *    synced to disk at the right time
+ * ii) In journaled data mode
+ *    We need to journal the data block in the same way as metadata in
+ *    the functions above. The difference is that here we have a tag
+ *    which is two __be64's being the block number (as per meta data)
+ *    and a flag which says whether the data block needs escaping or
+ *    not. This means we need a new log entry for each 251 or so data
+ *    blocks, which isn't an enormous overhead but twice as much as
+ *    for normal metadata blocks.
  */
+void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
+{
+	struct gfs2_trans *tr = current->journal_info;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct address_space *mapping = bh->b_page->mapping;
+	struct gfs2_inode *ip = GFS2_I(mapping->host);
+	struct gfs2_bufdata *bd;
 
-void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
+	if (!gfs2_is_jdata(ip)) {
+		gfs2_ordered_add_inode(ip);
+		return;
+	}
+
+	lock_buffer(bh);
+	gfs2_log_lock(sdp);
+	bd = bh->b_private;
+	if (bd == NULL) {
+		gfs2_log_unlock(sdp);
+		unlock_buffer(bh);
+		if (bh->b_private == NULL)
+			bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops);
+		lock_buffer(bh);
+		gfs2_log_lock(sdp);
+	}
+	gfs2_assert(sdp, bd->bd_gl == gl);
+	tr->tr_touched = 1;
+	if (list_empty(&bd->bd_list)) {
+		set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+		set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
+		gfs2_pin(sdp, bd->bd_bh);
+		tr->tr_num_databuf_new++;
+		sdp->sd_log_num_databuf++;
+		list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf);
+	}
+	gfs2_log_unlock(sdp);
+	unlock_buffer(bh);
+}
+
+static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
+{
+	struct gfs2_meta_header *mh;
+	struct gfs2_trans *tr;
+
+	tr = current->journal_info;
+	tr->tr_touched = 1;
+	if (!list_empty(&bd->bd_list))
+		return;
+	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
+	mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
+	if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
+		printk(KERN_ERR
+		       "Attempting to add uninitialised block to journal (inplace block=%lld)\n",
+		       (unsigned long long)bd->bd_bh->b_blocknr);
+		BUG();
+	}
+	gfs2_pin(sdp, bd->bd_bh);
+	mh->__pad0 = cpu_to_be64(0);
+	mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
+	sdp->sd_log_num_buf++;
+	list_add(&bd->bd_list, &sdp->sd_log_le_buf);
+	tr->tr_num_buf_new++;
+}
+
+void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
 {
+
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct gfs2_bufdata *bd;
 
 	lock_buffer(bh);
 	gfs2_log_lock(sdp);
 	bd = bh->b_private;
-	if (bd)
-		gfs2_assert(sdp, bd->bd_gl == gl);
-	else {
+	if (bd == NULL) {
 		gfs2_log_unlock(sdp);
 		unlock_buffer(bh);
-		gfs2_attach_bufdata(gl, bh, meta);
-		bd = bh->b_private;
+		lock_page(bh->b_page);
+		if (bh->b_private == NULL)
+			bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops);
+		unlock_page(bh->b_page);
 		lock_buffer(bh);
 		gfs2_log_lock(sdp);
 	}
-	lops_add(sdp, bd);
+	gfs2_assert(sdp, bd->bd_gl == gl);
+	meta_lo_add(sdp, bd);
 	gfs2_log_unlock(sdp);
 	unlock_buffer(bh);
 }
 
 void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 {
+	struct gfs2_trans *tr = current->journal_info;
+
 	BUG_ON(!list_empty(&bd->bd_list));
-	BUG_ON(!list_empty(&bd->bd_ail_st_list));
-	BUG_ON(!list_empty(&bd->bd_ail_gl_list));
-	lops_init_le(bd, &gfs2_revoke_lops);
-	lops_add(sdp, bd);
+	gfs2_add_revoke(sdp, bd);
+	tr->tr_touched = 1;
+	tr->tr_num_revoke++;
 }
 
 void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index bf2ae9aeee7..1e6e7da25a1 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -39,7 +39,8 @@ extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 			    unsigned int revokes);
 
 extern void gfs2_trans_end(struct gfs2_sbd *sdp);
-extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
+extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh);
+extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh);
 extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
 extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
 
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f00d7c5744f..6402fb69d71 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -54,6 +54,9 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
 
 		kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
 
+		if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
+			wait_for_completion(&sdp->sd_wdack);
+
 		if (lm->lm_unmount) {
 			fs_err(sdp, "telling LM to unmount\n");
 			lm->lm_unmount(sdp);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 76c144b3c9b..ecd37f30ab9 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -270,7 +270,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
 	if (error)
 		goto out_gunlock;
 
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	gfs2_trans_add_meta(ip->i_gl, bh);
 
 	dataptrs = GFS2_EA2DATAPTRS(ea);
 	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
@@ -309,7 +309,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
 		ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_trans_add_meta(ip->i_gl, dibh);
 		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
@@ -331,7 +331,7 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
 	if (error)
 		return error;
 
-	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
 	if (error)
 		goto out_alloc;
 
@@ -509,7 +509,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 		}
 
 		if (din) {
-			gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
+			gfs2_trans_add_meta(ip->i_gl, bh[x]);
 			memcpy(pos, din, cp_size);
 			din += sdp->sd_jbsize;
 		}
@@ -629,7 +629,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
 		return error;
 	gfs2_trans_add_unrevoke(sdp, block, 1);
 	*bhp = gfs2_meta_new(ip->i_gl, block);
-	gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
+	gfs2_trans_add_meta(ip->i_gl, *bhp);
 	gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
 	gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
 
@@ -691,7 +691,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 				return error;
 			gfs2_trans_add_unrevoke(sdp, block, 1);
 			bh = gfs2_meta_new(ip->i_gl, block);
-			gfs2_trans_add_bh(ip->i_gl, bh, 1);
+			gfs2_trans_add_meta(ip->i_gl, bh);
 			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
 
 			gfs2_add_inode_blocks(&ip->i_inode, 1);
@@ -751,7 +751,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
 		ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_trans_add_meta(ip->i_gl, dibh);
 		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
@@ -834,7 +834,7 @@ static void ea_set_remove_stuffed(struct gfs2_inode *ip,
 	struct gfs2_ea_header *prev = el->el_prev;
 	u32 len;
 
-	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+	gfs2_trans_add_meta(ip->i_gl, el->el_bh);
 
 	if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
 		ea->ea_type = GFS2_EATYPE_UNUSED;
@@ -872,7 +872,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
 	if (error)
 		return error;
 
-	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	gfs2_trans_add_meta(ip->i_gl, bh);
 
 	if (es->ea_split)
 		ea = ea_split_ea(ea);
@@ -886,7 +886,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
 	if (error)
 		goto out;
 	ip->i_inode.i_ctime = CURRENT_TIME;
-	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_trans_add_meta(ip->i_gl, dibh);
 	gfs2_dinode_out(ip, dibh->b_data);
 	brelse(dibh);
 out:
@@ -901,7 +901,7 @@ static int ea_set_simple_alloc(struct gfs2_inode *ip,
 	struct gfs2_ea_header *ea = es->es_ea;
 	int error;
 
-	gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
+	gfs2_trans_add_meta(ip->i_gl, es->es_bh);
 
 	if (es->ea_split)
 		ea = ea_split_ea(ea);
@@ -997,7 +997,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 			goto out;
 		}
 
-		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+		gfs2_trans_add_meta(ip->i_gl, indbh);
 	} else {
 		u64 blk;
 		unsigned int n = 1;
@@ -1006,7 +1006,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 			return error;
 		gfs2_trans_add_unrevoke(sdp, blk, 1);
 		indbh = gfs2_meta_new(ip->i_gl, blk);
-		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+		gfs2_trans_add_meta(ip->i_gl, indbh);
 		gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
 		gfs2_buffer_clear_tail(indbh, mh_size);
 
@@ -1092,7 +1092,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
 	if (error)
 		return error;
 
-	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+	gfs2_trans_add_meta(ip->i_gl, el->el_bh);
 
 	if (prev) {
 		u32 len;
@@ -1109,7 +1109,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
 		ip->i_inode.i_ctime = CURRENT_TIME;
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_trans_add_meta(ip->i_gl, dibh);
 		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
@@ -1265,7 +1265,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
 	if (GFS2_EA_IS_STUFFED(el.el_ea)) {
 		error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0);
 		if (error == 0) {
-			gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1);
+			gfs2_trans_add_meta(ip->i_gl, el.el_bh);
 			memcpy(GFS2_EA2DATA(el.el_ea), data,
 			       GFS2_EA_DATA_LEN(el.el_ea));
 		}
@@ -1352,7 +1352,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 	if (error)
 		goto out_gunlock;
 
-	gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+	gfs2_trans_add_meta(ip->i_gl, indbh);
 
 	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
 	bstart = 0;
@@ -1384,7 +1384,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_trans_add_meta(ip->i_gl, dibh);
 		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
@@ -1434,7 +1434,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
-		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_trans_add_meta(ip->i_gl, dibh);
 		gfs2_dinode_out(ip, dibh->b_data);
 		brelse(dibh);
 	}
@@ -1461,7 +1461,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
 	if (error)
 		return error;
 
-	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
 	if (error)
 		return error;
 
diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig
index b77c5bc20f8..998e3a6decf 100644
--- a/fs/hfs/Kconfig
+++ b/fs/hfs/Kconfig
@@ -1,6 +1,6 @@
 config HFS_FS
-	tristate "Apple Macintosh file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
+	tristate "Apple Macintosh file system support"
+	depends on BLOCK
 	select NLS
 	help
 	  If you say Y here, you will be able to mount Macintosh-formatted
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 571abe97b42..de69d8a24f6 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -22,7 +22,8 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 		return -ENOMEM;
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
-	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
+	hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
+		tree->cnid, __builtin_return_address(0));
 	mutex_lock(&tree->tree_lock);
 	return 0;
 }
@@ -31,7 +32,8 @@ void hfs_find_exit(struct hfs_find_data *fd)
 {
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
-	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
+	hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n",
+		fd->tree->cnid, __builtin_return_address(0));
 	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
@@ -135,8 +137,8 @@ int hfs_brec_find(struct hfs_find_data *fd)
 	return res;
 
 invalid:
-	printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n",
-		height, bnode->height, bnode->type, nidx, parent);
+	pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n",
+	       height, bnode->height, bnode->type, nidx, parent);
 	res = -EIO;
 release:
 	hfs_bnode_put(bnode);
diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c
index c6e97366e8a..28307bc9ec1 100644
--- a/fs/hfs/bitmap.c
+++ b/fs/hfs/bitmap.c
@@ -158,7 +158,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits)
 		}
 	}
 
-	dprint(DBG_BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits);
+	hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits);
 	HFS_SB(sb)->free_ablocks -= *num_bits;
 	hfs_bitmap_dirty(sb);
 out:
@@ -200,7 +200,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count)
 	if (!count)
 		return 0;
 
-	dprint(DBG_BITMAP, "clear_bits: %u,%u\n", start, count);
+	hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count);
 	/* are all of the bits in range? */
 	if ((start + count) > HFS_SB(sb)->fs_ablocks)
 		return -2;
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index cdb41a1f6a6..d3fa6bd9503 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -100,7 +100,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
 	struct hfs_btree *tree;
 	struct page *src_page, *dst_page;
 
-	dprint(DBG_BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
+	hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
 	if (!len)
 		return;
 	tree = src_node->tree;
@@ -120,7 +120,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 	struct page *page;
 	void *ptr;
 
-	dprint(DBG_BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
+	hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
 	if (!len)
 		return;
 	src += node->page_offset;
@@ -138,16 +138,16 @@ void hfs_bnode_dump(struct hfs_bnode *node)
 	__be32 cnid;
 	int i, off, key_off;
 
-	dprint(DBG_BNODE_MOD, "bnode: %d\n", node->this);
+	hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this);
 	hfs_bnode_read(node, &desc, 0, sizeof(desc));
-	dprint(DBG_BNODE_MOD, "%d, %d, %d, %d, %d\n",
+	hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n",
 		be32_to_cpu(desc.next), be32_to_cpu(desc.prev),
 		desc.type, desc.height, be16_to_cpu(desc.num_recs));
 
 	off = node->tree->node_size - 2;
 	for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) {
 		key_off = hfs_bnode_read_u16(node, off);
-		dprint(DBG_BNODE_MOD, " %d", key_off);
+		hfs_dbg_cont(BNODE_MOD, " %d", key_off);
 		if (i && node->type == HFS_NODE_INDEX) {
 			int tmp;
 
@@ -155,17 +155,18 @@ void hfs_bnode_dump(struct hfs_bnode *node)
 				tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1;
 			else
 				tmp = node->tree->max_key_len + 1;
-			dprint(DBG_BNODE_MOD, " (%d,%d", tmp, hfs_bnode_read_u8(node, key_off));
+			hfs_dbg_cont(BNODE_MOD, " (%d,%d",
+				     tmp, hfs_bnode_read_u8(node, key_off));
 			hfs_bnode_read(node, &cnid, key_off + tmp, 4);
-			dprint(DBG_BNODE_MOD, ",%d)", be32_to_cpu(cnid));
+			hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid));
 		} else if (i && node->type == HFS_NODE_LEAF) {
 			int tmp;
 
 			tmp = hfs_bnode_read_u8(node, key_off);
-			dprint(DBG_BNODE_MOD, " (%d)", tmp);
+			hfs_dbg_cont(BNODE_MOD, " (%d)", tmp);
 		}
 	}
-	dprint(DBG_BNODE_MOD, "\n");
+	hfs_dbg_cont(BNODE_MOD, "\n");
 }
 
 void hfs_bnode_unlink(struct hfs_bnode *node)
@@ -220,7 +221,7 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
 	struct hfs_bnode *node;
 
 	if (cnid >= tree->node_count) {
-		printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
+		pr_err("request for non-existent node %d in B*Tree\n", cnid);
 		return NULL;
 	}
 
@@ -243,7 +244,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	loff_t off;
 
 	if (cnid >= tree->node_count) {
-		printk(KERN_ERR "hfs: request for non-existent node %d in B*Tree\n", cnid);
+		pr_err("request for non-existent node %d in B*Tree\n", cnid);
 		return NULL;
 	}
 
@@ -257,8 +258,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	node->this = cnid;
 	set_bit(HFS_BNODE_NEW, &node->flags);
 	atomic_set(&node->refcnt, 1);
-	dprint(DBG_BNODE_REFS, "new_node(%d:%d): 1\n",
-	       node->tree->cnid, node->this);
+	hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n",
+		node->tree->cnid, node->this);
 	init_waitqueue_head(&node->lock_wq);
 	spin_lock(&tree->hash_lock);
 	node2 = hfs_bnode_findhash(tree, cnid);
@@ -301,7 +302,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node)
 {
 	struct hfs_bnode **p;
 
-	dprint(DBG_BNODE_REFS, "remove_node(%d:%d): %d\n",
+	hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n",
 		node->tree->cnid, node->this, atomic_read(&node->refcnt));
 	for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)];
 	     *p && *p != node; p = &(*p)->next_hash)
@@ -414,7 +415,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
 	spin_lock(&tree->hash_lock);
 	node = hfs_bnode_findhash(tree, num);
 	spin_unlock(&tree->hash_lock);
-	BUG_ON(node);
+	if (node) {
+		pr_crit("new node %u already hashed?\n", num);
+		WARN_ON(1);
+		return node;
+	}
 	node = __hfs_bnode_create(tree, num);
 	if (!node)
 		return ERR_PTR(-ENOMEM);
@@ -443,8 +448,9 @@ void hfs_bnode_get(struct hfs_bnode *node)
 {
 	if (node) {
 		atomic_inc(&node->refcnt);
-		dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n",
-		       node->tree->cnid, node->this, atomic_read(&node->refcnt));
+		hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n",
+			node->tree->cnid, node->this,
+			atomic_read(&node->refcnt));
 	}
 }
 
@@ -455,8 +461,9 @@ void hfs_bnode_put(struct hfs_bnode *node)
 		struct hfs_btree *tree = node->tree;
 		int i;
 
-		dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n",
-		       node->tree->cnid, node->this, atomic_read(&node->refcnt));
+		hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n",
+			node->tree->cnid, node->this,
+			atomic_read(&node->refcnt));
 		BUG_ON(!atomic_read(&node->refcnt));
 		if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
 			return;
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 92fb358ce82..9f4ee7f5202 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -47,15 +47,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
 		if (node->tree->attributes & HFS_TREE_BIGKEYS) {
 			retval = hfs_bnode_read_u16(node, recoff) + 2;
 			if (retval > node->tree->max_key_len + 2) {
-				printk(KERN_ERR "hfs: keylen %d too large\n",
-					retval);
+				pr_err("keylen %d too large\n", retval);
 				retval = 0;
 			}
 		} else {
 			retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
 			if (retval > node->tree->max_key_len + 1) {
-				printk(KERN_ERR "hfs: keylen %d too large\n",
-					retval);
+				pr_err("keylen %d too large\n", retval);
 				retval = 0;
 			}
 		}
@@ -94,7 +92,8 @@ again:
 	end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
 	end_off = hfs_bnode_read_u16(node, end_rec_off);
 	end_rec_off -= 2;
-	dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", rec, size, end_off, end_rec_off);
+	hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+		rec, size, end_off, end_rec_off);
 	if (size > end_rec_off - end_off) {
 		if (new_node)
 			panic("not enough room!\n");
@@ -190,7 +189,8 @@ again:
 		mark_inode_dirty(tree->inode);
 	}
 	hfs_bnode_dump(node);
-	dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n", fd->record, fd->keylength + fd->entrylength);
+	hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n",
+		fd->record, fd->keylength + fd->entrylength);
 	if (!--node->num_recs) {
 		hfs_bnode_unlink(node);
 		if (!node->parent)
@@ -240,7 +240,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 	if (IS_ERR(new_node))
 		return new_node;
 	hfs_bnode_get(node);
-	dprint(DBG_BNODE_MOD, "split_nodes: %d - %d - %d\n",
+	hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n",
 		node->this, new_node->this, node->next);
 	new_node->next = node->next;
 	new_node->prev = node->this;
@@ -374,7 +374,8 @@ again:
 		newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1;
 	else
 		fd->keylength = newkeylen = tree->max_key_len + 1;
-	dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n", rec, fd->keylength, newkeylen);
+	hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n",
+		rec, fd->keylength, newkeylen);
 
 	rec_off = tree->node_size - (rec + 2) * 2;
 	end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
@@ -385,7 +386,7 @@ again:
 		end_off = hfs_bnode_read_u16(parent, end_rec_off);
 		if (end_rec_off - end_off < diff) {
 
-			printk(KERN_DEBUG "hfs: splitting index node...\n");
+			printk(KERN_DEBUG "splitting index node...\n");
 			fd->bnode = parent;
 			new_node = hfs_bnode_split(fd);
 			if (IS_ERR(new_node))
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 1cbdeea1db4..1ab19e660e6 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -48,7 +48,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 				    mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz));
 		if (HFS_I(tree->inode)->alloc_blocks >
 					HFS_I(tree->inode)->first_blocks) {
-			printk(KERN_ERR "hfs: invalid btree extent records\n");
+			pr_err("invalid btree extent records\n");
 			unlock_new_inode(tree->inode);
 			goto free_inode;
 		}
@@ -60,8 +60,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 				    mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz));
 
 		if (!HFS_I(tree->inode)->first_blocks) {
-			printk(KERN_ERR "hfs: invalid btree extent records "
-								"(0 size).\n");
+			pr_err("invalid btree extent records (0 size)\n");
 			unlock_new_inode(tree->inode);
 			goto free_inode;
 		}
@@ -100,15 +99,15 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	switch (id) {
 	case HFS_EXT_CNID:
 		if (tree->max_key_len != HFS_MAX_EXT_KEYLEN) {
-			printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
-				tree->max_key_len);
+			pr_err("invalid extent max_key_len %d\n",
+			       tree->max_key_len);
 			goto fail_page;
 		}
 		break;
 	case HFS_CAT_CNID:
 		if (tree->max_key_len != HFS_MAX_CAT_KEYLEN) {
-			printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
-				tree->max_key_len);
+			pr_err("invalid catalog max_key_len %d\n",
+			       tree->max_key_len);
 			goto fail_page;
 		}
 		break;
@@ -146,8 +145,9 @@ void hfs_btree_close(struct hfs_btree *tree)
 		while ((node = tree->node_hash[i])) {
 			tree->node_hash[i] = node->next_hash;
 			if (atomic_read(&node->refcnt))
-				printk(KERN_ERR "hfs: node %d:%d still has %d user(s)!\n",
-					node->tree->cnid, node->this, atomic_read(&node->refcnt));
+				pr_err("node %d:%d still has %d user(s)!\n",
+				       node->tree->cnid, node->this,
+				       atomic_read(&node->refcnt));
 			hfs_bnode_free(node);
 			tree->node_hash_cnt--;
 		}
@@ -290,7 +290,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 		kunmap(*pagep);
 		nidx = node->next;
 		if (!nidx) {
-			printk(KERN_DEBUG "hfs: create new bmap node...\n");
+			printk(KERN_DEBUG "create new bmap node...\n");
 			next_node = hfs_bmap_new_bmap(node, idx);
 		} else
 			next_node = hfs_bnode_find(tree, nidx);
@@ -316,7 +316,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
 	u32 nidx;
 	u8 *data, byte, m;
 
-	dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this);
+	hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this);
 	tree = node->tree;
 	nidx = node->this;
 	node = hfs_bnode_find(tree, 0);
@@ -331,7 +331,8 @@ void hfs_bmap_free(struct hfs_bnode *node)
 		hfs_bnode_put(node);
 		if (!i) {
 			/* panic */;
-			printk(KERN_CRIT "hfs: unable to free bnode %u. bmap not found!\n", node->this);
+			pr_crit("unable to free bnode %u. bmap not found!\n",
+				node->this);
 			return;
 		}
 		node = hfs_bnode_find(tree, i);
@@ -339,7 +340,8 @@ void hfs_bmap_free(struct hfs_bnode *node)
 			return;
 		if (node->type != HFS_NODE_MAP) {
 			/* panic */;
-			printk(KERN_CRIT "hfs: invalid bmap found! (%u,%d)\n", node->this, node->type);
+			pr_crit("invalid bmap found! (%u,%d)\n",
+				node->this, node->type);
 			hfs_bnode_put(node);
 			return;
 		}
@@ -352,7 +354,8 @@ void hfs_bmap_free(struct hfs_bnode *node)
 	m = 1 << (~nidx & 7);
 	byte = data[off];
 	if (!(byte & m)) {
-		printk(KERN_CRIT "hfs: trying to free free bnode %u(%d)\n", node->this, node->type);
+		pr_crit("trying to free free bnode %u(%d)\n",
+			node->this, node->type);
 		kunmap(page);
 		hfs_bnode_put(node);
 		return;
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 424b0337f52..ff0316b925a 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -87,12 +87,15 @@ int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode *
 	int entry_size;
 	int err;
 
-	dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
+	hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n",
+		str->name, cnid, inode->i_nlink);
 	if (dir->i_size >= HFS_MAX_VALENCE)
 		return -ENOSPC;
 
 	sb = dir->i_sb;
-	hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+	err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+	if (err)
+		return err;
 
 	hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
 	entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
@@ -184,14 +187,14 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid,
 
 	type = rec.type;
 	if (type != HFS_CDR_THD && type != HFS_CDR_FTH) {
-		printk(KERN_ERR "hfs: found bad thread record in catalog\n");
+		pr_err("found bad thread record in catalog\n");
 		return -EIO;
 	}
 
 	fd->search_key->cat.ParID = rec.thread.ParID;
 	len = fd->search_key->cat.CName.len = rec.thread.CName.len;
 	if (len > HFS_NAMELEN) {
-		printk(KERN_ERR "hfs: bad catalog namelength\n");
+		pr_err("bad catalog namelength\n");
 		return -EIO;
 	}
 	memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len);
@@ -212,9 +215,11 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
 	struct list_head *pos;
 	int res, type;
 
-	dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
+	hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
 	sb = dir->i_sb;
-	hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+	res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+	if (res)
+		return res;
 
 	hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str);
 	res = hfs_brec_find(&fd);
@@ -278,10 +283,13 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
 	int entry_size, type;
 	int err;
 
-	dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
+	hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
+		cnid, src_dir->i_ino, src_name->name,
 		dst_dir->i_ino, dst_name->name);
 	sb = src_dir->i_sb;
-	hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd);
+	err = hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd);
+	if (err)
+		return err;
 	dst_fd = src_fd;
 
 	/* find the old dir entry and read the data */
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 422dde2ec0a..145566851e7 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -25,7 +25,9 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
 	struct inode *inode = NULL;
 	int res;
 
-	hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
+	res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
+	if (res)
+		return ERR_PTR(res);
 	hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
 	res = hfs_brec_read(&fd, &rec, sizeof(rec));
 	if (res) {
@@ -49,9 +51,9 @@ done:
 /*
  * hfs_readdir
  */
-static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	int len, err;
 	char strbuf[HFS_MAX_NAMELEN];
@@ -60,23 +62,24 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct hfs_readdir_data *rd;
 	u16 type;
 
-	if (filp->f_pos >= inode->i_size)
+	if (ctx->pos >= inode->i_size)
 		return 0;
 
-	hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+	err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+	if (err)
+		return err;
 	hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
 	err = hfs_brec_find(&fd);
 	if (err)
 		goto out;
 
-	switch ((u32)filp->f_pos) {
-	case 0:
+	if (ctx->pos == 0) {
 		/* This is completely artificial... */
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR))
+		if (!dir_emit_dot(file, ctx))
 			goto out;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
 		if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
 			err = -EIO;
 			goto out;
@@ -84,31 +87,29 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
 		if (entry.type != HFS_CDR_THD) {
-			printk(KERN_ERR "hfs: bad catalog folder thread\n");
+			pr_err("bad catalog folder thread\n");
 			err = -EIO;
 			goto out;
 		}
 		//if (fd.entrylength < HFS_MIN_THREAD_SZ) {
-		//	printk(KERN_ERR "hfs: truncated catalog thread\n");
+		//	pr_err("truncated catalog thread\n");
 		//	err = -EIO;
 		//	goto out;
 		//}
-		if (filldir(dirent, "..", 2, 1,
+		if (!dir_emit(ctx, "..", 2,
 			    be32_to_cpu(entry.thread.ParID), DT_DIR))
 			goto out;
-		filp->f_pos++;
-		/* fall through */
-	default:
-		if (filp->f_pos >= inode->i_size)
-			goto out;
-		err = hfs_brec_goto(&fd, filp->f_pos - 1);
-		if (err)
-			goto out;
+		ctx->pos = 2;
 	}
+	if (ctx->pos >= inode->i_size)
+		goto out;
+	err = hfs_brec_goto(&fd, ctx->pos - 1);
+	if (err)
+		goto out;
 
 	for (;;) {
 		if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) {
-			printk(KERN_ERR "hfs: walked past end of dir\n");
+			pr_err("walked past end of dir\n");
 			err = -EIO;
 			goto out;
 		}
@@ -123,43 +124,43 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName);
 		if (type == HFS_CDR_DIR) {
 			if (fd.entrylength < sizeof(struct hfs_cat_dir)) {
-				printk(KERN_ERR "hfs: small dir entry\n");
+				pr_err("small dir entry\n");
 				err = -EIO;
 				goto out;
 			}
-			if (filldir(dirent, strbuf, len, filp->f_pos,
+			if (!dir_emit(ctx, strbuf, len,
 				    be32_to_cpu(entry.dir.DirID), DT_DIR))
 				break;
 		} else if (type == HFS_CDR_FIL) {
 			if (fd.entrylength < sizeof(struct hfs_cat_file)) {
-				printk(KERN_ERR "hfs: small file entry\n");
+				pr_err("small file entry\n");
 				err = -EIO;
 				goto out;
 			}
-			if (filldir(dirent, strbuf, len, filp->f_pos,
+			if (!dir_emit(ctx, strbuf, len,
 				    be32_to_cpu(entry.file.FlNum), DT_REG))
 				break;
 		} else {
-			printk(KERN_ERR "hfs: bad catalog entry type %d\n", type);
+			pr_err("bad catalog entry type %d\n", type);
 			err = -EIO;
 			goto out;
 		}
-		filp->f_pos++;
-		if (filp->f_pos >= inode->i_size)
+		ctx->pos++;
+		if (ctx->pos >= inode->i_size)
 			goto out;
 		err = hfs_brec_goto(&fd, 1);
 		if (err)
 			goto out;
 	}
-	rd = filp->private_data;
+	rd = file->private_data;
 	if (!rd) {
 		rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL);
 		if (!rd) {
 			err = -ENOMEM;
 			goto out;
 		}
-		filp->private_data = rd;
-		rd->file = filp;
+		file->private_data = rd;
+		rd->file = file;
 		list_add(&rd->list, &HFS_I(inode)->open_dir_list);
 	}
 	memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key));
@@ -172,7 +173,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file)
 {
 	struct hfs_readdir_data *rd = file->private_data;
 	if (rd) {
+		mutex_lock(&inode->i_mutex);
 		list_del(&rd->list);
+		mutex_unlock(&inode->i_mutex);
 		kfree(rd);
 	}
 	return 0;
@@ -300,7 +303,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 const struct file_operations hfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= hfs_readdir,
+	.iterate	= hfs_readdir,
 	.llseek		= generic_file_llseek,
 	.release	= hfs_dir_release,
 };
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index a67955a0c36..e33a0d36a93 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -107,7 +107,7 @@ static u16 hfs_ext_lastblock(struct hfs_extent *ext)
 	return be16_to_cpu(ext->block) + be16_to_cpu(ext->count);
 }
 
-static void __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
+static int __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
 {
 	int res;
 
@@ -116,26 +116,31 @@ static void __hfs_ext_write_extent(struct inode *inode, struct hfs_find_data *fd
 	res = hfs_brec_find(fd);
 	if (HFS_I(inode)->flags & HFS_FLG_EXT_NEW) {
 		if (res != -ENOENT)
-			return;
+			return res;
 		hfs_brec_insert(fd, HFS_I(inode)->cached_extents, sizeof(hfs_extent_rec));
 		HFS_I(inode)->flags &= ~(HFS_FLG_EXT_DIRTY|HFS_FLG_EXT_NEW);
 	} else {
 		if (res)
-			return;
+			return res;
 		hfs_bnode_write(fd->bnode, HFS_I(inode)->cached_extents, fd->entryoffset, fd->entrylength);
 		HFS_I(inode)->flags &= ~HFS_FLG_EXT_DIRTY;
 	}
+	return 0;
 }
 
-void hfs_ext_write_extent(struct inode *inode)
+int hfs_ext_write_extent(struct inode *inode)
 {
 	struct hfs_find_data fd;
+	int res = 0;
 
 	if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) {
-		hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd);
-		__hfs_ext_write_extent(inode, &fd);
+		res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd);
+		if (res)
+			return res;
+		res = __hfs_ext_write_extent(inode, &fd);
 		hfs_find_exit(&fd);
 	}
+	return res;
 }
 
 static inline int __hfs_ext_read_extent(struct hfs_find_data *fd, struct hfs_extent *extent,
@@ -161,8 +166,11 @@ static inline int __hfs_ext_cache_extent(struct hfs_find_data *fd, struct inode
 {
 	int res;
 
-	if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY)
-		__hfs_ext_write_extent(inode, fd);
+	if (HFS_I(inode)->flags & HFS_FLG_EXT_DIRTY) {
+		res = __hfs_ext_write_extent(inode, fd);
+		if (res)
+			return res;
+	}
 
 	res = __hfs_ext_read_extent(fd, HFS_I(inode)->cached_extents, inode->i_ino,
 				    block, HFS_IS_RSRC(inode) ? HFS_FK_RSRC : HFS_FK_DATA);
@@ -185,9 +193,11 @@ static int hfs_ext_read_extent(struct inode *inode, u16 block)
 	    block < HFS_I(inode)->cached_start + HFS_I(inode)->cached_blocks)
 		return 0;
 
-	hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd);
-	res = __hfs_ext_cache_extent(&fd, inode, block);
-	hfs_find_exit(&fd);
+	res = hfs_find_init(HFS_SB(inode->i_sb)->ext_tree, &fd);
+	if (!res) {
+		res = __hfs_ext_cache_extent(&fd, inode, block);
+		hfs_find_exit(&fd);
+	}
 	return res;
 }
 
@@ -195,11 +205,12 @@ static void hfs_dump_extent(struct hfs_extent *extent)
 {
 	int i;
 
-	dprint(DBG_EXTENT, "   ");
+	hfs_dbg(EXTENT, "   ");
 	for (i = 0; i < 3; i++)
-		dprint(DBG_EXTENT, " %u:%u", be16_to_cpu(extent[i].block),
-				 be16_to_cpu(extent[i].count));
-	dprint(DBG_EXTENT, "\n");
+		hfs_dbg_cont(EXTENT, " %u:%u",
+			     be16_to_cpu(extent[i].block),
+			     be16_to_cpu(extent[i].count));
+	hfs_dbg_cont(EXTENT, "\n");
 }
 
 static int hfs_add_extent(struct hfs_extent *extent, u16 offset,
@@ -298,7 +309,9 @@ int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type)
 	if (total_blocks == blocks)
 		return 0;
 
-	hfs_find_init(HFS_SB(sb)->ext_tree, &fd);
+	res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd);
+	if (res)
+		return res;
 	do {
 		res = __hfs_ext_read_extent(&fd, extent, cnid, total_blocks, type);
 		if (res)
@@ -392,10 +405,10 @@ int hfs_extend_file(struct inode *inode)
 		goto out;
 	}
 
-	dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
+	hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
 	if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) {
 		if (!HFS_I(inode)->first_blocks) {
-			dprint(DBG_EXTENT, "first extents\n");
+			hfs_dbg(EXTENT, "first extents\n");
 			/* no extents yet */
 			HFS_I(inode)->first_extents[0].block = cpu_to_be16(start);
 			HFS_I(inode)->first_extents[0].count = cpu_to_be16(len);
@@ -437,8 +450,10 @@ out:
 	return res;
 
 insert_extent:
-	dprint(DBG_EXTENT, "insert new extent\n");
-	hfs_ext_write_extent(inode);
+	hfs_dbg(EXTENT, "insert new extent\n");
+	res = hfs_ext_write_extent(inode);
+	if (res)
+		goto out;
 
 	memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec));
 	HFS_I(inode)->cached_extents[0].block = cpu_to_be16(start);
@@ -460,13 +475,13 @@ void hfs_file_truncate(struct inode *inode)
 	u32 size;
 	int res;
 
-	dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino,
-	       (long long)HFS_I(inode)->phys_size, inode->i_size);
+	hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n",
+		inode->i_ino, (long long)HFS_I(inode)->phys_size,
+		inode->i_size);
 	if (inode->i_size > HFS_I(inode)->phys_size) {
 		struct address_space *mapping = inode->i_mapping;
 		void *fsdata;
 		struct page *page;
-		int res;
 
 		/* XXX: Can use generic_cont_expand? */
 		size = inode->i_size - 1;
@@ -488,7 +503,12 @@ void hfs_file_truncate(struct inode *inode)
 		goto out;
 
 	mutex_lock(&HFS_I(inode)->extents_lock);
-	hfs_find_init(HFS_SB(sb)->ext_tree, &fd);
+	res = hfs_find_init(HFS_SB(sb)->ext_tree, &fd);
+	if (res) {
+		mutex_unlock(&HFS_I(inode)->extents_lock);
+		/* XXX: We lack error handling of hfs_file_truncate() */
+		return;
+	}
 	while (1) {
 		if (alloc_cnt == HFS_I(inode)->first_blocks) {
 			hfs_free_extents(sb, HFS_I(inode)->first_extents,
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 693df9fe52b..0524cda47a6 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -9,6 +9,12 @@
 #ifndef _LINUX_HFS_FS_H
 #define _LINUX_HFS_FS_H
 
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/mutex.h>
@@ -34,8 +40,18 @@
 //#define DBG_MASK	(DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT)
 #define DBG_MASK	(0)
 
-#define dprint(flg, fmt, args...) \
-	if (flg & DBG_MASK) printk(fmt , ## args)
+#define hfs_dbg(flg, fmt, ...)					\
+do {								\
+	if (DBG_##flg & DBG_MASK)				\
+		printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__);	\
+} while (0)
+
+#define hfs_dbg_cont(flg, fmt, ...)				\
+do {								\
+	if (DBG_##flg & DBG_MASK)				\
+		pr_cont(fmt, ##__VA_ARGS__);			\
+} while (0)
+
 
 /*
  * struct hfs_inode_info
@@ -174,7 +190,7 @@ extern const struct inode_operations hfs_dir_inode_operations;
 /* extent.c */
 extern int hfs_ext_keycmp(const btree_key *, const btree_key *);
 extern int hfs_free_fork(struct super_block *, struct hfs_cat_file *, int);
-extern void hfs_ext_write_extent(struct inode *);
+extern int hfs_ext_write_extent(struct inode *);
 extern int hfs_extend_file(struct inode *);
 extern void hfs_file_truncate(struct inode *);
 
@@ -213,13 +229,10 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 /* string.c */
 extern const struct dentry_operations hfs_dentry_operations;
 
-extern int hfs_hash_dentry(const struct dentry *, const struct inode *,
-		struct qstr *);
+extern int hfs_hash_dentry(const struct dentry *, struct qstr *);
 extern int hfs_strcmp(const unsigned char *, unsigned int,
 		      const unsigned char *, unsigned int);
-extern int hfs_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+extern int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 
 /* trans.c */
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index d47f11658c1..f9299d8a64e 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -14,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/sched.h>
+#include <linux/aio.h>
 
 #include "hfs_fs.h"
 #include "btree.h"
@@ -128,7 +129,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
+	struct inode *inode = file_inode(file)->i_mapping->host;
 	ssize_t ret;
 
 	ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
@@ -237,7 +238,7 @@ void hfs_delete_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 
-	dprint(DBG_INODE, "delete_inode: %lu\n", inode->i_ino);
+	hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino);
 	if (S_ISDIR(inode->i_mode)) {
 		HFS_SB(sb)->folder_count--;
 		if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID))
@@ -416,9 +417,12 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	struct inode *main_inode = inode;
 	struct hfs_find_data fd;
 	hfs_cat_rec rec;
+	int res;
 
-	dprint(DBG_INODE, "hfs_write_inode: %lu\n", inode->i_ino);
-	hfs_ext_write_extent(inode);
+	hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino);
+	res = hfs_ext_write_extent(inode);
+	if (res)
+		return res;
 
 	if (inode->i_ino < HFS_FIRSTUSER_CNID) {
 		switch (inode->i_ino) {
@@ -515,7 +519,11 @@ static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
-	hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
+	res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
+	if (res) {
+		iput(inode);
+		return ERR_PTR(res);
+	}
 	fd.search_key->cat = HFS_I(dir)->cat_key;
 	res = hfs_brec_read(&fd, &rec, sizeof(rec));
 	if (!res) {
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index b7ec224910c..aa3f0d6d043 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -48,7 +48,7 @@ static int hfs_get_last_session(struct super_block *sb,
 			*start = (sector_t)te.cdte_addr.lba << 2;
 			return 0;
 		}
-		printk(KERN_ERR "hfs: invalid session number or type of track\n");
+		pr_err("invalid session number or type of track\n");
 		return -EINVAL;
 	}
 	ms_info.addr_format = CDROM_LBA;
@@ -101,7 +101,7 @@ int hfs_mdb_get(struct super_block *sb)
 
 	HFS_SB(sb)->alloc_blksz = size = be32_to_cpu(mdb->drAlBlkSiz);
 	if (!size || (size & (HFS_SECTOR_SIZE - 1))) {
-		printk(KERN_ERR "hfs: bad allocation block size %d\n", size);
+		pr_err("bad allocation block size %d\n", size);
 		goto out_bh;
 	}
 
@@ -118,7 +118,7 @@ int hfs_mdb_get(struct super_block *sb)
 		size >>= 1;
 	brelse(bh);
 	if (!sb_set_blocksize(sb, size)) {
-		printk(KERN_ERR "hfs: unable to set blocksize to %u\n", size);
+		pr_err("unable to set blocksize to %u\n", size);
 		goto out;
 	}
 
@@ -162,8 +162,8 @@ int hfs_mdb_get(struct super_block *sb)
 	}
 
 	if (!HFS_SB(sb)->alt_mdb) {
-		printk(KERN_WARNING "hfs: unable to locate alternate MDB\n");
-		printk(KERN_WARNING "hfs: continuing without an alternate MDB\n");
+		pr_warn("unable to locate alternate MDB\n");
+		pr_warn("continuing without an alternate MDB\n");
 	}
 
 	HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 1 : 0);
@@ -178,7 +178,7 @@ int hfs_mdb_get(struct super_block *sb)
 	while (size) {
 		bh = sb_bread(sb, off >> sb->s_blocksize_bits);
 		if (!bh) {
-			printk(KERN_ERR "hfs: unable to read volume bitmap\n");
+			pr_err("unable to read volume bitmap\n");
 			goto out;
 		}
 		off2 = off & (sb->s_blocksize - 1);
@@ -192,23 +192,22 @@ int hfs_mdb_get(struct super_block *sb)
 
 	HFS_SB(sb)->ext_tree = hfs_btree_open(sb, HFS_EXT_CNID, hfs_ext_keycmp);
 	if (!HFS_SB(sb)->ext_tree) {
-		printk(KERN_ERR "hfs: unable to open extent tree\n");
+		pr_err("unable to open extent tree\n");
 		goto out;
 	}
 	HFS_SB(sb)->cat_tree = hfs_btree_open(sb, HFS_CAT_CNID, hfs_cat_keycmp);
 	if (!HFS_SB(sb)->cat_tree) {
-		printk(KERN_ERR "hfs: unable to open catalog tree\n");
+		pr_err("unable to open catalog tree\n");
 		goto out;
 	}
 
 	attrib = mdb->drAtrb;
 	if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
-		printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, "
-			 "running fsck.hfs is recommended.  mounting read-only.\n");
+		pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended.  mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
 	}
 	if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) {
-		printk(KERN_WARNING "hfs: filesystem is marked locked, mounting read-only.\n");
+		pr_warn("filesystem is marked locked, mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
 	}
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -312,7 +311,7 @@ void hfs_mdb_commit(struct super_block *sb)
 		while (size) {
 			bh = sb_bread(sb, block);
 			if (!bh) {
-				printk(KERN_ERR "hfs: unable to read volume bitmap\n");
+				pr_err("unable to read volume bitmap\n");
 				break;
 			}
 			len = min((int)sb->s_blocksize - off, size);
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 495a976a3cc..85b610c3909 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -51,8 +51,7 @@ static unsigned char caseorder[256] = {
 /*
  * Hash a string to an integer in a case-independent way
  */
-int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *this)
+int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this)
 {
 	const unsigned char *name = this->name;
 	unsigned int hash, len = this->len;
@@ -93,8 +92,7 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
  * Test for equality of two strings in the HFS filename character ordering.
  * return 1 on failure and 0 on success
  */
-int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	const unsigned char *n1, *n2;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index e93ddaadfd1..2d2039e754c 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -117,12 +117,11 @@ static int hfs_remount(struct super_block *sb, int *flags, char *data)
 		return 0;
 	if (!(*flags & MS_RDONLY)) {
 		if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
-			printk(KERN_WARNING "hfs: filesystem was not cleanly unmounted, "
-			       "running fsck.hfs is recommended.  leaving read-only.\n");
+			pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended.  leaving read-only.\n");
 			sb->s_flags |= MS_RDONLY;
 			*flags |= MS_RDONLY;
 		} else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) {
-			printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
+			pr_warn("filesystem is marked locked, leaving read-only.\n");
 			sb->s_flags |= MS_RDONLY;
 			*flags |= MS_RDONLY;
 		}
@@ -253,29 +252,29 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
 		switch (token) {
 		case opt_uid:
 			if (match_int(&args[0], &tmp)) {
-				printk(KERN_ERR "hfs: uid requires an argument\n");
+				pr_err("uid requires an argument\n");
 				return 0;
 			}
 			hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp);
 			if (!uid_valid(hsb->s_uid)) {
-				printk(KERN_ERR "hfs: invalid uid %d\n", tmp);
+				pr_err("invalid uid %d\n", tmp);
 				return 0;
 			}
 			break;
 		case opt_gid:
 			if (match_int(&args[0], &tmp)) {
-				printk(KERN_ERR "hfs: gid requires an argument\n");
+				pr_err("gid requires an argument\n");
 				return 0;
 			}
 			hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp);
 			if (!gid_valid(hsb->s_gid)) {
-				printk(KERN_ERR "hfs: invalid gid %d\n", tmp);
+				pr_err("invalid gid %d\n", tmp);
 				return 0;
 			}
 			break;
 		case opt_umask:
 			if (match_octal(&args[0], &tmp)) {
-				printk(KERN_ERR "hfs: umask requires a value\n");
+				pr_err("umask requires a value\n");
 				return 0;
 			}
 			hsb->s_file_umask = (umode_t)tmp;
@@ -283,39 +282,39 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
 			break;
 		case opt_file_umask:
 			if (match_octal(&args[0], &tmp)) {
-				printk(KERN_ERR "hfs: file_umask requires a value\n");
+				pr_err("file_umask requires a value\n");
 				return 0;
 			}
 			hsb->s_file_umask = (umode_t)tmp;
 			break;
 		case opt_dir_umask:
 			if (match_octal(&args[0], &tmp)) {
-				printk(KERN_ERR "hfs: dir_umask requires a value\n");
+				pr_err("dir_umask requires a value\n");
 				return 0;
 			}
 			hsb->s_dir_umask = (umode_t)tmp;
 			break;
 		case opt_part:
 			if (match_int(&args[0], &hsb->part)) {
-				printk(KERN_ERR "hfs: part requires an argument\n");
+				pr_err("part requires an argument\n");
 				return 0;
 			}
 			break;
 		case opt_session:
 			if (match_int(&args[0], &hsb->session)) {
-				printk(KERN_ERR "hfs: session requires an argument\n");
+				pr_err("session requires an argument\n");
 				return 0;
 			}
 			break;
 		case opt_type:
 			if (match_fourchar(&args[0], &hsb->s_type)) {
-				printk(KERN_ERR "hfs: type requires a 4 character value\n");
+				pr_err("type requires a 4 character value\n");
 				return 0;
 			}
 			break;
 		case opt_creator:
 			if (match_fourchar(&args[0], &hsb->s_creator)) {
-				printk(KERN_ERR "hfs: creator requires a 4 character value\n");
+				pr_err("creator requires a 4 character value\n");
 				return 0;
 			}
 			break;
@@ -324,14 +323,14 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
 			break;
 		case opt_codepage:
 			if (hsb->nls_disk) {
-				printk(KERN_ERR "hfs: unable to change codepage\n");
+				pr_err("unable to change codepage\n");
 				return 0;
 			}
 			p = match_strdup(&args[0]);
 			if (p)
 				hsb->nls_disk = load_nls(p);
 			if (!hsb->nls_disk) {
-				printk(KERN_ERR "hfs: unable to load codepage \"%s\"\n", p);
+				pr_err("unable to load codepage \"%s\"\n", p);
 				kfree(p);
 				return 0;
 			}
@@ -339,14 +338,14 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
 			break;
 		case opt_iocharset:
 			if (hsb->nls_io) {
-				printk(KERN_ERR "hfs: unable to change iocharset\n");
+				pr_err("unable to change iocharset\n");
 				return 0;
 			}
 			p = match_strdup(&args[0]);
 			if (p)
 				hsb->nls_io = load_nls(p);
 			if (!hsb->nls_io) {
-				printk(KERN_ERR "hfs: unable to load iocharset \"%s\"\n", p);
+				pr_err("unable to load iocharset \"%s\"\n", p);
 				kfree(p);
 				return 0;
 			}
@@ -360,7 +359,7 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
 	if (hsb->nls_disk && !hsb->nls_io) {
 		hsb->nls_io = load_nls_default();
 		if (!hsb->nls_io) {
-			printk(KERN_ERR "hfs: unable to load default iocharset\n");
+			pr_err("unable to load default iocharset\n");
 			return 0;
 		}
 	}
@@ -400,7 +399,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	res = -EINVAL;
 	if (!parse_options((char *)data, sbi)) {
-		printk(KERN_ERR "hfs: unable to parse mount options.\n");
+		pr_err("unable to parse mount options\n");
 		goto bail;
 	}
 
@@ -411,14 +410,16 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	res = hfs_mdb_get(sb);
 	if (res) {
 		if (!silent)
-			printk(KERN_WARNING "hfs: can't find a HFS filesystem on dev %s.\n",
+			pr_warn("can't find a HFS filesystem on dev %s\n",
 				hfs_mdb_name(sb));
 		res = -EINVAL;
 		goto bail;
 	}
 
 	/* try to get the root inode */
-	hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+	res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
+	if (res)
+		goto bail_no_root;
 	res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd);
 	if (!res) {
 		if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) {
@@ -447,7 +448,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 
 bail_no_root:
-	printk(KERN_ERR "hfs: get root inode failed.\n");
+	pr_err("get root inode failed\n");
 bail:
 	hfs_mdb_put(sb);
 	return res;
@@ -466,6 +467,7 @@ static struct file_system_type hfs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("hfs");
 
 static void hfs_init_once(void *p)
 {
diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile
index 3cc0df73015..09d278bb7b9 100644
--- a/fs/hfsplus/Makefile
+++ b/fs/hfsplus/Makefile
@@ -5,5 +5,5 @@
 obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o
 
 hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \
-		bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o
-
+		bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \
+		attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
new file mode 100644
index 00000000000..0f47890299c
--- /dev/null
+++ b/fs/hfsplus/attributes.c
@@ -0,0 +1,399 @@
+/*
+ * linux/fs/hfsplus/attributes.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handling of records in attributes tree
+ */
+
+#include "hfsplus_fs.h"
+#include "hfsplus_raw.h"
+
+static struct kmem_cache *hfsplus_attr_tree_cachep;
+
+int hfsplus_create_attr_tree_cache(void)
+{
+	if (hfsplus_attr_tree_cachep)
+		return -EEXIST;
+
+	hfsplus_attr_tree_cachep =
+		kmem_cache_create("hfsplus_attr_cache",
+			sizeof(hfsplus_attr_entry), 0,
+			SLAB_HWCACHE_ALIGN, NULL);
+	if (!hfsplus_attr_tree_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void hfsplus_destroy_attr_tree_cache(void)
+{
+	kmem_cache_destroy(hfsplus_attr_tree_cachep);
+}
+
+int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1,
+				const hfsplus_btree_key *k2)
+{
+	__be32 k1_cnid, k2_cnid;
+
+	k1_cnid = k1->attr.cnid;
+	k2_cnid = k2->attr.cnid;
+	if (k1_cnid != k2_cnid)
+		return be32_to_cpu(k1_cnid) < be32_to_cpu(k2_cnid) ? -1 : 1;
+
+	return hfsplus_strcmp(
+			(const struct hfsplus_unistr *)&k1->attr.key_name,
+			(const struct hfsplus_unistr *)&k2->attr.key_name);
+}
+
+int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key,
+			u32 cnid, const char *name)
+{
+	int len;
+
+	memset(key, 0, sizeof(struct hfsplus_attr_key));
+	key->attr.cnid = cpu_to_be32(cnid);
+	if (name) {
+		len = strlen(name);
+		if (len > HFSPLUS_ATTR_MAX_STRLEN) {
+			pr_err("invalid xattr name's length\n");
+			return -EINVAL;
+		}
+		hfsplus_asc2uni(sb,
+				(struct hfsplus_unistr *)&key->attr.key_name,
+				HFSPLUS_ATTR_MAX_STRLEN, name, len);
+		len = be16_to_cpu(key->attr.key_name.length);
+	} else {
+		key->attr.key_name.length = 0;
+		len = 0;
+	}
+
+	/* The length of the key, as stored in key_len field, does not include
+	 * the size of the key_len field itself.
+	 * So, offsetof(hfsplus_attr_key, key_name) is a trick because
+	 * it takes into consideration key_len field (__be16) of
+	 * hfsplus_attr_key structure instead of length field (__be16) of
+	 * hfsplus_attr_unistr structure.
+	 */
+	key->key_len =
+		cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) +
+				2 * len);
+
+	return 0;
+}
+
+void hfsplus_attr_build_key_uni(hfsplus_btree_key *key,
+					u32 cnid,
+					struct hfsplus_attr_unistr *name)
+{
+	int ustrlen;
+
+	memset(key, 0, sizeof(struct hfsplus_attr_key));
+	ustrlen = be16_to_cpu(name->length);
+	key->attr.cnid = cpu_to_be32(cnid);
+	key->attr.key_name.length = cpu_to_be16(ustrlen);
+	ustrlen *= 2;
+	memcpy(key->attr.key_name.unicode, name->unicode, ustrlen);
+
+	/* The length of the key, as stored in key_len field, does not include
+	 * the size of the key_len field itself.
+	 * So, offsetof(hfsplus_attr_key, key_name) is a trick because
+	 * it takes into consideration key_len field (__be16) of
+	 * hfsplus_attr_key structure instead of length field (__be16) of
+	 * hfsplus_attr_unistr structure.
+	 */
+	key->key_len =
+		cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) +
+				ustrlen);
+}
+
+hfsplus_attr_entry *hfsplus_alloc_attr_entry(void)
+{
+	return kmem_cache_alloc(hfsplus_attr_tree_cachep, GFP_KERNEL);
+}
+
+void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry)
+{
+	if (entry)
+		kmem_cache_free(hfsplus_attr_tree_cachep, entry);
+}
+
+#define HFSPLUS_INVALID_ATTR_RECORD -1
+
+static int hfsplus_attr_build_record(hfsplus_attr_entry *entry, int record_type,
+				u32 cnid, const void *value, size_t size)
+{
+	if (record_type == HFSPLUS_ATTR_FORK_DATA) {
+		/*
+		 * Mac OS X supports only inline data attributes.
+		 * Do nothing
+		 */
+		memset(entry, 0, sizeof(*entry));
+		return sizeof(struct hfsplus_attr_fork_data);
+	} else if (record_type == HFSPLUS_ATTR_EXTENTS) {
+		/*
+		 * Mac OS X supports only inline data attributes.
+		 * Do nothing.
+		 */
+		memset(entry, 0, sizeof(*entry));
+		return sizeof(struct hfsplus_attr_extents);
+	} else if (record_type == HFSPLUS_ATTR_INLINE_DATA) {
+		u16 len;
+
+		memset(entry, 0, sizeof(struct hfsplus_attr_inline_data));
+		entry->inline_data.record_type = cpu_to_be32(record_type);
+		if (size <= HFSPLUS_MAX_INLINE_DATA_SIZE)
+			len = size;
+		else
+			return HFSPLUS_INVALID_ATTR_RECORD;
+		entry->inline_data.length = cpu_to_be16(len);
+		memcpy(entry->inline_data.raw_bytes, value, len);
+		/*
+		 * Align len on two-byte boundary.
+		 * It needs to add pad byte if we have odd len.
+		 */
+		len = round_up(len, 2);
+		return offsetof(struct hfsplus_attr_inline_data, raw_bytes) +
+					len;
+	} else /* invalid input */
+		memset(entry, 0, sizeof(*entry));
+
+	return HFSPLUS_INVALID_ATTR_RECORD;
+}
+
+int hfsplus_find_attr(struct super_block *sb, u32 cnid,
+			const char *name, struct hfs_find_data *fd)
+{
+	int err = 0;
+
+	hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid);
+
+	if (!HFSPLUS_SB(sb)->attr_tree) {
+		pr_err("attributes file doesn't exist\n");
+		return -EINVAL;
+	}
+
+	if (name) {
+		err = hfsplus_attr_build_key(sb, fd->search_key, cnid, name);
+		if (err)
+			goto failed_find_attr;
+		err = hfs_brec_find(fd, hfs_find_rec_by_key);
+		if (err)
+			goto failed_find_attr;
+	} else {
+		err = hfsplus_attr_build_key(sb, fd->search_key, cnid, NULL);
+		if (err)
+			goto failed_find_attr;
+		err = hfs_brec_find(fd, hfs_find_1st_rec_by_cnid);
+		if (err)
+			goto failed_find_attr;
+	}
+
+failed_find_attr:
+	return err;
+}
+
+int hfsplus_attr_exists(struct inode *inode, const char *name)
+{
+	int err = 0;
+	struct super_block *sb = inode->i_sb;
+	struct hfs_find_data fd;
+
+	if (!HFSPLUS_SB(sb)->attr_tree)
+		return 0;
+
+	err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd);
+	if (err)
+		return 0;
+
+	err = hfsplus_find_attr(sb, inode->i_ino, name, &fd);
+	if (err)
+		goto attr_not_found;
+
+	hfs_find_exit(&fd);
+	return 1;
+
+attr_not_found:
+	hfs_find_exit(&fd);
+	return 0;
+}
+
+int hfsplus_create_attr(struct inode *inode,
+				const char *name,
+				const void *value, size_t size)
+{
+	struct super_block *sb = inode->i_sb;
+	struct hfs_find_data fd;
+	hfsplus_attr_entry *entry_ptr;
+	int entry_size;
+	int err;
+
+	hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n",
+		name ? name : NULL, inode->i_ino);
+
+	if (!HFSPLUS_SB(sb)->attr_tree) {
+		pr_err("attributes file doesn't exist\n");
+		return -EINVAL;
+	}
+
+	entry_ptr = hfsplus_alloc_attr_entry();
+	if (!entry_ptr)
+		return -ENOMEM;
+
+	err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd);
+	if (err)
+		goto failed_init_create_attr;
+
+	if (name) {
+		err = hfsplus_attr_build_key(sb, fd.search_key,
+						inode->i_ino, name);
+		if (err)
+			goto failed_create_attr;
+	} else {
+		err = -EINVAL;
+		goto failed_create_attr;
+	}
+
+	/* Mac OS X supports only inline data attributes. */
+	entry_size = hfsplus_attr_build_record(entry_ptr,
+					HFSPLUS_ATTR_INLINE_DATA,
+					inode->i_ino,
+					value, size);
+	if (entry_size == HFSPLUS_INVALID_ATTR_RECORD) {
+		err = -EINVAL;
+		goto failed_create_attr;
+	}
+
+	err = hfs_brec_find(&fd, hfs_find_rec_by_key);
+	if (err != -ENOENT) {
+		if (!err)
+			err = -EEXIST;
+		goto failed_create_attr;
+	}
+
+	err = hfs_brec_insert(&fd, entry_ptr, entry_size);
+	if (err)
+		goto failed_create_attr;
+
+	hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY);
+
+failed_create_attr:
+	hfs_find_exit(&fd);
+
+failed_init_create_attr:
+	hfsplus_destroy_attr_entry(entry_ptr);
+	return err;
+}
+
+static int __hfsplus_delete_attr(struct inode *inode, u32 cnid,
+					struct hfs_find_data *fd)
+{
+	int err = 0;
+	__be32 found_cnid, record_type;
+
+	hfs_bnode_read(fd->bnode, &found_cnid,
+			fd->keyoffset +
+			offsetof(struct hfsplus_attr_key, cnid),
+			sizeof(__be32));
+	if (cnid != be32_to_cpu(found_cnid))
+		return -ENOENT;
+
+	hfs_bnode_read(fd->bnode, &record_type,
+			fd->entryoffset, sizeof(record_type));
+
+	switch (be32_to_cpu(record_type)) {
+	case HFSPLUS_ATTR_INLINE_DATA:
+		/* All is OK. Do nothing. */
+		break;
+	case HFSPLUS_ATTR_FORK_DATA:
+	case HFSPLUS_ATTR_EXTENTS:
+		pr_err("only inline data xattr are supported\n");
+		return -EOPNOTSUPP;
+	default:
+		pr_err("invalid extended attribute record\n");
+		return -ENOENT;
+	}
+
+	err = hfs_brec_remove(fd);
+	if (err)
+		return err;
+
+	hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ATTR_DIRTY);
+	return err;
+}
+
+int hfsplus_delete_attr(struct inode *inode, const char *name)
+{
+	int err = 0;
+	struct super_block *sb = inode->i_sb;
+	struct hfs_find_data fd;
+
+	hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n",
+		name ? name : NULL, inode->i_ino);
+
+	if (!HFSPLUS_SB(sb)->attr_tree) {
+		pr_err("attributes file doesn't exist\n");
+		return -EINVAL;
+	}
+
+	err = hfs_find_init(HFSPLUS_SB(sb)->attr_tree, &fd);
+	if (err)
+		return err;
+
+	if (name) {
+		err = hfsplus_attr_build_key(sb, fd.search_key,
+						inode->i_ino, name);
+		if (err)
+			goto out;
+	} else {
+		pr_err("invalid extended attribute name\n");
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = hfs_brec_find(&fd, hfs_find_rec_by_key);
+	if (err)
+		goto out;
+
+	err = __hfsplus_delete_attr(inode, inode->i_ino, &fd);
+	if (err)
+		goto out;
+
+out:
+	hfs_find_exit(&fd);
+	return err;
+}
+
+int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid)
+{
+	int err = 0;
+	struct hfs_find_data fd;
+
+	hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid);
+
+	if (!HFSPLUS_SB(dir->i_sb)->attr_tree) {
+		pr_err("attributes file doesn't exist\n");
+		return -EINVAL;
+	}
+
+	err = hfs_find_init(HFSPLUS_SB(dir->i_sb)->attr_tree, &fd);
+	if (err)
+		return err;
+
+	for (;;) {
+		err = hfsplus_find_attr(dir->i_sb, cnid, NULL, &fd);
+		if (err) {
+			if (err != -ENOENT)
+				pr_err("xattr search failed\n");
+			goto end_delete_all;
+		}
+
+		err = __hfsplus_delete_attr(dir, cnid, &fd);
+		if (err)
+			goto end_delete_all;
+	}
+
+end_delete_all:
+	hfs_find_exit(&fd);
+	return err;
+}
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 5d799c13205..c1422d91cd3 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -22,9 +22,21 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
 		return -ENOMEM;
 	fd->search_key = ptr;
 	fd->key = ptr + tree->max_key_len + 2;
-	dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n",
+	hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
 		tree->cnid, __builtin_return_address(0));
-	mutex_lock(&tree->tree_lock);
+	switch (tree->cnid) {
+	case HFSPLUS_CAT_CNID:
+		mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX);
+		break;
+	case HFSPLUS_EXT_CNID:
+		mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX);
+		break;
+	case HFSPLUS_ATTR_CNID:
+		mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX);
+		break;
+	default:
+		BUG();
+	}
 	return 0;
 }
 
@@ -32,21 +44,83 @@ void hfs_find_exit(struct hfs_find_data *fd)
 {
 	hfs_bnode_put(fd->bnode);
 	kfree(fd->search_key);
-	dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n",
+	hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n",
 		fd->tree->cnid, __builtin_return_address(0));
 	mutex_unlock(&fd->tree->tree_lock);
 	fd->tree = NULL;
 }
 
-/* Find the record in bnode that best matches key (not greater than...)*/
-int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
+int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode,
+				struct hfs_find_data *fd,
+				int *begin,
+				int *end,
+				int *cur_rec)
+{
+	__be32 cur_cnid;
+	__be32 search_cnid;
+
+	if (bnode->tree->cnid == HFSPLUS_EXT_CNID) {
+		cur_cnid = fd->key->ext.cnid;
+		search_cnid = fd->search_key->ext.cnid;
+	} else if (bnode->tree->cnid == HFSPLUS_CAT_CNID) {
+		cur_cnid = fd->key->cat.parent;
+		search_cnid = fd->search_key->cat.parent;
+	} else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) {
+		cur_cnid = fd->key->attr.cnid;
+		search_cnid = fd->search_key->attr.cnid;
+	} else {
+		cur_cnid = 0;	/* used-uninitialized warning */
+		search_cnid = 0;
+		BUG();
+	}
+
+	if (cur_cnid == search_cnid) {
+		(*end) = (*cur_rec);
+		if ((*begin) == (*end))
+			return 1;
+	} else {
+		if (be32_to_cpu(cur_cnid) < be32_to_cpu(search_cnid))
+			(*begin) = (*cur_rec) + 1;
+		else
+			(*end) = (*cur_rec) - 1;
+	}
+
+	return 0;
+}
+
+int hfs_find_rec_by_key(struct hfs_bnode *bnode,
+				struct hfs_find_data *fd,
+				int *begin,
+				int *end,
+				int *cur_rec)
 {
 	int cmpval;
+
+	cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
+	if (!cmpval) {
+		(*end) = (*cur_rec);
+		return 1;
+	}
+	if (cmpval < 0)
+		(*begin) = (*cur_rec) + 1;
+	else
+		*(end) = (*cur_rec) - 1;
+
+	return 0;
+}
+
+/* Find the record in bnode that best matches key (not greater than...)*/
+int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd,
+					search_strategy_t rec_found)
+{
 	u16 off, len, keylen;
 	int rec;
 	int b, e;
 	int res;
 
+	if (!rec_found)
+		BUG();
+
 	b = 0;
 	e = bnode->num_recs - 1;
 	res = -ENOENT;
@@ -59,17 +133,12 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 			goto fail;
 		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
-		cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
-		if (!cmpval) {
-			e = rec;
+		if (rec_found(bnode, fd, &b, &e, &rec)) {
 			res = 0;
 			goto done;
 		}
-		if (cmpval < 0)
-			b = rec + 1;
-		else
-			e = rec - 1;
 	} while (b <= e);
+
 	if (rec != e && e >= 0) {
 		len = hfs_brec_lenoff(bnode, e, &off);
 		keylen = hfs_brec_keylen(bnode, e);
@@ -79,19 +148,21 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
 		}
 		hfs_bnode_read(bnode, fd->key, off, keylen);
 	}
+
 done:
 	fd->record = e;
 	fd->keyoffset = off;
 	fd->keylength = keylen;
 	fd->entryoffset = off + keylen;
 	fd->entrylength = len - keylen;
+
 fail:
 	return res;
 }
 
 /* Traverse a B*Tree from the root to a leaf finding best fit to key */
 /* Return allocated copy of node found, set recnum to best record */
-int hfs_brec_find(struct hfs_find_data *fd)
+int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare)
 {
 	struct hfs_btree *tree;
 	struct hfs_bnode *bnode;
@@ -122,7 +193,7 @@ int hfs_brec_find(struct hfs_find_data *fd)
 			goto invalid;
 		bnode->parent = parent;
 
-		res = __hfs_brec_find(bnode, fd);
+		res = __hfs_brec_find(bnode, fd, do_key_compare);
 		if (!height)
 			break;
 		if (fd->record < 0)
@@ -137,7 +208,7 @@ int hfs_brec_find(struct hfs_find_data *fd)
 	return res;
 
 invalid:
-	printk(KERN_ERR "hfs: inconsistency in B*Tree (%d,%d,%d,%u,%u)\n",
+	pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n",
 		height, bnode->height, bnode->type, nidx, parent);
 	res = -EIO;
 release:
@@ -149,7 +220,7 @@ int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len)
 {
 	int res;
 
-	res = hfs_brec_find(fd);
+	res = hfs_brec_find(fd, hfs_find_rec_by_key);
 	if (res)
 		return res;
 	if (fd->entrylength > rec_len)
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index 6feefc0cb48..d2954451519 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -30,7 +30,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
 	if (!len)
 		return size;
 
-	dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
+	hfs_dbg(BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
 	mutex_lock(&sbi->alloc_mutex);
 	mapping = sbi->alloc_file->i_mapping;
 	page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
@@ -89,14 +89,14 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
 		else
 			end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 1)) / 32;
 	}
-	dprint(DBG_BITMAP, "bitmap full\n");
+	hfs_dbg(BITMAP, "bitmap full\n");
 	start = size;
 	goto out;
 
 found:
 	start = offset + (curr - pptr) * 32 + i;
 	if (start >= size) {
-		dprint(DBG_BITMAP, "bitmap full\n");
+		hfs_dbg(BITMAP, "bitmap full\n");
 		goto out;
 	}
 	/* do any partial u32 at the start */
@@ -154,7 +154,7 @@ done:
 	*max = offset + (curr - pptr) * 32 + i - start;
 	sbi->free_blocks -= *max;
 	hfsplus_mark_mdb_dirty(sb);
-	dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
+	hfs_dbg(BITMAP, "-> %u,%u\n", start, *max);
 out:
 	mutex_unlock(&sbi->alloc_mutex);
 	return start;
@@ -173,7 +173,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 	if (!count)
 		return 0;
 
-	dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
+	hfs_dbg(BITMAP, "block_free: %u,%u\n", offset, count);
 	/* are all of the bits in range? */
 	if ((offset + count) > sbi->total_blocks)
 		return -ENOENT;
@@ -238,8 +238,7 @@ out:
 	return 0;
 
 kaboom:
-	printk(KERN_CRIT "hfsplus: unable to mark blocks free: error %ld\n",
-			PTR_ERR(page));
+	pr_crit("unable to mark blocks free: error %ld\n", PTR_ERR(page));
 	mutex_unlock(&sbi->alloc_mutex);
 
 	return -EIO;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 1c42cc5b899..11c86020452 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -62,7 +62,8 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
 
 	tree = node->tree;
 	if (node->type == HFS_NODE_LEAF ||
-	    tree->attributes & HFS_TREE_VARIDXKEYS)
+	    tree->attributes & HFS_TREE_VARIDXKEYS ||
+	    node->tree->cnid == HFSPLUS_ATTR_CNID)
 		key_len = hfs_bnode_read_u16(node, off) + 2;
 	else
 		key_len = tree->max_key_len + 2;
@@ -129,7 +130,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
 	struct page **src_page, **dst_page;
 	int l;
 
-	dprint(DBG_BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
+	hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
 	if (!len)
 		return;
 	tree = src_node->tree;
@@ -187,7 +188,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 	struct page **src_page, **dst_page;
 	int l;
 
-	dprint(DBG_BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
+	hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
 	if (!len)
 		return;
 	src += node->page_offset;
@@ -301,34 +302,35 @@ void hfs_bnode_dump(struct hfs_bnode *node)
 	__be32 cnid;
 	int i, off, key_off;
 
-	dprint(DBG_BNODE_MOD, "bnode: %d\n", node->this);
+	hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this);
 	hfs_bnode_read(node, &desc, 0, sizeof(desc));
-	dprint(DBG_BNODE_MOD, "%d, %d, %d, %d, %d\n",
+	hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n",
 		be32_to_cpu(desc.next), be32_to_cpu(desc.prev),
 		desc.type, desc.height, be16_to_cpu(desc.num_recs));
 
 	off = node->tree->node_size - 2;
 	for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) {
 		key_off = hfs_bnode_read_u16(node, off);
-		dprint(DBG_BNODE_MOD, " %d", key_off);
+		hfs_dbg(BNODE_MOD, " %d", key_off);
 		if (i && node->type == HFS_NODE_INDEX) {
 			int tmp;
 
-			if (node->tree->attributes & HFS_TREE_VARIDXKEYS)
+			if (node->tree->attributes & HFS_TREE_VARIDXKEYS ||
+					node->tree->cnid == HFSPLUS_ATTR_CNID)
 				tmp = hfs_bnode_read_u16(node, key_off) + 2;
 			else
 				tmp = node->tree->max_key_len + 2;
-			dprint(DBG_BNODE_MOD, " (%d", tmp);
+			hfs_dbg_cont(BNODE_MOD, " (%d", tmp);
 			hfs_bnode_read(node, &cnid, key_off + tmp, 4);
-			dprint(DBG_BNODE_MOD, ",%d)", be32_to_cpu(cnid));
+			hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid));
 		} else if (i && node->type == HFS_NODE_LEAF) {
 			int tmp;
 
 			tmp = hfs_bnode_read_u16(node, key_off);
-			dprint(DBG_BNODE_MOD, " (%d)", tmp);
+			hfs_dbg_cont(BNODE_MOD, " (%d)", tmp);
 		}
 	}
-	dprint(DBG_BNODE_MOD, "\n");
+	hfs_dbg_cont(BNODE_MOD, "\n");
 }
 
 void hfs_bnode_unlink(struct hfs_bnode *node)
@@ -364,7 +366,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node)
 
 	/* move down? */
 	if (!node->prev && !node->next)
-		dprint(DBG_BNODE_MOD, "hfs_btree_del_level\n");
+		hfs_dbg(BNODE_MOD, "hfs_btree_del_level\n");
 	if (!node->parent) {
 		tree->root = 0;
 		tree->depth = 0;
@@ -384,7 +386,7 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
 	struct hfs_bnode *node;
 
 	if (cnid >= tree->node_count) {
-		printk(KERN_ERR "hfs: request for non-existent node "
+		pr_err("request for non-existent node "
 				"%d in B*Tree\n",
 			cnid);
 		return NULL;
@@ -407,7 +409,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	loff_t off;
 
 	if (cnid >= tree->node_count) {
-		printk(KERN_ERR "hfs: request for non-existent node "
+		pr_err("request for non-existent node "
 				"%d in B*Tree\n",
 			cnid);
 		return NULL;
@@ -423,8 +425,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	node->this = cnid;
 	set_bit(HFS_BNODE_NEW, &node->flags);
 	atomic_set(&node->refcnt, 1);
-	dprint(DBG_BNODE_REFS, "new_node(%d:%d): 1\n",
-	       node->tree->cnid, node->this);
+	hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n",
+		node->tree->cnid, node->this);
 	init_waitqueue_head(&node->lock_wq);
 	spin_lock(&tree->hash_lock);
 	node2 = hfs_bnode_findhash(tree, cnid);
@@ -468,7 +470,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node)
 {
 	struct hfs_bnode **p;
 
-	dprint(DBG_BNODE_REFS, "remove_node(%d:%d): %d\n",
+	hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n",
 		node->tree->cnid, node->this, atomic_read(&node->refcnt));
 	for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)];
 	     *p && *p != node; p = &(*p)->next_hash)
@@ -586,7 +588,7 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
 	node = hfs_bnode_findhash(tree, num);
 	spin_unlock(&tree->hash_lock);
 	if (node) {
-		printk(KERN_CRIT "new node %u already hashed?\n", num);
+		pr_crit("new node %u already hashed?\n", num);
 		WARN_ON(1);
 		return node;
 	}
@@ -618,7 +620,7 @@ void hfs_bnode_get(struct hfs_bnode *node)
 {
 	if (node) {
 		atomic_inc(&node->refcnt);
-		dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n",
+		hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n",
 			node->tree->cnid, node->this,
 			atomic_read(&node->refcnt));
 	}
@@ -631,7 +633,7 @@ void hfs_bnode_put(struct hfs_bnode *node)
 		struct hfs_btree *tree = node->tree;
 		int i;
 
-		dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n",
+		hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n",
 			node->tree->cnid, node->this,
 			atomic_read(&node->refcnt));
 		BUG_ON(!atomic_read(&node->refcnt));
@@ -646,6 +648,8 @@ void hfs_bnode_put(struct hfs_bnode *node)
 		if (test_bit(HFS_BNODE_DELETED, &node->flags)) {
 			hfs_bnode_unhash(node);
 			spin_unlock(&tree->hash_lock);
+			hfs_bnode_clear(node, 0,
+				PAGE_CACHE_SIZE * tree->pages_per_bnode);
 			hfs_bmap_free(node);
 			hfs_bnode_free(node);
 			return;
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 2a734cfccc9..6e560d56094 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -36,7 +36,8 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
 		return 0;
 
 	if ((node->type == HFS_NODE_INDEX) &&
-	   !(node->tree->attributes & HFS_TREE_VARIDXKEYS)) {
+	   !(node->tree->attributes & HFS_TREE_VARIDXKEYS) &&
+	   (node->tree->cnid != HFSPLUS_ATTR_CNID)) {
 		retval = node->tree->max_key_len + 2;
 	} else {
 		recoff = hfs_bnode_read_u16(node,
@@ -44,13 +45,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
 		if (!recoff)
 			return 0;
 		if (recoff > node->tree->node_size - 2) {
-			printk(KERN_ERR "hfs: recoff %d too large\n", recoff);
+			pr_err("recoff %d too large\n", recoff);
 			return 0;
 		}
 
 		retval = hfs_bnode_read_u16(node, recoff) + 2;
 		if (retval > node->tree->max_key_len + 2) {
-			printk(KERN_ERR "hfs: keylen %d too large\n",
+			pr_err("keylen %d too large\n",
 				retval);
 			retval = 0;
 		}
@@ -89,7 +90,7 @@ again:
 	end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
 	end_off = hfs_bnode_read_u16(node, end_rec_off);
 	end_rec_off -= 2;
-	dprint(DBG_BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
+	hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
 		rec, size, end_off, end_rec_off);
 	if (size > end_rec_off - end_off) {
 		if (new_node)
@@ -151,12 +152,13 @@ skip:
 
 		/* get index key */
 		hfs_bnode_read_key(new_node, fd->search_key, 14);
-		__hfs_brec_find(fd->bnode, fd);
+		__hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key);
 
 		hfs_bnode_put(new_node);
 		new_node = NULL;
 
-		if (tree->attributes & HFS_TREE_VARIDXKEYS)
+		if ((tree->attributes & HFS_TREE_VARIDXKEYS) ||
+				(tree->cnid == HFSPLUS_ATTR_CNID))
 			key_len = be16_to_cpu(fd->search_key->key_len) + 2;
 		else {
 			fd->search_key->key_len =
@@ -189,7 +191,7 @@ again:
 		mark_inode_dirty(tree->inode);
 	}
 	hfs_bnode_dump(node);
-	dprint(DBG_BNODE_MOD, "remove_rec: %d, %d\n",
+	hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n",
 		fd->record, fd->keylength + fd->entrylength);
 	if (!--node->num_recs) {
 		hfs_bnode_unlink(node);
@@ -201,7 +203,7 @@ again:
 		hfs_bnode_put(node);
 		node = fd->bnode = parent;
 
-		__hfs_brec_find(node, fd);
+		__hfs_brec_find(node, fd, hfs_find_rec_by_key);
 		goto again;
 	}
 	hfs_bnode_write_u16(node,
@@ -242,7 +244,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
 	if (IS_ERR(new_node))
 		return new_node;
 	hfs_bnode_get(node);
-	dprint(DBG_BNODE_MOD, "split_nodes: %d - %d - %d\n",
+	hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n",
 		node->this, new_node->this, node->next);
 	new_node->next = node->next;
 	new_node->prev = node->this;
@@ -367,16 +369,17 @@ again:
 	parent = hfs_bnode_find(tree, node->parent);
 	if (IS_ERR(parent))
 		return PTR_ERR(parent);
-	__hfs_brec_find(parent, fd);
+	__hfs_brec_find(parent, fd, hfs_find_rec_by_key);
 	hfs_bnode_dump(parent);
 	rec = fd->record;
 
 	/* size difference between old and new key */
-	if (tree->attributes & HFS_TREE_VARIDXKEYS)
+	if ((tree->attributes & HFS_TREE_VARIDXKEYS) ||
+				(tree->cnid == HFSPLUS_ATTR_CNID))
 		newkeylen = hfs_bnode_read_u16(node, 14) + 2;
 	else
 		fd->keylength = newkeylen = tree->max_key_len + 2;
-	dprint(DBG_BNODE_MOD, "update_rec: %d, %d, %d\n",
+	hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n",
 		rec, fd->keylength, newkeylen);
 
 	rec_off = tree->node_size - (rec + 2) * 2;
@@ -388,7 +391,7 @@ again:
 		end_off = hfs_bnode_read_u16(parent, end_rec_off);
 		if (end_rec_off - end_off < diff) {
 
-			dprint(DBG_BNODE_MOD, "hfs: splitting index node.\n");
+			hfs_dbg(BNODE_MOD, "splitting index node\n");
 			fd->bnode = parent;
 			new_node = hfs_bnode_split(fd);
 			if (IS_ERR(new_node))
@@ -427,7 +430,7 @@ skip:
 		hfs_bnode_read_key(new_node, fd->search_key, 14);
 		cnid = cpu_to_be32(new_node->this);
 
-		__hfs_brec_find(fd->bnode, fd);
+		__hfs_brec_find(fd->bnode, fd, hfs_find_rec_by_key);
 		hfs_brec_insert(fd, &cnid, sizeof(cnid));
 		hfs_bnode_put(fd->bnode);
 		hfs_bnode_put(new_node);
@@ -495,13 +498,15 @@ static int hfs_btree_inc_height(struct hfs_btree *tree)
 		/* insert old root idx into new root */
 		node->parent = tree->root;
 		if (node->type == HFS_NODE_LEAF ||
-		    tree->attributes & HFS_TREE_VARIDXKEYS)
+				tree->attributes & HFS_TREE_VARIDXKEYS ||
+				tree->cnid == HFSPLUS_ATTR_CNID)
 			key_size = hfs_bnode_read_u16(node, 14) + 2;
 		else
 			key_size = tree->max_key_len + 2;
 		hfs_bnode_copy(new_node, 14, node, 14, key_size);
 
-		if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
+		if (!(tree->attributes & HFS_TREE_VARIDXKEYS) &&
+				(tree->cnid != HFSPLUS_ATTR_CNID)) {
 			key_size = tree->max_key_len + 2;
 			hfs_bnode_write_u16(new_node, 14, tree->max_key_len);
 		}
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 685d07d0ed1..0c6540c9116 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -40,8 +40,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	tree->inode = inode;
 
 	if (!HFSPLUS_I(tree->inode)->first_blocks) {
-		printk(KERN_ERR
-		       "hfs: invalid btree extent records (0 size).\n");
+		pr_err("invalid btree extent records (0 size)\n");
 		goto free_inode;
 	}
 
@@ -68,12 +67,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	switch (id) {
 	case HFSPLUS_EXT_CNID:
 		if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
-			printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
+			pr_err("invalid extent max_key_len %d\n",
 				tree->max_key_len);
 			goto fail_page;
 		}
 		if (tree->attributes & HFS_TREE_VARIDXKEYS) {
-			printk(KERN_ERR "hfs: invalid extent btree flag\n");
+			pr_err("invalid extent btree flag\n");
 			goto fail_page;
 		}
 
@@ -81,12 +80,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 		break;
 	case HFSPLUS_CAT_CNID:
 		if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
-			printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
+			pr_err("invalid catalog max_key_len %d\n",
 				tree->max_key_len);
 			goto fail_page;
 		}
 		if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
-			printk(KERN_ERR "hfs: invalid catalog btree flag\n");
+			pr_err("invalid catalog btree flag\n");
 			goto fail_page;
 		}
 
@@ -98,13 +97,21 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 			set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
 		}
 		break;
+	case HFSPLUS_ATTR_CNID:
+		if (tree->max_key_len != HFSPLUS_ATTR_KEYLEN - sizeof(u16)) {
+			pr_err("invalid attributes max_key_len %d\n",
+				tree->max_key_len);
+			goto fail_page;
+		}
+		tree->keycmp = hfsplus_attr_bin_cmp_key;
+		break;
 	default:
-		printk(KERN_ERR "hfs: unknown B*Tree requested\n");
+		pr_err("unknown B*Tree requested\n");
 		goto fail_page;
 	}
 
 	if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
-		printk(KERN_ERR "hfs: invalid btree flag\n");
+		pr_err("invalid btree flag\n");
 		goto fail_page;
 	}
 
@@ -147,7 +154,7 @@ void hfs_btree_close(struct hfs_btree *tree)
 		while ((node = tree->node_hash[i])) {
 			tree->node_hash[i] = node->next_hash;
 			if (atomic_read(&node->refcnt))
-				printk(KERN_CRIT "hfs: node %d:%d "
+				pr_crit("node %d:%d "
 						"still has %d user(s)!\n",
 					node->tree->cnid, node->this,
 					atomic_read(&node->refcnt));
@@ -295,7 +302,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 		kunmap(*pagep);
 		nidx = node->next;
 		if (!nidx) {
-			dprint(DBG_BNODE_MOD, "hfs: create new bmap node.\n");
+			hfs_dbg(BNODE_MOD, "create new bmap node\n");
 			next_node = hfs_bmap_new_bmap(node, idx);
 		} else
 			next_node = hfs_bnode_find(tree, nidx);
@@ -321,7 +328,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
 	u32 nidx;
 	u8 *data, byte, m;
 
-	dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this);
+	hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this);
 	BUG_ON(!node->this);
 	tree = node->tree;
 	nidx = node->this;
@@ -337,7 +344,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
 		hfs_bnode_put(node);
 		if (!i) {
 			/* panic */;
-			printk(KERN_CRIT "hfs: unable to free bnode %u. "
+			pr_crit("unable to free bnode %u. "
 					"bmap not found!\n",
 				node->this);
 			return;
@@ -347,7 +354,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
 			return;
 		if (node->type != HFS_NODE_MAP) {
 			/* panic */;
-			printk(KERN_CRIT "hfs: invalid bmap found! "
+			pr_crit("invalid bmap found! "
 					"(%u,%d)\n",
 				node->this, node->type);
 			hfs_bnode_put(node);
@@ -362,7 +369,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
 	m = 1 << (~nidx & 7);
 	byte = data[off];
 	if (!(byte & m)) {
-		printk(KERN_CRIT "hfs: trying to free free bnode "
+		pr_crit("trying to free free bnode "
 				"%u(%d)\n",
 			node->this, node->type);
 		kunmap(page);
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 798d9c4c5e7..968ce411db5 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -45,7 +45,8 @@ void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key,
 
 	key->cat.parent = cpu_to_be32(parent);
 	if (str) {
-		hfsplus_asc2uni(sb, &key->cat.name, str->name, str->len);
+		hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN,
+					str->name, str->len);
 		len = be16_to_cpu(key->cat.name.length);
 	} else {
 		key->cat.name.length = 0;
@@ -167,7 +168,8 @@ static int hfsplus_fill_cat_thread(struct super_block *sb,
 	entry->type = cpu_to_be16(type);
 	entry->thread.reserved = 0;
 	entry->thread.parentID = cpu_to_be32(parentid);
-	hfsplus_asc2uni(sb, &entry->thread.nodeName, str->name, str->len);
+	hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN,
+				str->name, str->len);
 	return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2;
 }
 
@@ -186,19 +188,19 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
 
 	type = be16_to_cpu(tmp.type);
 	if (type != HFSPLUS_FOLDER_THREAD && type != HFSPLUS_FILE_THREAD) {
-		printk(KERN_ERR "hfs: found bad thread record in catalog\n");
+		pr_err("found bad thread record in catalog\n");
 		return -EIO;
 	}
 
 	if (be16_to_cpu(tmp.thread.nodeName.length) > 255) {
-		printk(KERN_ERR "hfs: catalog name length corrupted\n");
+		pr_err("catalog name length corrupted\n");
 		return -EIO;
 	}
 
 	hfsplus_cat_build_key_uni(fd->search_key,
 		be32_to_cpu(tmp.thread.parentID),
 		&tmp.thread.nodeName);
-	return hfs_brec_find(fd);
+	return hfs_brec_find(fd, hfs_find_rec_by_key);
 }
 
 int hfsplus_create_cat(u32 cnid, struct inode *dir,
@@ -210,7 +212,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
 	int entry_size;
 	int err;
 
-	dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n",
+	hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n",
 		str->name, cnid, inode->i_nlink);
 	err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	if (err)
@@ -221,7 +223,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
 		S_ISDIR(inode->i_mode) ?
 			HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD,
 		dir->i_ino, str);
-	err = hfs_brec_find(&fd);
+	err = hfs_brec_find(&fd, hfs_find_rec_by_key);
 	if (err != -ENOENT) {
 		if (!err)
 			err = -EEXIST;
@@ -233,7 +235,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
 
 	hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
 	entry_size = hfsplus_cat_build_record(&entry, cnid, inode);
-	err = hfs_brec_find(&fd);
+	err = hfs_brec_find(&fd, hfs_find_rec_by_key);
 	if (err != -ENOENT) {
 		/* panic? */
 		if (!err)
@@ -253,7 +255,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
 
 err1:
 	hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
-	if (!hfs_brec_find(&fd))
+	if (!hfs_brec_find(&fd, hfs_find_rec_by_key))
 		hfs_brec_remove(&fd);
 err2:
 	hfs_find_exit(&fd);
@@ -269,8 +271,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 	int err, off;
 	u16 type;
 
-	dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n",
-		str ? str->name : NULL, cnid);
+	hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
 	err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	if (err)
 		return err;
@@ -279,7 +280,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 		int len;
 
 		hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
-		err = hfs_brec_find(&fd);
+		err = hfs_brec_find(&fd, hfs_find_rec_by_key);
 		if (err)
 			goto out;
 
@@ -296,7 +297,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 	} else
 		hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
 
-	err = hfs_brec_find(&fd);
+	err = hfs_brec_find(&fd, hfs_find_rec_by_key);
 	if (err)
 		goto out;
 
@@ -326,7 +327,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 		goto out;
 
 	hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
-	err = hfs_brec_find(&fd);
+	err = hfs_brec_find(&fd, hfs_find_rec_by_key);
 	if (err)
 		goto out;
 
@@ -337,6 +338,12 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
 	dir->i_size--;
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
 	hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
+
+	if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) {
+		if (HFSPLUS_SB(sb)->attr_tree)
+			hfsplus_delete_all_attrs(dir, cnid);
+	}
+
 out:
 	hfs_find_exit(&fd);
 
@@ -353,7 +360,7 @@ int hfsplus_rename_cat(u32 cnid,
 	int entry_size, type;
 	int err;
 
-	dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
+	hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
 		cnid, src_dir->i_ino, src_name->name,
 		dst_dir->i_ino, dst_name->name);
 	err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
@@ -363,7 +370,7 @@ int hfsplus_rename_cat(u32 cnid,
 
 	/* find the old dir entry and read the data */
 	hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
-	err = hfs_brec_find(&src_fd);
+	err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
 	if (err)
 		goto out;
 	if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) {
@@ -376,7 +383,7 @@ int hfsplus_rename_cat(u32 cnid,
 
 	/* create new dir entry with the data from the old entry */
 	hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name);
-	err = hfs_brec_find(&dst_fd);
+	err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key);
 	if (err != -ENOENT) {
 		if (!err)
 			err = -EEXIST;
@@ -391,7 +398,7 @@ int hfsplus_rename_cat(u32 cnid,
 
 	/* finally remove the old entry */
 	hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
-	err = hfs_brec_find(&src_fd);
+	err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
 	if (err)
 		goto out;
 	err = hfs_brec_remove(&src_fd);
@@ -402,7 +409,7 @@ int hfsplus_rename_cat(u32 cnid,
 
 	/* remove old thread entry */
 	hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL);
-	err = hfs_brec_find(&src_fd);
+	err = hfs_brec_find(&src_fd, hfs_find_rec_by_key);
 	if (err)
 		goto out;
 	type = hfs_bnode_read_u16(src_fd.bnode, src_fd.entryoffset);
@@ -414,7 +421,7 @@ int hfsplus_rename_cat(u32 cnid,
 	hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
 	entry_size = hfsplus_fill_cat_thread(sb, &entry, type,
 		dst_dir->i_ino, dst_name);
-	err = hfs_brec_find(&dst_fd);
+	err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key);
 	if (err != -ENOENT) {
 		if (!err)
 			err = -EEXIST;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 6b9f921ef2f..d8ce4bd17fc 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -15,6 +15,7 @@
 
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
+#include "xattr.h"
 
 static inline void hfsplus_instantiate(struct dentry *dentry,
 				       struct inode *inode, u32 cnid)
@@ -102,7 +103,7 @@ again:
 		} else if (!dentry->d_fsdata)
 			dentry->d_fsdata = (void *)(unsigned long)cnid;
 	} else {
-		printk(KERN_ERR "hfs: invalid catalog entry type in lookup\n");
+		pr_err("invalid catalog entry type in lookup\n");
 		err = -EIO;
 		goto fail;
 	}
@@ -120,9 +121,9 @@ fail:
 	return ERR_PTR(err);
 }
 
-static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	int len, err;
 	char strbuf[HFSPLUS_MAX_STRLEN + 1];
@@ -131,25 +132,24 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct hfsplus_readdir_data *rd;
 	u16 type;
 
-	if (filp->f_pos >= inode->i_size)
+	if (file->f_pos >= inode->i_size)
 		return 0;
 
 	err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
 	if (err)
 		return err;
 	hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
-	err = hfs_brec_find(&fd);
+	err = hfs_brec_find(&fd, hfs_find_rec_by_key);
 	if (err)
 		goto out;
 
-	switch ((u32)filp->f_pos) {
-	case 0:
+	if (ctx->pos == 0) {
 		/* This is completely artificial... */
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR))
+		if (!dir_emit_dot(file, ctx))
 			goto out;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
 		if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
 			err = -EIO;
 			goto out;
@@ -158,31 +158,28 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
 			fd.entrylength);
 		if (be16_to_cpu(entry.type) != HFSPLUS_FOLDER_THREAD) {
-			printk(KERN_ERR "hfs: bad catalog folder thread\n");
+			pr_err("bad catalog folder thread\n");
 			err = -EIO;
 			goto out;
 		}
 		if (fd.entrylength < HFSPLUS_MIN_THREAD_SZ) {
-			printk(KERN_ERR "hfs: truncated catalog thread\n");
+			pr_err("truncated catalog thread\n");
 			err = -EIO;
 			goto out;
 		}
-		if (filldir(dirent, "..", 2, 1,
+		if (!dir_emit(ctx, "..", 2,
 			    be32_to_cpu(entry.thread.parentID), DT_DIR))
 			goto out;
-		filp->f_pos++;
-		/* fall through */
-	default:
-		if (filp->f_pos >= inode->i_size)
-			goto out;
-		err = hfs_brec_goto(&fd, filp->f_pos - 1);
-		if (err)
-			goto out;
+		ctx->pos = 2;
 	}
-
+	if (ctx->pos >= inode->i_size)
+		goto out;
+	err = hfs_brec_goto(&fd, ctx->pos - 1);
+	if (err)
+		goto out;
 	for (;;) {
 		if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) {
-			printk(KERN_ERR "hfs: walked past end of dir\n");
+			pr_err("walked past end of dir\n");
 			err = -EIO;
 			goto out;
 		}
@@ -202,7 +199,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (type == HFSPLUS_FOLDER) {
 			if (fd.entrylength <
 					sizeof(struct hfsplus_cat_folder)) {
-				printk(KERN_ERR "hfs: small dir entry\n");
+				pr_err("small dir entry\n");
 				err = -EIO;
 				goto out;
 			}
@@ -210,40 +207,40 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			    HFSPLUS_SB(sb)->hidden_dir->i_ino ==
 					be32_to_cpu(entry.folder.id))
 				goto next;
-			if (filldir(dirent, strbuf, len, filp->f_pos,
+			if (!dir_emit(ctx, strbuf, len,
 				    be32_to_cpu(entry.folder.id), DT_DIR))
 				break;
 		} else if (type == HFSPLUS_FILE) {
 			if (fd.entrylength < sizeof(struct hfsplus_cat_file)) {
-				printk(KERN_ERR "hfs: small file entry\n");
+				pr_err("small file entry\n");
 				err = -EIO;
 				goto out;
 			}
-			if (filldir(dirent, strbuf, len, filp->f_pos,
+			if (!dir_emit(ctx, strbuf, len,
 				    be32_to_cpu(entry.file.id), DT_REG))
 				break;
 		} else {
-			printk(KERN_ERR "hfs: bad catalog entry type\n");
+			pr_err("bad catalog entry type\n");
 			err = -EIO;
 			goto out;
 		}
 next:
-		filp->f_pos++;
-		if (filp->f_pos >= inode->i_size)
+		ctx->pos++;
+		if (ctx->pos >= inode->i_size)
 			goto out;
 		err = hfs_brec_goto(&fd, 1);
 		if (err)
 			goto out;
 	}
-	rd = filp->private_data;
+	rd = file->private_data;
 	if (!rd) {
 		rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL);
 		if (!rd) {
 			err = -ENOMEM;
 			goto out;
 		}
-		filp->private_data = rd;
-		rd->file = filp;
+		file->private_data = rd;
+		rd->file = file;
 		list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
 	}
 	memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
@@ -421,6 +418,15 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
 	if (res)
 		goto out_err;
 
+	res = hfsplus_init_inode_security(inode, dir, &dentry->d_name);
+	if (res == -EOPNOTSUPP)
+		res = 0; /* Operation is not supported. */
+	else if (res) {
+		/* Try to delete anyway without error analysis. */
+		hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
+		goto out_err;
+	}
+
 	hfsplus_instantiate(dentry, inode, inode->i_ino);
 	mark_inode_dirty(inode);
 	goto out;
@@ -450,15 +456,26 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
 		init_special_inode(inode, mode, rdev);
 
 	res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-	if (res) {
-		clear_nlink(inode);
-		hfsplus_delete_inode(inode);
-		iput(inode);
-		goto out;
+	if (res)
+		goto failed_mknod;
+
+	res = hfsplus_init_inode_security(inode, dir, &dentry->d_name);
+	if (res == -EOPNOTSUPP)
+		res = 0; /* Operation is not supported. */
+	else if (res) {
+		/* Try to delete anyway without error analysis. */
+		hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
+		goto failed_mknod;
 	}
 
 	hfsplus_instantiate(dentry, inode, inode->i_ino);
 	mark_inode_dirty(inode);
+	goto out;
+
+failed_mknod:
+	clear_nlink(inode);
+	hfsplus_delete_inode(inode);
+	iput(inode);
 out:
 	mutex_unlock(&sbi->vh_mutex);
 	return res;
@@ -499,21 +516,25 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
 }
 
 const struct inode_operations hfsplus_dir_inode_operations = {
-	.lookup		= hfsplus_lookup,
-	.create		= hfsplus_create,
-	.link		= hfsplus_link,
-	.unlink		= hfsplus_unlink,
-	.mkdir		= hfsplus_mkdir,
-	.rmdir		= hfsplus_rmdir,
-	.symlink	= hfsplus_symlink,
-	.mknod		= hfsplus_mknod,
-	.rename		= hfsplus_rename,
+	.lookup			= hfsplus_lookup,
+	.create			= hfsplus_create,
+	.link			= hfsplus_link,
+	.unlink			= hfsplus_unlink,
+	.mkdir			= hfsplus_mkdir,
+	.rmdir			= hfsplus_rmdir,
+	.symlink		= hfsplus_symlink,
+	.mknod			= hfsplus_mknod,
+	.rename			= hfsplus_rename,
+	.setxattr		= generic_setxattr,
+	.getxattr		= generic_getxattr,
+	.listxattr		= hfsplus_listxattr,
+	.removexattr		= hfsplus_removexattr,
 };
 
 const struct file_operations hfsplus_dir_operations = {
 	.fsync		= hfsplus_file_fsync,
 	.read		= generic_read_dir,
-	.readdir	= hfsplus_readdir,
+	.iterate	= hfsplus_readdir,
 	.unlocked_ioctl = hfsplus_ioctl,
 	.llseek		= generic_file_llseek,
 	.release	= hfsplus_dir_release,
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index eba76eab6d6..fbb212fbb1e 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -83,7 +83,7 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
 	return be32_to_cpu(ext->start_block) + be32_to_cpu(ext->block_count);
 }
 
-static void __hfsplus_ext_write_extent(struct inode *inode,
+static int __hfsplus_ext_write_extent(struct inode *inode,
 		struct hfs_find_data *fd)
 {
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
@@ -95,16 +95,16 @@ static void __hfsplus_ext_write_extent(struct inode *inode,
 			      HFSPLUS_IS_RSRC(inode) ?
 				HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
 
-	res = hfs_brec_find(fd);
+	res = hfs_brec_find(fd, hfs_find_rec_by_key);
 	if (hip->extent_state & HFSPLUS_EXT_NEW) {
 		if (res != -ENOENT)
-			return;
+			return res;
 		hfs_brec_insert(fd, hip->cached_extents,
 				sizeof(hfsplus_extent_rec));
 		hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW);
 	} else {
 		if (res)
-			return;
+			return res;
 		hfs_bnode_write(fd->bnode, hip->cached_extents,
 				fd->entryoffset, fd->entrylength);
 		hip->extent_state &= ~HFSPLUS_EXT_DIRTY;
@@ -117,11 +117,13 @@ static void __hfsplus_ext_write_extent(struct inode *inode,
 	 * to explicily mark the inode dirty, too.
 	 */
 	set_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags);
+
+	return 0;
 }
 
 static int hfsplus_ext_write_extent_locked(struct inode *inode)
 {
-	int res;
+	int res = 0;
 
 	if (HFSPLUS_I(inode)->extent_state & HFSPLUS_EXT_DIRTY) {
 		struct hfs_find_data fd;
@@ -129,10 +131,10 @@ static int hfsplus_ext_write_extent_locked(struct inode *inode)
 		res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
 		if (res)
 			return res;
-		__hfsplus_ext_write_extent(inode, &fd);
+		res = __hfsplus_ext_write_extent(inode, &fd);
 		hfs_find_exit(&fd);
 	}
-	return 0;
+	return res;
 }
 
 int hfsplus_ext_write_extent(struct inode *inode)
@@ -154,7 +156,7 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
 
 	hfsplus_ext_build_key(fd->search_key, cnid, block, type);
 	fd->key->ext.cnid = 0;
-	res = hfs_brec_find(fd);
+	res = hfs_brec_find(fd, hfs_find_rec_by_key);
 	if (res && res != -ENOENT)
 		return res;
 	if (fd->key->ext.cnid != fd->search_key->ext.cnid ||
@@ -175,8 +177,11 @@ static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd,
 
 	WARN_ON(!mutex_is_locked(&hip->extents_lock));
 
-	if (hip->extent_state & HFSPLUS_EXT_DIRTY)
-		__hfsplus_ext_write_extent(inode, fd);
+	if (hip->extent_state & HFSPLUS_EXT_DIRTY) {
+		res = __hfsplus_ext_write_extent(inode, fd);
+		if (res)
+			return res;
+	}
 
 	res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
 					block, HFSPLUS_IS_RSRC(inode) ?
@@ -265,7 +270,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
 	mutex_unlock(&hip->extents_lock);
 
 done:
-	dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n",
+	hfs_dbg(EXTENT, "get_block(%lu): %llu - %u\n",
 		inode->i_ino, (long long)iblock, dblock);
 
 	mask = (1 << sbi->fs_shift) - 1;
@@ -288,11 +293,12 @@ static void hfsplus_dump_extent(struct hfsplus_extent *extent)
 {
 	int i;
 
-	dprint(DBG_EXTENT, "   ");
+	hfs_dbg(EXTENT, "   ");
 	for (i = 0; i < 8; i++)
-		dprint(DBG_EXTENT, " %u:%u", be32_to_cpu(extent[i].start_block),
-				 be32_to_cpu(extent[i].block_count));
-	dprint(DBG_EXTENT, "\n");
+		hfs_dbg_cont(EXTENT, " %u:%u",
+			     be32_to_cpu(extent[i].start_block),
+			     be32_to_cpu(extent[i].block_count));
+	hfs_dbg_cont(EXTENT, "\n");
 }
 
 static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset,
@@ -348,8 +354,8 @@ found:
 		if (count <= block_nr) {
 			err = hfsplus_block_free(sb, start, count);
 			if (err) {
-				printk(KERN_ERR "hfs: can't free extent\n");
-				dprint(DBG_EXTENT, " start: %u count: %u\n",
+				pr_err("can't free extent\n");
+				hfs_dbg(EXTENT, " start: %u count: %u\n",
 					start, count);
 			}
 			extent->block_count = 0;
@@ -359,8 +365,8 @@ found:
 			count -= block_nr;
 			err = hfsplus_block_free(sb, start + count, block_nr);
 			if (err) {
-				printk(KERN_ERR "hfs: can't free extent\n");
-				dprint(DBG_EXTENT, " start: %u count: %u\n",
+				pr_err("can't free extent\n");
+				hfs_dbg(EXTENT, " start: %u count: %u\n",
 					start, count);
 			}
 			extent->block_count = cpu_to_be32(count);
@@ -432,7 +438,7 @@ int hfsplus_file_extend(struct inode *inode)
 	if (sbi->alloc_file->i_size * 8 <
 	    sbi->total_blocks - sbi->free_blocks + 8) {
 		/* extend alloc file */
-		printk(KERN_ERR "hfs: extend alloc file! "
+		pr_err("extend alloc file! "
 				"(%llu,%u,%u)\n",
 			sbi->alloc_file->i_size * 8,
 			sbi->total_blocks, sbi->free_blocks);
@@ -459,11 +465,11 @@ int hfsplus_file_extend(struct inode *inode)
 		}
 	}
 
-	dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
+	hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
 
 	if (hip->alloc_blocks <= hip->first_blocks) {
 		if (!hip->first_blocks) {
-			dprint(DBG_EXTENT, "first extents\n");
+			hfs_dbg(EXTENT, "first extents\n");
 			/* no extents yet */
 			hip->first_extents[0].start_block = cpu_to_be32(start);
 			hip->first_extents[0].block_count = cpu_to_be32(len);
@@ -500,7 +506,7 @@ out:
 	return res;
 
 insert_extent:
-	dprint(DBG_EXTENT, "insert new extent\n");
+	hfs_dbg(EXTENT, "insert new extent\n");
 	res = hfsplus_ext_write_extent_locked(inode);
 	if (res)
 		goto out;
@@ -525,15 +531,14 @@ void hfsplus_file_truncate(struct inode *inode)
 	u32 alloc_cnt, blk_cnt, start;
 	int res;
 
-	dprint(DBG_INODE, "truncate: %lu, %llu -> %llu\n",
-		inode->i_ino, (long long)hip->phys_size,
-		inode->i_size);
+	hfs_dbg(INODE, "truncate: %lu, %llu -> %llu\n",
+		inode->i_ino, (long long)hip->phys_size, inode->i_size);
 
 	if (inode->i_size > hip->phys_size) {
 		struct address_space *mapping = inode->i_mapping;
 		struct page *page;
 		void *fsdata;
-		u32 size = inode->i_size;
+		loff_t size = inode->i_size;
 
 		res = pagecache_write_begin(NULL, mapping, size, 0,
 						AOP_FLAG_UNINTERRUPTIBLE,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index a6da86b1b4c..ede79317cfb 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -10,6 +10,12 @@
 #ifndef _LINUX_HFSPLUS_FS_H
 #define _LINUX_HFSPLUS_FS_H
 
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/fs.h>
 #include <linux/mutex.h>
 #include <linux/buffer_head.h>
@@ -23,6 +29,7 @@
 #define DBG_SUPER	0x00000010
 #define DBG_EXTENT	0x00000020
 #define DBG_BITMAP	0x00000040
+#define DBG_ATTR_MOD	0x00000080
 
 #if 0
 #define DBG_MASK	(DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD)
@@ -31,9 +38,17 @@
 #endif
 #define DBG_MASK	(0)
 
-#define dprint(flg, fmt, args...) \
-	if (flg & DBG_MASK) \
-		printk(fmt , ## args)
+#define hfs_dbg(flg, fmt, ...)					\
+do {								\
+	if (DBG_##flg & DBG_MASK)				\
+		printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__);	\
+} while (0)
+
+#define hfs_dbg_cont(flg, fmt, ...)				\
+do {								\
+	if (DBG_##flg & DBG_MASK)				\
+		pr_cont(fmt, ##__VA_ARGS__);			\
+} while (0)
 
 /* Runtime config options */
 #define HFSPLUS_DEF_CR_TYPE    0x3F3F3F3F  /* '????' */
@@ -46,6 +61,13 @@ typedef int (*btree_keycmp)(const hfsplus_btree_key *,
 
 #define NODE_HASH_SIZE	256
 
+/* B-tree mutex nested subclasses */
+enum hfsplus_btree_mutex_classes {
+	CATALOG_BTREE_MUTEX,
+	EXTENTS_BTREE_MUTEX,
+	ATTR_BTREE_MUTEX,
+};
+
 /* An HFS+ BTree held in memory */
 struct hfs_btree {
 	struct super_block *sb;
@@ -223,6 +245,7 @@ struct hfsplus_inode_info {
 #define HFSPLUS_I_CAT_DIRTY	1	/* has changes in the catalog tree */
 #define HFSPLUS_I_EXT_DIRTY	2	/* has changes in the extent tree */
 #define HFSPLUS_I_ALLOC_DIRTY	3	/* has changes in the allocation file */
+#define HFSPLUS_I_ATTR_DIRTY	4	/* has changes in the attributes tree */
 
 #define HFSPLUS_IS_RSRC(inode) \
 	test_bit(HFSPLUS_I_RSRC, &HFSPLUS_I(inode)->flags)
@@ -302,7 +325,7 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
 #define hfs_brec_remove hfsplus_brec_remove
 #define hfs_find_init hfsplus_find_init
 #define hfs_find_exit hfsplus_find_exit
-#define __hfs_brec_find __hplusfs_brec_find
+#define __hfs_brec_find __hfsplus_brec_find
 #define hfs_brec_find hfsplus_brec_find
 #define hfs_brec_read hfsplus_brec_read
 #define hfs_brec_goto hfsplus_brec_goto
@@ -324,10 +347,33 @@ static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
  */
 #define HFSPLUS_IOC_BLESS _IO('h', 0x80)
 
+typedef int (*search_strategy_t)(struct hfs_bnode *,
+				struct hfs_find_data *,
+				int *, int *, int *);
+
 /*
  * Functions in any *.c used in other files
  */
 
+/* attributes.c */
+int hfsplus_create_attr_tree_cache(void);
+void hfsplus_destroy_attr_tree_cache(void);
+hfsplus_attr_entry *hfsplus_alloc_attr_entry(void);
+void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p);
+int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *,
+		const hfsplus_btree_key *);
+int hfsplus_attr_build_key(struct super_block *, hfsplus_btree_key *,
+			u32, const char *);
+void hfsplus_attr_build_key_uni(hfsplus_btree_key *key,
+					u32 cnid,
+					struct hfsplus_attr_unistr *name);
+int hfsplus_find_attr(struct super_block *, u32,
+			const char *, struct hfs_find_data *);
+int hfsplus_attr_exists(struct inode *inode, const char *name);
+int hfsplus_create_attr(struct inode *, const char *, const void *, size_t);
+int hfsplus_delete_attr(struct inode *, const char *);
+int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid);
+
 /* bitmap.c */
 int hfsplus_block_allocate(struct super_block *, u32, u32, u32 *);
 int hfsplus_block_free(struct super_block *, u32, u32);
@@ -369,8 +415,15 @@ int hfs_brec_remove(struct hfs_find_data *);
 /* bfind.c */
 int hfs_find_init(struct hfs_btree *, struct hfs_find_data *);
 void hfs_find_exit(struct hfs_find_data *);
-int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *);
-int hfs_brec_find(struct hfs_find_data *);
+int hfs_find_1st_rec_by_cnid(struct hfs_bnode *,
+				struct hfs_find_data *,
+				int *, int *, int *);
+int hfs_find_rec_by_key(struct hfs_bnode *,
+				struct hfs_find_data *,
+				int *, int *, int *);
+int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *,
+				search_strategy_t);
+int hfs_brec_find(struct hfs_find_data *, search_strategy_t);
 int hfs_brec_read(struct hfs_find_data *, void *, int);
 int hfs_brec_goto(struct hfs_find_data *, int);
 
@@ -417,11 +470,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
 
 /* ioctl.c */
 long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
-int hfsplus_setxattr(struct dentry *dentry, const char *name,
-		     const void *value, size_t size, int flags);
-ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
-			 void *value, size_t size);
-ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
 
 /* options.c */
 int hfsplus_parse_options(char *, struct hfsplus_sb_info *);
@@ -446,12 +494,9 @@ int hfsplus_strcmp(const struct hfsplus_unistr *,
 int hfsplus_uni2asc(struct super_block *,
 		const struct hfsplus_unistr *, char *, int *);
 int hfsplus_asc2uni(struct super_block *,
-		struct hfsplus_unistr *, const char *, int);
-int hfsplus_hash_dentry(const struct dentry *dentry,
-		const struct inode *inode, struct qstr *str);
-int hfsplus_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+		struct hfsplus_unistr *, int, const char *, int);
+int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str);
+int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 
 /* wrapper.c */
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 921967e5abb..452ede01b03 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -52,13 +52,23 @@
 typedef __be32 hfsplus_cnid;
 typedef __be16 hfsplus_unichr;
 
+#define HFSPLUS_MAX_STRLEN 255
+#define HFSPLUS_ATTR_MAX_STRLEN 127
+
 /* A "string" as used in filenames, etc. */
 struct hfsplus_unistr {
 	__be16 length;
-	hfsplus_unichr unicode[255];
+	hfsplus_unichr unicode[HFSPLUS_MAX_STRLEN];
 } __packed;
 
-#define HFSPLUS_MAX_STRLEN 255
+/*
+ * A "string" is used in attributes file
+ * for name of extended attribute
+ */
+struct hfsplus_attr_unistr {
+	__be16 length;
+	hfsplus_unichr unicode[HFSPLUS_ATTR_MAX_STRLEN];
+} __packed;
 
 /* POSIX permissions */
 struct hfsplus_perm {
@@ -291,6 +301,8 @@ struct hfsplus_cat_file {
 /* File attribute bits */
 #define HFSPLUS_FILE_LOCKED		0x0001
 #define HFSPLUS_FILE_THREAD_EXISTS	0x0002
+#define HFSPLUS_XATTR_EXISTS		0x0004
+#define HFSPLUS_ACL_EXISTS		0x0008
 
 /* HFS+ catalog thread (part of a cat_entry) */
 struct hfsplus_cat_thread {
@@ -327,11 +339,63 @@ struct hfsplus_ext_key {
 
 #define HFSPLUS_EXT_KEYLEN	sizeof(struct hfsplus_ext_key)
 
+#define HFSPLUS_XATTR_FINDER_INFO_NAME "com.apple.FinderInfo"
+#define HFSPLUS_XATTR_ACL_NAME "com.apple.system.Security"
+
+#define HFSPLUS_ATTR_INLINE_DATA 0x10
+#define HFSPLUS_ATTR_FORK_DATA   0x20
+#define HFSPLUS_ATTR_EXTENTS     0x30
+
+/* HFS+ attributes tree key */
+struct hfsplus_attr_key {
+	__be16 key_len;
+	__be16 pad;
+	hfsplus_cnid cnid;
+	__be32 start_block;
+	struct hfsplus_attr_unistr key_name;
+} __packed;
+
+#define HFSPLUS_ATTR_KEYLEN	sizeof(struct hfsplus_attr_key)
+
+/* HFS+ fork data attribute */
+struct hfsplus_attr_fork_data {
+	__be32 record_type;
+	__be32 reserved;
+	struct hfsplus_fork_raw the_fork;
+} __packed;
+
+/* HFS+ extension attribute */
+struct hfsplus_attr_extents {
+	__be32 record_type;
+	__be32 reserved;
+	struct hfsplus_extent extents;
+} __packed;
+
+#define HFSPLUS_MAX_INLINE_DATA_SIZE 3802
+
+/* HFS+ attribute inline data */
+struct hfsplus_attr_inline_data {
+	__be32 record_type;
+	__be32 reserved1;
+	u8 reserved2[6];
+	__be16 length;
+	u8 raw_bytes[HFSPLUS_MAX_INLINE_DATA_SIZE];
+} __packed;
+
+/* A data record in the attributes tree */
+typedef union {
+	__be32 record_type;
+	struct hfsplus_attr_fork_data fork_data;
+	struct hfsplus_attr_extents extents;
+	struct hfsplus_attr_inline_data inline_data;
+} __packed hfsplus_attr_entry;
+
 /* HFS+ generic BTree key */
 typedef union {
 	__be16 key_len;
 	struct hfsplus_cat_key cat;
 	struct hfsplus_ext_key ext;
+	struct hfsplus_attr_key attr;
 } __packed hfsplus_btree_key;
 
 #endif
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 799b336b59f..f833d35630a 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -14,9 +14,11 @@
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/sched.h>
+#include <linux/aio.h>
 
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
+#include "xattr.h"
 
 static int hfsplus_readpage(struct file *file, struct page *page)
 {
@@ -124,7 +126,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
+	struct inode *inode = file_inode(file)->i_mapping->host;
 	ssize_t ret;
 
 	ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
@@ -348,6 +350,18 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
 			error = error2;
 	}
 
+	if (test_and_clear_bit(HFSPLUS_I_ATTR_DIRTY, &hip->flags)) {
+		if (sbi->attr_tree) {
+			error2 =
+				filemap_write_and_wait(
+					    sbi->attr_tree->inode->i_mapping);
+			if (!error)
+				error = error2;
+		} else {
+			pr_err("sync non-existent attributes tree\n");
+		}
+	}
+
 	if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) {
 		error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
 		if (!error)
@@ -365,9 +379,10 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
 static const struct inode_operations hfsplus_file_inode_operations = {
 	.lookup		= hfsplus_file_lookup,
 	.setattr	= hfsplus_setattr,
-	.setxattr	= hfsplus_setxattr,
-	.getxattr	= hfsplus_getxattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
 	.listxattr	= hfsplus_listxattr,
+	.removexattr	= hfsplus_removexattr,
 };
 
 static const struct file_operations hfsplus_file_operations = {
@@ -559,7 +574,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 		inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
 		HFSPLUS_I(inode)->create_date = file->create_date;
 	} else {
-		printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
+		pr_err("bad catalog entry used to create inode\n");
 		res = -EIO;
 	}
 	return res;
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 09addc8615f..d3ff5cc317d 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -16,7 +16,6 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/sched.h>
-#include <linux/xattr.h>
 #include <asm/uaccess.h>
 #include "hfsplus_fs.h"
 
@@ -59,7 +58,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags)
 
 static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	unsigned int flags = 0;
 
@@ -75,7 +74,7 @@ static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
 
 static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
 	unsigned int flags;
 	int err = 0;
@@ -151,110 +150,3 @@ long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return -ENOTTY;
 	}
 }
-
-int hfsplus_setxattr(struct dentry *dentry, const char *name,
-		     const void *value, size_t size, int flags)
-{
-	struct inode *inode = dentry->d_inode;
-	struct hfs_find_data fd;
-	hfsplus_cat_entry entry;
-	struct hfsplus_cat_file *file;
-	int res;
-
-	if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
-		return -EOPNOTSUPP;
-
-	res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
-	if (res)
-		return res;
-	res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
-	if (res)
-		goto out;
-	hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
-			sizeof(struct hfsplus_cat_file));
-	file = &entry.file;
-
-	if (!strcmp(name, "hfs.type")) {
-		if (size == 4)
-			memcpy(&file->user_info.fdType, value, 4);
-		else
-			res = -ERANGE;
-	} else if (!strcmp(name, "hfs.creator")) {
-		if (size == 4)
-			memcpy(&file->user_info.fdCreator, value, 4);
-		else
-			res = -ERANGE;
-	} else
-		res = -EOPNOTSUPP;
-	if (!res) {
-		hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
-				sizeof(struct hfsplus_cat_file));
-		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
-	}
-out:
-	hfs_find_exit(&fd);
-	return res;
-}
-
-ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
-			 void *value, size_t size)
-{
-	struct inode *inode = dentry->d_inode;
-	struct hfs_find_data fd;
-	hfsplus_cat_entry entry;
-	struct hfsplus_cat_file *file;
-	ssize_t res = 0;
-
-	if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
-		return -EOPNOTSUPP;
-
-	if (size) {
-		res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
-		if (res)
-			return res;
-		res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
-		if (res)
-			goto out;
-		hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
-				sizeof(struct hfsplus_cat_file));
-	}
-	file = &entry.file;
-
-	if (!strcmp(name, "hfs.type")) {
-		if (size >= 4) {
-			memcpy(value, &file->user_info.fdType, 4);
-			res = 4;
-		} else
-			res = size ? -ERANGE : 4;
-	} else if (!strcmp(name, "hfs.creator")) {
-		if (size >= 4) {
-			memcpy(value, &file->user_info.fdCreator, 4);
-			res = 4;
-		} else
-			res = size ? -ERANGE : 4;
-	} else
-		res = -EOPNOTSUPP;
-out:
-	if (size)
-		hfs_find_exit(&fd);
-	return res;
-}
-
-#define HFSPLUS_ATTRLIST_SIZE (sizeof("hfs.creator")+sizeof("hfs.type"))
-
-ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
-	struct inode *inode = dentry->d_inode;
-
-	if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
-		return -EOPNOTSUPP;
-
-	if (!buffer || !size)
-		return HFSPLUS_ATTRLIST_SIZE;
-	if (size < HFSPLUS_ATTRLIST_SIZE)
-		return -ERANGE;
-	strcpy(buffer, "hfs.type");
-	strcpy(buffer + sizeof("hfs.type"), "hfs.creator");
-
-	return HFSPLUS_ATTRLIST_SIZE;
-}
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index ed257c67161..968eab5bc1f 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -113,67 +113,67 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
 		switch (token) {
 		case opt_creator:
 			if (match_fourchar(&args[0], &sbi->creator)) {
-				printk(KERN_ERR "hfs: creator requires a 4 character value\n");
+				pr_err("creator requires a 4 character value\n");
 				return 0;
 			}
 			break;
 		case opt_type:
 			if (match_fourchar(&args[0], &sbi->type)) {
-				printk(KERN_ERR "hfs: type requires a 4 character value\n");
+				pr_err("type requires a 4 character value\n");
 				return 0;
 			}
 			break;
 		case opt_umask:
 			if (match_octal(&args[0], &tmp)) {
-				printk(KERN_ERR "hfs: umask requires a value\n");
+				pr_err("umask requires a value\n");
 				return 0;
 			}
 			sbi->umask = (umode_t)tmp;
 			break;
 		case opt_uid:
 			if (match_int(&args[0], &tmp)) {
-				printk(KERN_ERR "hfs: uid requires an argument\n");
+				pr_err("uid requires an argument\n");
 				return 0;
 			}
 			sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp);
 			if (!uid_valid(sbi->uid)) {
-				printk(KERN_ERR "hfs: invalid uid specified\n");
+				pr_err("invalid uid specified\n");
 				return 0;
 			}
 			break;
 		case opt_gid:
 			if (match_int(&args[0], &tmp)) {
-				printk(KERN_ERR "hfs: gid requires an argument\n");
+				pr_err("gid requires an argument\n");
 				return 0;
 			}
 			sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp);
 			if (!gid_valid(sbi->gid)) {
-				printk(KERN_ERR "hfs: invalid gid specified\n");
+				pr_err("invalid gid specified\n");
 				return 0;
 			}
 			break;
 		case opt_part:
 			if (match_int(&args[0], &sbi->part)) {
-				printk(KERN_ERR "hfs: part requires an argument\n");
+				pr_err("part requires an argument\n");
 				return 0;
 			}
 			break;
 		case opt_session:
 			if (match_int(&args[0], &sbi->session)) {
-				printk(KERN_ERR "hfs: session requires an argument\n");
+				pr_err("session requires an argument\n");
 				return 0;
 			}
 			break;
 		case opt_nls:
 			if (sbi->nls) {
-				printk(KERN_ERR "hfs: unable to change nls mapping\n");
+				pr_err("unable to change nls mapping\n");
 				return 0;
 			}
 			p = match_strdup(&args[0]);
 			if (p)
 				sbi->nls = load_nls(p);
 			if (!sbi->nls) {
-				printk(KERN_ERR "hfs: unable to load "
+				pr_err("unable to load "
 						"nls mapping \"%s\"\n",
 					p);
 				kfree(p);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 796198d2655..4c4d142cf89 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -20,6 +20,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb);
 static void hfsplus_destroy_inode(struct inode *inode);
 
 #include "hfsplus_fs.h"
+#include "xattr.h"
 
 static int hfsplus_system_read_inode(struct inode *inode)
 {
@@ -118,6 +119,7 @@ static int hfsplus_system_write_inode(struct inode *inode)
 	case HFSPLUS_ATTR_CNID:
 		fork = &vhdr->attr_file;
 		tree = sbi->attr_tree;
+		break;
 	default:
 		return -EIO;
 	}
@@ -130,7 +132,7 @@ static int hfsplus_system_write_inode(struct inode *inode)
 	if (tree) {
 		int err = hfs_btree_write(tree);
 		if (err) {
-			printk(KERN_ERR "hfs: b-tree write err: %d, ino %lu\n",
+			pr_err("b-tree write err: %d, ino %lu\n",
 					err, inode->i_ino);
 			return err;
 		}
@@ -143,7 +145,7 @@ static int hfsplus_write_inode(struct inode *inode,
 {
 	int err;
 
-	dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
+	hfs_dbg(INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
 
 	err = hfsplus_ext_write_extent(inode);
 	if (err)
@@ -158,7 +160,7 @@ static int hfsplus_write_inode(struct inode *inode,
 
 static void hfsplus_evict_inode(struct inode *inode)
 {
-	dprint(DBG_INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
+	hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
 	truncate_inode_pages(&inode->i_data, 0);
 	clear_inode(inode);
 	if (HFSPLUS_IS_RSRC(inode)) {
@@ -177,7 +179,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
 	if (!wait)
 		return 0;
 
-	dprint(DBG_SUPER, "hfsplus_sync_fs\n");
+	hfs_dbg(SUPER, "hfsplus_sync_fs\n");
 
 	/*
 	 * Explicitly write out the special metadata inodes.
@@ -191,6 +193,12 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
 	error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping);
 	if (!error)
 		error = error2;
+	if (sbi->attr_tree) {
+		error2 =
+		    filemap_write_and_wait(sbi->attr_tree->inode->i_mapping);
+		if (!error)
+			error = error2;
+	}
 	error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping);
 	if (!error)
 		error = error2;
@@ -243,7 +251,7 @@ static void delayed_sync_fs(struct work_struct *work)
 
 	err = hfsplus_sync_fs(sbi->alloc_file->i_sb, 1);
 	if (err)
-		printk(KERN_ERR "hfs: delayed sync fs err %d\n", err);
+		pr_err("delayed sync fs err %d\n", err);
 }
 
 void hfsplus_mark_mdb_dirty(struct super_block *sb)
@@ -267,7 +275,7 @@ static void hfsplus_put_super(struct super_block *sb)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 
-	dprint(DBG_SUPER, "hfsplus_put_super\n");
+	hfs_dbg(SUPER, "hfsplus_put_super\n");
 
 	cancel_delayed_work_sync(&sbi->sync_work);
 
@@ -281,6 +289,7 @@ static void hfsplus_put_super(struct super_block *sb)
 		hfsplus_sync_fs(sb, 1);
 	}
 
+	hfs_btree_close(sbi->attr_tree);
 	hfs_btree_close(sbi->cat_tree);
 	hfs_btree_close(sbi->ext_tree);
 	iput(sbi->alloc_file);
@@ -324,25 +333,19 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
 			return -EINVAL;
 
 		if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
-			printk(KERN_WARNING "hfs: filesystem was "
-					"not cleanly unmounted, "
-					"running fsck.hfsplus is recommended.  "
-					"leaving read-only.\n");
+			pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended.  leaving read-only.\n");
 			sb->s_flags |= MS_RDONLY;
 			*flags |= MS_RDONLY;
 		} else if (force) {
 			/* nothing */
 		} else if (vhdr->attributes &
 				cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
-			printk(KERN_WARNING "hfs: filesystem is marked locked, "
-					"leaving read-only.\n");
+			pr_warn("filesystem is marked locked, leaving read-only.\n");
 			sb->s_flags |= MS_RDONLY;
 			*flags |= MS_RDONLY;
 		} else if (vhdr->attributes &
 				cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
-			printk(KERN_WARNING "hfs: filesystem is "
-					"marked journaled, "
-					"leaving read-only.\n");
+			pr_warn("filesystem is marked journaled, leaving read-only.\n");
 			sb->s_flags |= MS_RDONLY;
 			*flags |= MS_RDONLY;
 		}
@@ -388,7 +391,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 
 	err = -EINVAL;
 	if (!hfsplus_parse_options(data, sbi)) {
-		printk(KERN_ERR "hfs: unable to parse mount options\n");
+		pr_err("unable to parse mount options\n");
 		goto out_unload_nls;
 	}
 
@@ -396,14 +399,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	nls = sbi->nls;
 	sbi->nls = load_nls("utf8");
 	if (!sbi->nls) {
-		printk(KERN_ERR "hfs: unable to load nls for utf8\n");
+		pr_err("unable to load nls for utf8\n");
 		goto out_unload_nls;
 	}
 
 	/* Grab the volume header */
 	if (hfsplus_read_wrapper(sb)) {
 		if (!silent)
-			printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n");
+			pr_warn("unable to find HFS+ superblock\n");
 		goto out_unload_nls;
 	}
 	vhdr = sbi->s_vhdr;
@@ -412,7 +415,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_magic = HFSPLUS_VOLHEAD_SIG;
 	if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION ||
 	    be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) {
-		printk(KERN_ERR "hfs: wrong filesystem version\n");
+		pr_err("wrong filesystem version\n");
 		goto out_free_vhdr;
 	}
 	sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
@@ -436,7 +439,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 
 	if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) ||
 	    (last_fs_page > (pgoff_t)(~0ULL))) {
-		printk(KERN_ERR "hfs: filesystem size too large.\n");
+		pr_err("filesystem size too large\n");
 		goto out_free_vhdr;
 	}
 
@@ -445,22 +448,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 
 	if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
-		printk(KERN_WARNING "hfs: Filesystem was "
-				"not cleanly unmounted, "
-				"running fsck.hfsplus is recommended.  "
-				"mounting read-only.\n");
+		pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended.  mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
 	} else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
 		/* nothing */
 	} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
-		printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
+		pr_warn("Filesystem is marked locked, mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
 	} else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) &&
 			!(sb->s_flags & MS_RDONLY)) {
-		printk(KERN_WARNING "hfs: write access to "
-				"a journaled filesystem is not supported, "
-				"use the force option at your own risk, "
-				"mounting read-only.\n");
+		pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
 	}
 
@@ -469,27 +466,35 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	/* Load metadata objects (B*Trees) */
 	sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
 	if (!sbi->ext_tree) {
-		printk(KERN_ERR "hfs: failed to load extents file\n");
+		pr_err("failed to load extents file\n");
 		goto out_free_vhdr;
 	}
 	sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
 	if (!sbi->cat_tree) {
-		printk(KERN_ERR "hfs: failed to load catalog file\n");
+		pr_err("failed to load catalog file\n");
 		goto out_close_ext_tree;
 	}
+	if (vhdr->attr_file.total_blocks != 0) {
+		sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID);
+		if (!sbi->attr_tree) {
+			pr_err("failed to load attributes file\n");
+			goto out_close_cat_tree;
+		}
+	}
+	sb->s_xattr = hfsplus_xattr_handlers;
 
 	inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);
 	if (IS_ERR(inode)) {
-		printk(KERN_ERR "hfs: failed to load allocation file\n");
+		pr_err("failed to load allocation file\n");
 		err = PTR_ERR(inode);
-		goto out_close_cat_tree;
+		goto out_close_attr_tree;
 	}
 	sbi->alloc_file = inode;
 
 	/* Load the root directory */
 	root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
 	if (IS_ERR(root)) {
-		printk(KERN_ERR "hfs: failed to load root directory\n");
+		pr_err("failed to load root directory\n");
 		err = PTR_ERR(root);
 		goto out_put_alloc_file;
 	}
@@ -542,10 +547,27 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 			}
 			err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root,
 						 &str, sbi->hidden_dir);
-			mutex_unlock(&sbi->vh_mutex);
-			if (err)
+			if (err) {
+				mutex_unlock(&sbi->vh_mutex);
+				goto out_put_hidden_dir;
+			}
+
+			err = hfsplus_init_inode_security(sbi->hidden_dir,
+								root, &str);
+			if (err == -EOPNOTSUPP)
+				err = 0; /* Operation is not supported. */
+			else if (err) {
+				/*
+				 * Try to delete anyway without
+				 * error analysis.
+				 */
+				hfsplus_delete_cat(sbi->hidden_dir->i_ino,
+							root, &str);
+				mutex_unlock(&sbi->vh_mutex);
 				goto out_put_hidden_dir;
+			}
 
+			mutex_unlock(&sbi->vh_mutex);
 			hfsplus_mark_inode_dirty(sbi->hidden_dir,
 						 HFSPLUS_I_CAT_DIRTY);
 		}
@@ -562,6 +584,8 @@ out_put_root:
 	sb->s_root = NULL;
 out_put_alloc_file:
 	iput(sbi->alloc_file);
+out_close_attr_tree:
+	hfs_btree_close(sbi->attr_tree);
 out_close_cat_tree:
 	hfs_btree_close(sbi->cat_tree);
 out_close_ext_tree:
@@ -618,6 +642,7 @@ static struct file_system_type hfsplus_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("hfsplus");
 
 static void hfsplus_init_once(void *p)
 {
@@ -635,9 +660,20 @@ static int __init init_hfsplus_fs(void)
 		hfsplus_init_once);
 	if (!hfsplus_inode_cachep)
 		return -ENOMEM;
+	err = hfsplus_create_attr_tree_cache();
+	if (err)
+		goto destroy_inode_cache;
 	err = register_filesystem(&hfsplus_fs_type);
 	if (err)
-		kmem_cache_destroy(hfsplus_inode_cachep);
+		goto destroy_attr_tree_cache;
+	return 0;
+
+destroy_attr_tree_cache:
+	hfsplus_destroy_attr_tree_cache();
+
+destroy_inode_cache:
+	kmem_cache_destroy(hfsplus_inode_cachep);
+
 	return err;
 }
 
@@ -650,6 +686,7 @@ static void __exit exit_hfsplus_fs(void)
 	 * destroy cache.
 	 */
 	rcu_barrier();
+	hfsplus_destroy_attr_tree_cache();
 	kmem_cache_destroy(hfsplus_inode_cachep);
 }
 
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index a32998f29f0..e8ef121a4d8 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -295,7 +295,8 @@ static inline u16 *decompose_unichar(wchar_t uc, int *size)
 	return hfsplus_decompose_table + (off / 4);
 }
 
-int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
+int hfsplus_asc2uni(struct super_block *sb,
+		    struct hfsplus_unistr *ustr, int max_unistr_len,
 		    const char *astr, int len)
 {
 	int size, dsize, decompose;
@@ -303,7 +304,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
 	wchar_t c;
 
 	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
-	while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
+	while (outlen < max_unistr_len && len > 0) {
 		size = asc2unichar(sb, astr, len, &c);
 
 		if (decompose)
@@ -311,7 +312,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
 		else
 			dstr = NULL;
 		if (dstr) {
-			if (outlen + dsize > HFSPLUS_MAX_STRLEN)
+			if (outlen + dsize > max_unistr_len)
 				break;
 			do {
 				ustr->unicode[outlen++] = cpu_to_be16(*dstr++);
@@ -333,8 +334,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
  * Composed unicode characters are decomposed and case-folding is performed
  * if the appropriate bits are (un)set on the superblock.
  */
-int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *str)
+int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
 {
 	struct super_block *sb = dentry->d_sb;
 	const char *astr;
@@ -385,9 +385,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
  * Composed unicode characters are decomposed and case-folding is performed
  * if the appropriate bits are (un)set on the superblock.
  */
-int hfsplus_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	struct super_block *sb = parent->d_sb;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 90effcccca9..b51a6079108 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -156,7 +156,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
 			*start = (sector_t)te.cdte_addr.lba << 2;
 			return 0;
 		}
-		printk(KERN_ERR "hfs: invalid session number or type of track\n");
+		pr_err("invalid session number or type of track\n");
 		return -EINVAL;
 	}
 	ms_info.addr_format = CDROM_LBA;
@@ -234,8 +234,7 @@ reread:
 
 	error = -EINVAL;
 	if (sbi->s_backup_vhdr->signature != sbi->s_vhdr->signature) {
-		printk(KERN_WARNING
-			"hfs: invalid secondary volume header\n");
+		pr_warn("invalid secondary volume header\n");
 		goto out_free_backup_vhdr;
 	}
 
@@ -259,8 +258,7 @@ reread:
 		blocksize >>= 1;
 
 	if (sb_set_blocksize(sb, blocksize) != blocksize) {
-		printk(KERN_ERR "hfs: unable to set blocksize to %u!\n",
-			blocksize);
+		pr_err("unable to set blocksize to %u!\n", blocksize);
 		goto out_free_backup_vhdr;
 	}
 
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
new file mode 100644
index 00000000000..f66346155df
--- /dev/null
+++ b/fs/hfsplus/xattr.c
@@ -0,0 +1,708 @@
+/*
+ * linux/fs/hfsplus/xattr.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Logic of processing extended attributes
+ */
+
+#include "hfsplus_fs.h"
+#include "xattr.h"
+
+const struct xattr_handler *hfsplus_xattr_handlers[] = {
+	&hfsplus_xattr_osx_handler,
+	&hfsplus_xattr_user_handler,
+	&hfsplus_xattr_trusted_handler,
+	&hfsplus_xattr_security_handler,
+	NULL
+};
+
+static int strcmp_xattr_finder_info(const char *name)
+{
+	if (name) {
+		return strncmp(name, HFSPLUS_XATTR_FINDER_INFO_NAME,
+				sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME));
+	}
+	return -1;
+}
+
+static int strcmp_xattr_acl(const char *name)
+{
+	if (name) {
+		return strncmp(name, HFSPLUS_XATTR_ACL_NAME,
+				sizeof(HFSPLUS_XATTR_ACL_NAME));
+	}
+	return -1;
+}
+
+static inline int is_known_namespace(const char *name)
+{
+	if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) &&
+	    strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
+	    strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
+	    strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+		return false;
+
+	return true;
+}
+
+static int can_set_xattr(struct inode *inode, const char *name,
+				const void *value, size_t value_len)
+{
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return -EOPNOTSUPP; /* TODO: implement ACL support */
+
+	if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) {
+		/*
+		 * This makes sure that we aren't trying to set an
+		 * attribute in a different namespace by prefixing it
+		 * with "osx."
+		 */
+		if (is_known_namespace(name + XATTR_MAC_OSX_PREFIX_LEN))
+			return -EOPNOTSUPP;
+
+		return 0;
+	}
+
+	/*
+	 * Don't allow setting an attribute in an unknown namespace.
+	 */
+	if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) &&
+	    strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
+	    strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+int __hfsplus_setxattr(struct inode *inode, const char *name,
+			const void *value, size_t size, int flags)
+{
+	int err = 0;
+	struct hfs_find_data cat_fd;
+	hfsplus_cat_entry entry;
+	u16 cat_entry_flags, cat_entry_type;
+	u16 folder_finderinfo_len = sizeof(struct DInfo) +
+					sizeof(struct DXInfo);
+	u16 file_finderinfo_len = sizeof(struct FInfo) +
+					sizeof(struct FXInfo);
+
+	if ((!S_ISREG(inode->i_mode) &&
+			!S_ISDIR(inode->i_mode)) ||
+				HFSPLUS_IS_RSRC(inode))
+		return -EOPNOTSUPP;
+
+	err = can_set_xattr(inode, name, value, size);
+	if (err)
+		return err;
+
+	if (strncmp(name, XATTR_MAC_OSX_PREFIX,
+				XATTR_MAC_OSX_PREFIX_LEN) == 0)
+		name += XATTR_MAC_OSX_PREFIX_LEN;
+
+	if (value == NULL) {
+		value = "";
+		size = 0;
+	}
+
+	err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd);
+	if (err) {
+		pr_err("can't init xattr find struct\n");
+		return err;
+	}
+
+	err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd);
+	if (err) {
+		pr_err("catalog searching failed\n");
+		goto end_setxattr;
+	}
+
+	if (!strcmp_xattr_finder_info(name)) {
+		if (flags & XATTR_CREATE) {
+			pr_err("xattr exists yet\n");
+			err = -EOPNOTSUPP;
+			goto end_setxattr;
+		}
+		hfs_bnode_read(cat_fd.bnode, &entry, cat_fd.entryoffset,
+					sizeof(hfsplus_cat_entry));
+		if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) {
+			if (size == folder_finderinfo_len) {
+				memcpy(&entry.folder.user_info, value,
+						folder_finderinfo_len);
+				hfs_bnode_write(cat_fd.bnode, &entry,
+					cat_fd.entryoffset,
+					sizeof(struct hfsplus_cat_folder));
+				hfsplus_mark_inode_dirty(inode,
+						HFSPLUS_I_CAT_DIRTY);
+			} else {
+				err = -ERANGE;
+				goto end_setxattr;
+			}
+		} else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) {
+			if (size == file_finderinfo_len) {
+				memcpy(&entry.file.user_info, value,
+						file_finderinfo_len);
+				hfs_bnode_write(cat_fd.bnode, &entry,
+					cat_fd.entryoffset,
+					sizeof(struct hfsplus_cat_file));
+				hfsplus_mark_inode_dirty(inode,
+						HFSPLUS_I_CAT_DIRTY);
+			} else {
+				err = -ERANGE;
+				goto end_setxattr;
+			}
+		} else {
+			err = -EOPNOTSUPP;
+			goto end_setxattr;
+		}
+		goto end_setxattr;
+	}
+
+	if (!HFSPLUS_SB(inode->i_sb)->attr_tree) {
+		err = -EOPNOTSUPP;
+		goto end_setxattr;
+	}
+
+	if (hfsplus_attr_exists(inode, name)) {
+		if (flags & XATTR_CREATE) {
+			pr_err("xattr exists yet\n");
+			err = -EOPNOTSUPP;
+			goto end_setxattr;
+		}
+		err = hfsplus_delete_attr(inode, name);
+		if (err)
+			goto end_setxattr;
+		err = hfsplus_create_attr(inode, name, value, size);
+		if (err)
+			goto end_setxattr;
+	} else {
+		if (flags & XATTR_REPLACE) {
+			pr_err("cannot replace xattr\n");
+			err = -EOPNOTSUPP;
+			goto end_setxattr;
+		}
+		err = hfsplus_create_attr(inode, name, value, size);
+		if (err)
+			goto end_setxattr;
+	}
+
+	cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset);
+	if (cat_entry_type == HFSPLUS_FOLDER) {
+		cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode,
+				    cat_fd.entryoffset +
+				    offsetof(struct hfsplus_cat_folder, flags));
+		cat_entry_flags |= HFSPLUS_XATTR_EXISTS;
+		if (!strcmp_xattr_acl(name))
+			cat_entry_flags |= HFSPLUS_ACL_EXISTS;
+		hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset +
+				offsetof(struct hfsplus_cat_folder, flags),
+				cat_entry_flags);
+		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+	} else if (cat_entry_type == HFSPLUS_FILE) {
+		cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode,
+				    cat_fd.entryoffset +
+				    offsetof(struct hfsplus_cat_file, flags));
+		cat_entry_flags |= HFSPLUS_XATTR_EXISTS;
+		if (!strcmp_xattr_acl(name))
+			cat_entry_flags |= HFSPLUS_ACL_EXISTS;
+		hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset +
+				    offsetof(struct hfsplus_cat_file, flags),
+				    cat_entry_flags);
+		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+	} else {
+		pr_err("invalid catalog entry type\n");
+		err = -EIO;
+		goto end_setxattr;
+	}
+
+end_setxattr:
+	hfs_find_exit(&cat_fd);
+	return err;
+}
+
+static inline int is_osx_xattr(const char *xattr_name)
+{
+	return !is_known_namespace(xattr_name);
+}
+
+static int name_len(const char *xattr_name, int xattr_name_len)
+{
+	int len = xattr_name_len + 1;
+
+	if (is_osx_xattr(xattr_name))
+		len += XATTR_MAC_OSX_PREFIX_LEN;
+
+	return len;
+}
+
+static int copy_name(char *buffer, const char *xattr_name, int name_len)
+{
+	int len = name_len;
+	int offset = 0;
+
+	if (is_osx_xattr(xattr_name)) {
+		strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN);
+		offset += XATTR_MAC_OSX_PREFIX_LEN;
+		len += XATTR_MAC_OSX_PREFIX_LEN;
+	}
+
+	strncpy(buffer + offset, xattr_name, name_len);
+	memset(buffer + offset + name_len, 0, 1);
+	len += 1;
+
+	return len;
+}
+
+static ssize_t hfsplus_getxattr_finder_info(struct dentry *dentry,
+						void *value, size_t size)
+{
+	ssize_t res = 0;
+	struct inode *inode = dentry->d_inode;
+	struct hfs_find_data fd;
+	u16 entry_type;
+	u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo);
+	u16 file_rec_len = sizeof(struct FInfo) + sizeof(struct FXInfo);
+	u16 record_len = max(folder_rec_len, file_rec_len);
+	u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)];
+	u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)];
+
+	if (size >= record_len) {
+		res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
+		if (res) {
+			pr_err("can't init xattr find struct\n");
+			return res;
+		}
+		res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
+		if (res)
+			goto end_getxattr_finder_info;
+		entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset);
+
+		if (entry_type == HFSPLUS_FOLDER) {
+			hfs_bnode_read(fd.bnode, folder_finder_info,
+				fd.entryoffset +
+				offsetof(struct hfsplus_cat_folder, user_info),
+				folder_rec_len);
+			memcpy(value, folder_finder_info, folder_rec_len);
+			res = folder_rec_len;
+		} else if (entry_type == HFSPLUS_FILE) {
+			hfs_bnode_read(fd.bnode, file_finder_info,
+				fd.entryoffset +
+				offsetof(struct hfsplus_cat_file, user_info),
+				file_rec_len);
+			memcpy(value, file_finder_info, file_rec_len);
+			res = file_rec_len;
+		} else {
+			res = -EOPNOTSUPP;
+			goto end_getxattr_finder_info;
+		}
+	} else
+		res = size ? -ERANGE : record_len;
+
+end_getxattr_finder_info:
+	if (size >= record_len)
+		hfs_find_exit(&fd);
+	return res;
+}
+
+ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
+			 void *value, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct hfs_find_data fd;
+	hfsplus_attr_entry *entry;
+	__be32 xattr_record_type;
+	u32 record_type;
+	u16 record_length = 0;
+	ssize_t res = 0;
+
+	if ((!S_ISREG(inode->i_mode) &&
+			!S_ISDIR(inode->i_mode)) ||
+				HFSPLUS_IS_RSRC(inode))
+		return -EOPNOTSUPP;
+
+	if (strncmp(name, XATTR_MAC_OSX_PREFIX,
+				XATTR_MAC_OSX_PREFIX_LEN) == 0) {
+		/* skip "osx." prefix */
+		name += XATTR_MAC_OSX_PREFIX_LEN;
+		/*
+		 * Don't allow retrieving properly prefixed attributes
+		 * by prepending them with "osx."
+		 */
+		if (is_known_namespace(name))
+			return -EOPNOTSUPP;
+	}
+
+	if (!strcmp_xattr_finder_info(name))
+		return hfsplus_getxattr_finder_info(dentry, value, size);
+
+	if (!HFSPLUS_SB(inode->i_sb)->attr_tree)
+		return -EOPNOTSUPP;
+
+	entry = hfsplus_alloc_attr_entry();
+	if (!entry) {
+		pr_err("can't allocate xattr entry\n");
+		return -ENOMEM;
+	}
+
+	res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd);
+	if (res) {
+		pr_err("can't init xattr find struct\n");
+		goto failed_getxattr_init;
+	}
+
+	res = hfsplus_find_attr(inode->i_sb, inode->i_ino, name, &fd);
+	if (res) {
+		if (res == -ENOENT)
+			res = -ENODATA;
+		else
+			pr_err("xattr searching failed\n");
+		goto out;
+	}
+
+	hfs_bnode_read(fd.bnode, &xattr_record_type,
+			fd.entryoffset, sizeof(xattr_record_type));
+	record_type = be32_to_cpu(xattr_record_type);
+	if (record_type == HFSPLUS_ATTR_INLINE_DATA) {
+		record_length = hfs_bnode_read_u16(fd.bnode,
+				fd.entryoffset +
+				offsetof(struct hfsplus_attr_inline_data,
+				length));
+		if (record_length > HFSPLUS_MAX_INLINE_DATA_SIZE) {
+			pr_err("invalid xattr record size\n");
+			res = -EIO;
+			goto out;
+		}
+	} else if (record_type == HFSPLUS_ATTR_FORK_DATA ||
+			record_type == HFSPLUS_ATTR_EXTENTS) {
+		pr_err("only inline data xattr are supported\n");
+		res = -EOPNOTSUPP;
+		goto out;
+	} else {
+		pr_err("invalid xattr record\n");
+		res = -EIO;
+		goto out;
+	}
+
+	if (size) {
+		hfs_bnode_read(fd.bnode, entry, fd.entryoffset,
+				offsetof(struct hfsplus_attr_inline_data,
+					raw_bytes) + record_length);
+	}
+
+	if (size >= record_length) {
+		memcpy(value, entry->inline_data.raw_bytes, record_length);
+		res = record_length;
+	} else
+		res = size ? -ERANGE : record_length;
+
+out:
+	hfs_find_exit(&fd);
+
+failed_getxattr_init:
+	hfsplus_destroy_attr_entry(entry);
+	return res;
+}
+
+static inline int can_list(const char *xattr_name)
+{
+	if (!xattr_name)
+		return 0;
+
+	return strncmp(xattr_name, XATTR_TRUSTED_PREFIX,
+			XATTR_TRUSTED_PREFIX_LEN) ||
+				capable(CAP_SYS_ADMIN);
+}
+
+static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry,
+						char *buffer, size_t size)
+{
+	ssize_t res = 0;
+	struct inode *inode = dentry->d_inode;
+	struct hfs_find_data fd;
+	u16 entry_type;
+	u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)];
+	u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)];
+	unsigned long len, found_bit;
+	int xattr_name_len, symbols_count;
+
+	res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
+	if (res) {
+		pr_err("can't init xattr find struct\n");
+		return res;
+	}
+
+	res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
+	if (res)
+		goto end_listxattr_finder_info;
+
+	entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset);
+	if (entry_type == HFSPLUS_FOLDER) {
+		len = sizeof(struct DInfo) + sizeof(struct DXInfo);
+		hfs_bnode_read(fd.bnode, folder_finder_info,
+				fd.entryoffset +
+				offsetof(struct hfsplus_cat_folder, user_info),
+				len);
+		found_bit = find_first_bit((void *)folder_finder_info, len*8);
+	} else if (entry_type == HFSPLUS_FILE) {
+		len = sizeof(struct FInfo) + sizeof(struct FXInfo);
+		hfs_bnode_read(fd.bnode, file_finder_info,
+				fd.entryoffset +
+				offsetof(struct hfsplus_cat_file, user_info),
+				len);
+		found_bit = find_first_bit((void *)file_finder_info, len*8);
+	} else {
+		res = -EOPNOTSUPP;
+		goto end_listxattr_finder_info;
+	}
+
+	if (found_bit >= (len*8))
+		res = 0;
+	else {
+		symbols_count = sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME) - 1;
+		xattr_name_len =
+			name_len(HFSPLUS_XATTR_FINDER_INFO_NAME, symbols_count);
+		if (!buffer || !size) {
+			if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME))
+				res = xattr_name_len;
+		} else if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME)) {
+			if (size < xattr_name_len)
+				res = -ERANGE;
+			else {
+				res = copy_name(buffer,
+						HFSPLUS_XATTR_FINDER_INFO_NAME,
+						symbols_count);
+			}
+		}
+	}
+
+end_listxattr_finder_info:
+	hfs_find_exit(&fd);
+
+	return res;
+}
+
+ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	ssize_t err;
+	ssize_t res = 0;
+	struct inode *inode = dentry->d_inode;
+	struct hfs_find_data fd;
+	u16 key_len = 0;
+	struct hfsplus_attr_key attr_key;
+	char strbuf[HFSPLUS_ATTR_MAX_STRLEN +
+			XATTR_MAC_OSX_PREFIX_LEN + 1] = {0};
+	int xattr_name_len;
+
+	if ((!S_ISREG(inode->i_mode) &&
+			!S_ISDIR(inode->i_mode)) ||
+				HFSPLUS_IS_RSRC(inode))
+		return -EOPNOTSUPP;
+
+	res = hfsplus_listxattr_finder_info(dentry, buffer, size);
+	if (res < 0)
+		return res;
+	else if (!HFSPLUS_SB(inode->i_sb)->attr_tree)
+		return (res == 0) ? -EOPNOTSUPP : res;
+
+	err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd);
+	if (err) {
+		pr_err("can't init xattr find struct\n");
+		return err;
+	}
+
+	err = hfsplus_find_attr(inode->i_sb, inode->i_ino, NULL, &fd);
+	if (err) {
+		if (err == -ENOENT) {
+			if (res == 0)
+				res = -ENODATA;
+			goto end_listxattr;
+		} else {
+			res = err;
+			goto end_listxattr;
+		}
+	}
+
+	for (;;) {
+		key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset);
+		if (key_len == 0 || key_len > fd.tree->max_key_len) {
+			pr_err("invalid xattr key length: %d\n", key_len);
+			res = -EIO;
+			goto end_listxattr;
+		}
+
+		hfs_bnode_read(fd.bnode, &attr_key,
+				fd.keyoffset, key_len + sizeof(key_len));
+
+		if (be32_to_cpu(attr_key.cnid) != inode->i_ino)
+			goto end_listxattr;
+
+		xattr_name_len = HFSPLUS_ATTR_MAX_STRLEN;
+		if (hfsplus_uni2asc(inode->i_sb,
+			(const struct hfsplus_unistr *)&fd.key->attr.key_name,
+					strbuf, &xattr_name_len)) {
+			pr_err("unicode conversion failed\n");
+			res = -EIO;
+			goto end_listxattr;
+		}
+
+		if (!buffer || !size) {
+			if (can_list(strbuf))
+				res += name_len(strbuf, xattr_name_len);
+		} else if (can_list(strbuf)) {
+			if (size < (res + name_len(strbuf, xattr_name_len))) {
+				res = -ERANGE;
+				goto end_listxattr;
+			} else
+				res += copy_name(buffer + res,
+						strbuf, xattr_name_len);
+		}
+
+		if (hfs_brec_goto(&fd, 1))
+			goto end_listxattr;
+	}
+
+end_listxattr:
+	hfs_find_exit(&fd);
+	return res;
+}
+
+int hfsplus_removexattr(struct dentry *dentry, const char *name)
+{
+	int err = 0;
+	struct inode *inode = dentry->d_inode;
+	struct hfs_find_data cat_fd;
+	u16 flags;
+	u16 cat_entry_type;
+	int is_xattr_acl_deleted = 0;
+	int is_all_xattrs_deleted = 0;
+
+	if ((!S_ISREG(inode->i_mode) &&
+			!S_ISDIR(inode->i_mode)) ||
+				HFSPLUS_IS_RSRC(inode))
+		return -EOPNOTSUPP;
+
+	if (!HFSPLUS_SB(inode->i_sb)->attr_tree)
+		return -EOPNOTSUPP;
+
+	err = can_set_xattr(inode, name, NULL, 0);
+	if (err)
+		return err;
+
+	if (strncmp(name, XATTR_MAC_OSX_PREFIX,
+				XATTR_MAC_OSX_PREFIX_LEN) == 0)
+		name += XATTR_MAC_OSX_PREFIX_LEN;
+
+	if (!strcmp_xattr_finder_info(name))
+		return -EOPNOTSUPP;
+
+	err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd);
+	if (err) {
+		pr_err("can't init xattr find struct\n");
+		return err;
+	}
+
+	err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd);
+	if (err) {
+		pr_err("catalog searching failed\n");
+		goto end_removexattr;
+	}
+
+	err = hfsplus_delete_attr(inode, name);
+	if (err)
+		goto end_removexattr;
+
+	is_xattr_acl_deleted = !strcmp_xattr_acl(name);
+	is_all_xattrs_deleted = !hfsplus_attr_exists(inode, NULL);
+
+	if (!is_xattr_acl_deleted && !is_all_xattrs_deleted)
+		goto end_removexattr;
+
+	cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset);
+
+	if (cat_entry_type == HFSPLUS_FOLDER) {
+		flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset +
+				offsetof(struct hfsplus_cat_folder, flags));
+		if (is_xattr_acl_deleted)
+			flags &= ~HFSPLUS_ACL_EXISTS;
+		if (is_all_xattrs_deleted)
+			flags &= ~HFSPLUS_XATTR_EXISTS;
+		hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset +
+				offsetof(struct hfsplus_cat_folder, flags),
+				flags);
+		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+	} else if (cat_entry_type == HFSPLUS_FILE) {
+		flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset +
+				offsetof(struct hfsplus_cat_file, flags));
+		if (is_xattr_acl_deleted)
+			flags &= ~HFSPLUS_ACL_EXISTS;
+		if (is_all_xattrs_deleted)
+			flags &= ~HFSPLUS_XATTR_EXISTS;
+		hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset +
+				offsetof(struct hfsplus_cat_file, flags),
+				flags);
+		hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY);
+	} else {
+		pr_err("invalid catalog entry type\n");
+		err = -EIO;
+		goto end_removexattr;
+	}
+
+end_removexattr:
+	hfs_find_exit(&cat_fd);
+	return err;
+}
+
+static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
+					void *buffer, size_t size, int type)
+{
+	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN +
+				XATTR_MAC_OSX_PREFIX_LEN + 1] = {0};
+	size_t len = strlen(name);
+
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	if (len > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	strcpy(xattr_name, XATTR_MAC_OSX_PREFIX);
+	strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name);
+
+	return hfsplus_getxattr(dentry, xattr_name, buffer, size);
+}
+
+static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN +
+				XATTR_MAC_OSX_PREFIX_LEN + 1] = {0};
+	size_t len = strlen(name);
+
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	if (len > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	strcpy(xattr_name, XATTR_MAC_OSX_PREFIX);
+	strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name);
+
+	return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+}
+
+static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list,
+		size_t list_size, const char *name, size_t name_len, int type)
+{
+	/*
+	 * This method is not used.
+	 * It is used hfsplus_listxattr() instead of generic_listxattr().
+	 */
+	return -EOPNOTSUPP;
+}
+
+const struct xattr_handler hfsplus_xattr_osx_handler = {
+	.prefix	= XATTR_MAC_OSX_PREFIX,
+	.list	= hfsplus_osx_listxattr,
+	.get	= hfsplus_osx_getxattr,
+	.set	= hfsplus_osx_setxattr,
+};
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
new file mode 100644
index 00000000000..847b695b984
--- /dev/null
+++ b/fs/hfsplus/xattr.h
@@ -0,0 +1,60 @@
+/*
+ * linux/fs/hfsplus/xattr.h
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Logic of processing extended attributes
+ */
+
+#ifndef _LINUX_HFSPLUS_XATTR_H
+#define _LINUX_HFSPLUS_XATTR_H
+
+#include <linux/xattr.h>
+
+extern const struct xattr_handler hfsplus_xattr_osx_handler;
+extern const struct xattr_handler hfsplus_xattr_user_handler;
+extern const struct xattr_handler hfsplus_xattr_trusted_handler;
+/*extern const struct xattr_handler hfsplus_xattr_acl_access_handler;*/
+/*extern const struct xattr_handler hfsplus_xattr_acl_default_handler;*/
+extern const struct xattr_handler hfsplus_xattr_security_handler;
+
+extern const struct xattr_handler *hfsplus_xattr_handlers[];
+
+int __hfsplus_setxattr(struct inode *inode, const char *name,
+			const void *value, size_t size, int flags);
+
+static inline int hfsplus_setxattr(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags)
+{
+	return __hfsplus_setxattr(dentry->d_inode, name, value, size, flags);
+}
+
+ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
+			void *value, size_t size);
+
+ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
+
+int hfsplus_removexattr(struct dentry *dentry, const char *name);
+
+int hfsplus_init_security(struct inode *inode, struct inode *dir,
+				const struct qstr *qstr);
+
+static inline int hfsplus_init_acl(struct inode *inode, struct inode *dir)
+{
+	/*TODO: implement*/
+	return 0;
+}
+
+static inline int hfsplus_init_inode_security(struct inode *inode,
+						struct inode *dir,
+						const struct qstr *qstr)
+{
+	int err;
+
+	err = hfsplus_init_acl(inode, dir);
+	if (!err)
+		err = hfsplus_init_security(inode, dir, qstr);
+	return err;
+}
+
+#endif
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
new file mode 100644
index 00000000000..83b842f113c
--- /dev/null
+++ b/fs/hfsplus/xattr_security.c
@@ -0,0 +1,104 @@
+/*
+ * linux/fs/hfsplus/xattr_trusted.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handler for storing security labels as extended attributes.
+ */
+
+#include <linux/security.h>
+#include "hfsplus_fs.h"
+#include "xattr.h"
+
+static int hfsplus_security_getxattr(struct dentry *dentry, const char *name,
+					void *buffer, size_t size, int type)
+{
+	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
+	size_t len = strlen(name);
+
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	strcpy(xattr_name, XATTR_SECURITY_PREFIX);
+	strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name);
+
+	return hfsplus_getxattr(dentry, xattr_name, buffer, size);
+}
+
+static int hfsplus_security_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
+	size_t len = strlen(name);
+
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	strcpy(xattr_name, XATTR_SECURITY_PREFIX);
+	strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name);
+
+	return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+}
+
+static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list,
+		size_t list_size, const char *name, size_t name_len, int type)
+{
+	/*
+	 * This method is not used.
+	 * It is used hfsplus_listxattr() instead of generic_listxattr().
+	 */
+	return -EOPNOTSUPP;
+}
+
+static int hfsplus_initxattrs(struct inode *inode,
+				const struct xattr *xattr_array,
+				void *fs_info)
+{
+	const struct xattr *xattr;
+	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
+	size_t xattr_name_len;
+	int err = 0;
+
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		xattr_name_len = strlen(xattr->name);
+
+		if (xattr_name_len == 0)
+			continue;
+
+		if (xattr_name_len + XATTR_SECURITY_PREFIX_LEN >
+				HFSPLUS_ATTR_MAX_STRLEN)
+			return -EOPNOTSUPP;
+
+		strcpy(xattr_name, XATTR_SECURITY_PREFIX);
+		strcpy(xattr_name +
+			XATTR_SECURITY_PREFIX_LEN, xattr->name);
+		memset(xattr_name +
+			XATTR_SECURITY_PREFIX_LEN + xattr_name_len, 0, 1);
+
+		err = __hfsplus_setxattr(inode, xattr_name,
+					xattr->value, xattr->value_len, 0);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+int hfsplus_init_security(struct inode *inode, struct inode *dir,
+				const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					&hfsplus_initxattrs, NULL);
+}
+
+const struct xattr_handler hfsplus_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.list	= hfsplus_security_listxattr,
+	.get	= hfsplus_security_getxattr,
+	.set	= hfsplus_security_setxattr,
+};
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
new file mode 100644
index 00000000000..426cee27754
--- /dev/null
+++ b/fs/hfsplus/xattr_trusted.c
@@ -0,0 +1,63 @@
+/*
+ * linux/fs/hfsplus/xattr_trusted.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handler for trusted extended attributes.
+ */
+
+#include "hfsplus_fs.h"
+#include "xattr.h"
+
+static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name,
+					void *buffer, size_t size, int type)
+{
+	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
+	size_t len = strlen(name);
+
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	strcpy(xattr_name, XATTR_TRUSTED_PREFIX);
+	strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name);
+
+	return hfsplus_getxattr(dentry, xattr_name, buffer, size);
+}
+
+static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
+	size_t len = strlen(name);
+
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	strcpy(xattr_name, XATTR_TRUSTED_PREFIX);
+	strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name);
+
+	return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+}
+
+static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list,
+		size_t list_size, const char *name, size_t name_len, int type)
+{
+	/*
+	 * This method is not used.
+	 * It is used hfsplus_listxattr() instead of generic_listxattr().
+	 */
+	return -EOPNOTSUPP;
+}
+
+const struct xattr_handler hfsplus_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= hfsplus_trusted_listxattr,
+	.get	= hfsplus_trusted_getxattr,
+	.set	= hfsplus_trusted_setxattr,
+};
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
new file mode 100644
index 00000000000..e34016561ae
--- /dev/null
+++ b/fs/hfsplus/xattr_user.c
@@ -0,0 +1,63 @@
+/*
+ * linux/fs/hfsplus/xattr_user.c
+ *
+ * Vyacheslav Dubeyko <slava@dubeyko.com>
+ *
+ * Handler for user extended attributes.
+ */
+
+#include "hfsplus_fs.h"
+#include "xattr.h"
+
+static int hfsplus_user_getxattr(struct dentry *dentry, const char *name,
+					void *buffer, size_t size, int type)
+{
+	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
+	size_t len = strlen(name);
+
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	strcpy(xattr_name, XATTR_USER_PREFIX);
+	strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name);
+
+	return hfsplus_getxattr(dentry, xattr_name, buffer, size);
+}
+
+static int hfsplus_user_setxattr(struct dentry *dentry, const char *name,
+		const void *buffer, size_t size, int flags, int type)
+{
+	char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
+	size_t len = strlen(name);
+
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
+		return -EOPNOTSUPP;
+
+	strcpy(xattr_name, XATTR_USER_PREFIX);
+	strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name);
+
+	return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+}
+
+static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list,
+		size_t list_size, const char *name, size_t name_len, int type)
+{
+	/*
+	 * This method is not used.
+	 * It is used hfsplus_listxattr() instead of generic_listxattr().
+	 */
+	return -EOPNOTSUPP;
+}
+
+const struct xattr_handler hfsplus_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= hfsplus_user_listxattr,
+	.get	= hfsplus_user_getxattr,
+	.set	= hfsplus_user_setxattr,
+};
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 457addc5c91..cddb0521751 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/magic.h>
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
@@ -30,7 +31,7 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
 	return list_entry(inode, struct hostfs_inode_info, vfs_inode);
 }
 
-#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode)
+#define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file))
 
 static int hostfs_d_delete(const struct dentry *dentry)
 {
@@ -45,8 +46,6 @@ static const struct dentry_operations hostfs_dentry_ops = {
 static char *root_ino = "";
 static int append = 0;
 
-#define HOSTFS_SUPER_MAGIC 0x00c0ffee
-
 static const struct inode_operations hostfs_iops;
 static const struct inode_operations hostfs_dir_iops;
 static const struct inode_operations hostfs_link_iops;
@@ -121,7 +120,7 @@ static char *dentry_name(struct dentry *dentry)
 	if (!name)
 		return NULL;
 
-	return __dentry_name(dentry, name); /* will unlock */
+	return __dentry_name(dentry, name);
 }
 
 static char *inode_name(struct inode *ino)
@@ -229,10 +228,11 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
 {
 	struct hostfs_inode_info *hi;
 
-	hi = kzalloc(sizeof(*hi), GFP_KERNEL);
+	hi = kmalloc(sizeof(*hi), GFP_KERNEL);
 	if (hi == NULL)
 		return NULL;
 	hi->fd = -1;
+	hi->mode = 0;
 	inode_init_once(&hi->vfs_inode);
 	return &hi->vfs_inode;
 }
@@ -277,7 +277,7 @@ static const struct super_operations hostfs_sbops = {
 	.show_options	= hostfs_show_options,
 };
 
-int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
+int hostfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	void *dir;
 	char *name;
@@ -292,12 +292,11 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
 	__putname(name);
 	if (dir == NULL)
 		return -error;
-	next = file->f_pos;
+	next = ctx->pos;
 	while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) {
-		error = (*filldir)(ent, name, len, file->f_pos,
-				   ino, type);
-		if (error) break;
-		file->f_pos = next;
+		if (!dir_emit(ctx, name, len, ino, type))
+			break;
+		ctx->pos = next;
 	}
 	close_dir(dir);
 	return 0;
@@ -393,7 +392,7 @@ static const struct file_operations hostfs_file_fops = {
 
 static const struct file_operations hostfs_dir_fops = {
 	.llseek		= generic_file_llseek,
-	.readdir	= hostfs_readdir,
+	.iterate	= hostfs_readdir,
 	.read		= generic_read_dir,
 };
 
@@ -845,15 +844,8 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
 		return err;
 
 	if ((attr->ia_valid & ATTR_SIZE) &&
-	    attr->ia_size != i_size_read(inode)) {
-		int error;
-
-		error = inode_newsize_ok(inode, attr->ia_size);
-		if (error)
-			return error;
-
+	    attr->ia_size != i_size_read(inode))
 		truncate_setsize(inode, attr->ia_size);
-	}
 
 	setattr_copy(inode, attr);
 	mark_inode_dirty(inode);
@@ -861,14 +853,6 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
 }
 
 static const struct inode_operations hostfs_iops = {
-	.create		= hostfs_create,
-	.link		= hostfs_link,
-	.unlink		= hostfs_unlink,
-	.symlink	= hostfs_symlink,
-	.mkdir		= hostfs_mkdir,
-	.rmdir		= hostfs_rmdir,
-	.mknod		= hostfs_mknod,
-	.rename		= hostfs_rename,
 	.permission	= hostfs_permission,
 	.setattr	= hostfs_setattr,
 };
@@ -1001,6 +985,7 @@ static struct file_system_type hostfs_type = {
 	.kill_sb	= hostfs_kill_sb,
 	.fs_flags 	= 0,
 };
+MODULE_ALIAS_FS("hostfs");
 
 static int __init init_hostfs(void)
 {
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index f49d1498aa2..4d0a1afa058 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -7,8 +7,37 @@
  */
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/blkdev.h>
 #include "hpfs_fn.h"
 
+void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n)
+{
+	struct buffer_head *bh;
+	struct blk_plug plug;
+
+	if (n <= 0 || unlikely(secno >= hpfs_sb(s)->sb_fs_size))
+		return;
+
+	bh = sb_find_get_block(s, secno);
+	if (bh) {
+		if (buffer_uptodate(bh)) {
+			brelse(bh);
+			return;
+		}
+		brelse(bh);
+	};
+
+	blk_start_plug(&plug);
+	while (n > 0) {
+		if (unlikely(secno >= hpfs_sb(s)->sb_fs_size))
+			break;
+		sb_breadahead(s, secno);
+		secno++;
+		n--;
+	}
+	blk_finish_plug(&plug);
+}
+
 /* Map a sector into a buffer and return pointers to it and to the buffer. */
 
 void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp,
@@ -18,6 +47,8 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head
 
 	hpfs_lock_assert(s);
 
+	hpfs_prefetch_sectors(s, secno, ahead);
+
 	cond_resched();
 
 	*bhp = bh = sb_bread(s, secno);
@@ -67,6 +98,8 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
 		return NULL;
 	}
 
+	hpfs_prefetch_sectors(s, secno, 4 + ahead);
+
 	qbh->data = data = kmalloc(2048, GFP_NOFS);
 	if (!data) {
 		printk("HPFS: hpfs_map_4sectors: out of memory\n");
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 05d4816e4e7..fa27980f222 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -12,8 +12,7 @@
  * Note: the dentry argument is the parent dentry.
  */
 
-static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
 	unsigned long	 hash;
 	int		 i;
@@ -35,9 +34,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *ino
 	return 0;
 }
 
-static int hpfs_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	unsigned al = len;
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 78e12b2e0ea..292b1acb9b8 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -25,7 +25,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
 	loff_t new_off = off + (whence == 1 ? filp->f_pos : 0);
 	loff_t pos;
 	struct quad_buffer_head qbh;
-	struct inode *i = filp->f_path.dentry->d_inode;
+	struct inode *i = file_inode(filp);
 	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
 	struct super_block *s = i->i_sb;
 
@@ -33,36 +33,38 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
 	if (whence == SEEK_DATA || whence == SEEK_HOLE)
 		return -EINVAL;
 
+	mutex_lock(&i->i_mutex);
 	hpfs_lock(s);
 
 	/*printk("dir lseek\n");*/
 	if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
-	mutex_lock(&i->i_mutex);
 	pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1;
 	while (pos != new_off) {
 		if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh);
 		else goto fail;
 		if (pos == 12) goto fail;
 	}
-	mutex_unlock(&i->i_mutex);
+	hpfs_add_pos(i, &filp->f_pos);
 ok:
+	filp->f_pos = new_off;
 	hpfs_unlock(s);
-	return filp->f_pos = new_off;
-fail:
 	mutex_unlock(&i->i_mutex);
+	return new_off;
+fail:
 	/*printk("illegal lseek: %016llx\n", new_off);*/
 	hpfs_unlock(s);
+	mutex_unlock(&i->i_mutex);
 	return -ESPIPE;
 }
 
-static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int hpfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
 	struct quad_buffer_head qbh;
 	struct hpfs_dirent *de;
 	int lc;
-	long old_pos;
+	loff_t next_pos;
 	unsigned char *tempname;
 	int c1, c2 = 0;
 	int ret = 0;
@@ -103,11 +105,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 	}
 	lc = hpfs_sb(inode->i_sb)->sb_lowercase;
-	if (filp->f_pos == 12) { /* diff -r requires this (note, that diff -r */
-		filp->f_pos = 13; /* also fails on msdos filesystem in 2.0) */
+	if (ctx->pos == 12) { /* diff -r requires this (note, that diff -r */
+		ctx->pos = 13; /* also fails on msdos filesystem in 2.0) */
 		goto out;
 	}
-	if (filp->f_pos == 13) {
+	if (ctx->pos == 13) {
 		ret = -ENOENT;
 		goto out;
 	}
@@ -118,33 +120,34 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		   accepted by filldir, but what can I do?
 		   maybe killall -9 ls helps */
 		if (hpfs_sb(inode->i_sb)->sb_chk)
-			if (hpfs_stop_cycles(inode->i_sb, filp->f_pos, &c1, &c2, "hpfs_readdir")) {
+			if (hpfs_stop_cycles(inode->i_sb, ctx->pos, &c1, &c2, "hpfs_readdir")) {
 				ret = -EFSERROR;
 				goto out;
 			}
-		if (filp->f_pos == 12)
+		if (ctx->pos == 12)
 			goto out;
-		if (filp->f_pos == 3 || filp->f_pos == 4 || filp->f_pos == 5) {
-			printk("HPFS: warning: pos==%d\n",(int)filp->f_pos);
+		if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) {
+			printk("HPFS: warning: pos==%d\n",(int)ctx->pos);
 			goto out;
 		}
-		if (filp->f_pos == 0) {
-			if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0)
+		if (ctx->pos == 0) {
+			if (!dir_emit_dot(file, ctx))
 				goto out;
-			filp->f_pos = 11;
+			ctx->pos = 11;
 		}
-		if (filp->f_pos == 11) {
-			if (filldir(dirent, "..", 2, filp->f_pos, hpfs_inode->i_parent_dir, DT_DIR) < 0)
+		if (ctx->pos == 11) {
+			if (!dir_emit(ctx, "..", 2, hpfs_inode->i_parent_dir, DT_DIR))
 				goto out;
-			filp->f_pos = 1;
+			ctx->pos = 1;
 		}
-		if (filp->f_pos == 1) {
-			filp->f_pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
-			hpfs_add_pos(inode, &filp->f_pos);
-			filp->f_version = inode->i_version;
+		if (ctx->pos == 1) {
+			ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
+			hpfs_add_pos(inode, &file->f_pos);
+			file->f_version = inode->i_version;
 		}
-		old_pos = filp->f_pos;
-		if (!(de = map_pos_dirent(inode, &filp->f_pos, &qbh))) {
+		next_pos = ctx->pos;
+		if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) {
+			ctx->pos = next_pos;
 			ret = -EIOERROR;
 			goto out;
 		}
@@ -152,20 +155,21 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			if (hpfs_sb(inode->i_sb)->sb_chk) {
 				if (de->first && !de->last && (de->namelen != 2
 				    || de ->name[0] != 1 || de->name[1] != 1))
-					hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", old_pos);
+					hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", (unsigned long)ctx->pos);
 				if (de->last && (de->namelen != 1 || de ->name[0] != 255))
-					hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", old_pos);
+					hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", (unsigned long)ctx->pos);
 			}
 			hpfs_brelse4(&qbh);
+			ctx->pos = next_pos;
 			goto again;
 		}
 		tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
-		if (filldir(dirent, tempname, de->namelen, old_pos, le32_to_cpu(de->fnode), DT_UNKNOWN) < 0) {
-			filp->f_pos = old_pos;
+		if (!dir_emit(ctx, tempname, de->namelen, le32_to_cpu(de->fnode), DT_UNKNOWN)) {
 			if (tempname != de->name) kfree(tempname);
 			hpfs_brelse4(&qbh);
 			goto out;
 		}
+		ctx->pos = next_pos;
 		if (tempname != de->name) kfree(tempname);
 		hpfs_brelse4(&qbh);
 	}
@@ -320,7 +324,7 @@ const struct file_operations hpfs_dir_ops =
 {
 	.llseek		= hpfs_dir_lseek,
 	.read		= generic_read_dir,
-	.readdir	= hpfs_readdir,
+	.iterate	= hpfs_readdir,
 	.release	= hpfs_dir_release,
 	.fsync		= hpfs_file_fsync,
 };
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index fbfe2df5624..4e9dabcf1f4 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -7,6 +7,7 @@
  */
 
 #include "hpfs_fn.h"
+#include <linux/mpage.h>
 
 #define BLOCKS(size) (((size) + 511) >> 9)
 
@@ -34,7 +35,7 @@ int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
  * so we must ignore such errors.
  */
 
-static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
+static secno hpfs_bmap(struct inode *inode, unsigned file_secno, unsigned *n_secs)
 {
 	struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
 	unsigned n, disk_secno;
@@ -42,11 +43,20 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
 	struct buffer_head *bh;
 	if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0;
 	n = file_secno - hpfs_inode->i_file_sec;
-	if (n < hpfs_inode->i_n_secs) return hpfs_inode->i_disk_sec + n;
+	if (n < hpfs_inode->i_n_secs) {
+		*n_secs = hpfs_inode->i_n_secs - n;
+		return hpfs_inode->i_disk_sec + n;
+	}
 	if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0;
 	disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh);
 	if (disk_secno == -1) return 0;
 	if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0;
+	n = file_secno - hpfs_inode->i_file_sec;
+	if (n < hpfs_inode->i_n_secs) {
+		*n_secs = hpfs_inode->i_n_secs - n;
+		return hpfs_inode->i_disk_sec + n;
+	}
+	*n_secs = 1;
 	return disk_secno;
 }
 
@@ -67,10 +77,14 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
 {
 	int r;
 	secno s;
+	unsigned n_secs;
 	hpfs_lock(inode->i_sb);
-	s = hpfs_bmap(inode, iblock);
+	s = hpfs_bmap(inode, iblock, &n_secs);
 	if (s) {
+		if (bh_result->b_size >> 9 < n_secs)
+			n_secs = bh_result->b_size >> 9;
 		map_bh(bh_result, inode->i_sb, s);
+		bh_result->b_size = n_secs << 9;
 		goto ret_0;
 	}
 	if (!create) goto ret_0;
@@ -95,24 +109,40 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
 	return r;
 }
 
+static int hpfs_readpage(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, hpfs_get_block);
+}
+
 static int hpfs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	return block_write_full_page(page,hpfs_get_block, wbc);
+	return block_write_full_page(page, hpfs_get_block, wbc);
 }
 
-static int hpfs_readpage(struct file *file, struct page *page)
+static int hpfs_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block);
+}
+
+static int hpfs_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
 {
-	return block_read_full_page(page,hpfs_get_block);
+	return mpage_writepages(mapping, wbc, hpfs_get_block);
 }
 
 static void hpfs_write_failed(struct address_space *mapping, loff_t to)
 {
 	struct inode *inode = mapping->host;
 
+	hpfs_lock(inode->i_sb);
+
 	if (to > inode->i_size) {
 		truncate_pagecache(inode, to, inode->i_size);
 		hpfs_truncate(inode);
 	}
+
+	hpfs_unlock(inode->i_sb);
 }
 
 static int hpfs_write_begin(struct file *file, struct address_space *mapping,
@@ -131,6 +161,24 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
+static int hpfs_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *pagep, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	int err;
+	err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
+	if (err < len)
+		hpfs_write_failed(mapping, pos + len);
+	if (!(err < 0)) {
+		/* make sure we write it on close, if not earlier */
+		hpfs_lock(inode->i_sb);
+		hpfs_i(inode)->i_dirty = 1;
+		hpfs_unlock(inode->i_sb);
+	}
+	return err;
+}
+
 static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,hpfs_get_block);
@@ -139,31 +187,19 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations hpfs_aops = {
 	.readpage = hpfs_readpage,
 	.writepage = hpfs_writepage,
+	.readpages = hpfs_readpages,
+	.writepages = hpfs_writepages,
 	.write_begin = hpfs_write_begin,
-	.write_end = generic_write_end,
+	.write_end = hpfs_write_end,
 	.bmap = _hpfs_bmap
 };
 
-static ssize_t hpfs_file_write(struct file *file, const char __user *buf,
-			size_t count, loff_t *ppos)
-{
-	ssize_t retval;
-
-	retval = do_sync_write(file, buf, count, ppos);
-	if (retval > 0) {
-		hpfs_lock(file->f_path.dentry->d_sb);
-		hpfs_i(file->f_path.dentry->d_inode)->i_dirty = 1;
-		hpfs_unlock(file->f_path.dentry->d_sb);
-	}
-	return retval;
-}
-
 const struct file_operations hpfs_file_ops =
 {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
-	.write		= hpfs_file_write,
+	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.release	= hpfs_file_release,
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index b7ae286646b..1b398636e99 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -27,8 +27,9 @@
 #define ALLOC_FWD_MAX	128
 #define ALLOC_M		1
 #define FNODE_RD_AHEAD	16
-#define ANODE_RD_AHEAD	16
-#define DNODE_RD_AHEAD	4
+#define ANODE_RD_AHEAD	0
+#define DNODE_RD_AHEAD	72
+#define COUNT_RD_AHEAD	62
 
 #define FREE_DNODES_ADD	58
 #define FREE_DNODES_DEL	29
@@ -207,6 +208,7 @@ void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
 
 /* buffer.c */
 
+void hpfs_prefetch_sectors(struct super_block *, unsigned, int);
 void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int);
 void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **);
 void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int);
@@ -271,6 +273,7 @@ void hpfs_evict_inode(struct inode *);
 
 __le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
 __le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
+void hpfs_prefetch_bitmap(struct super_block *, unsigned);
 unsigned char *hpfs_load_code_page(struct super_block *, secno);
 __le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
 struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 5dc06c83710..9edeeb0ea97 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -147,7 +147,7 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode)
 	/*if (le32_to_cpu(fnode->acl_size_l) || le16_to_cpu(fnode->acl_size_s)) {
 		   Some unknown structures like ACL may be in fnode,
 		   we'd better not overwrite them
-		hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino);
+		hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 structures", i->i_ino);
 	} else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) {
 		__le32 ea;
 		if (!uid_eq(i->i_uid, hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) {
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index 4acb19d7835..3aa66ae1031 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -17,7 +17,9 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
 			 struct quad_buffer_head *qbh, char *id)
 {
 	secno sec;
-	if (hpfs_sb(s)->sb_chk) if (bmp_block * 16384 > hpfs_sb(s)->sb_fs_size) {
+	__le32 *ret;
+	unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
+	if (hpfs_sb(s)->sb_chk) if (bmp_block >= n_bands) {
 		hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id);
 		return NULL;
 	}
@@ -26,7 +28,23 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
 		hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id);
 		return NULL;
 	}
-	return hpfs_map_4sectors(s, sec, qbh, 4);
+	ret = hpfs_map_4sectors(s, sec, qbh, 4);
+	if (ret) hpfs_prefetch_bitmap(s, bmp_block + 1);
+	return ret;
+}
+
+void hpfs_prefetch_bitmap(struct super_block *s, unsigned bmp_block)
+{
+	unsigned to_prefetch, next_prefetch;
+	unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
+	if (unlikely(bmp_block >= n_bands))
+		return;
+	to_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]);
+	if (unlikely(bmp_block + 1 >= n_bands))
+		next_prefetch = 0;
+	else
+		next_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block + 1]);
+	hpfs_prefetch_sectors(s, to_prefetch, 4 + 4 * (to_prefetch + 4 == next_prefetch));
 }
 
 /*
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a3076228523..4334cda8dba 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -121,7 +121,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
 	unsigned long *bits;
 	unsigned count;
 
-	bits = hpfs_map_4sectors(s, secno, &qbh, 4);
+	bits = hpfs_map_4sectors(s, secno, &qbh, 0);
 	if (!bits)
 		return 0;
 	count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
@@ -134,8 +134,13 @@ static unsigned count_bitmaps(struct super_block *s)
 	unsigned n, count, n_bands;
 	n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
 	count = 0;
-	for (n = 0; n < n_bands; n++)
+	for (n = 0; n < COUNT_RD_AHEAD; n++) {
+		hpfs_prefetch_bitmap(s, n);
+	}
+	for (n = 0; n < n_bands; n++) {
+		hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD);
 		count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n]));
+	}
 	return count;
 }
 
@@ -558,7 +563,13 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 	sbi->sb_cp_table = NULL;
 	sbi->sb_c_bitmap = -1;
 	sbi->sb_max_fwd_alloc = 0xffffff;
-	
+
+	if (sbi->sb_fs_size >= 0x80000000) {
+		hpfs_error(s, "invalid size in superblock: %08x",
+			(unsigned)sbi->sb_fs_size);
+		goto bail4;
+	}
+
 	/* Load bitmap directory */
 	if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps))))
 		goto bail4;
@@ -688,6 +699,7 @@ static struct file_system_type hpfs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("hpfs");
 
 static int __init init_hpfs_fs(void)
 {
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 43b315f2002..4338ff32959 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -69,7 +69,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
 	struct dentry *parent;
 	char *root, *name;
 	const char *seg_name;
-	int len, seg_len;
+	int len, seg_len, root_len;
 
 	len = 0;
 	parent = dentry;
@@ -81,7 +81,8 @@ static char *dentry_name(struct dentry *dentry, int extra)
 	}
 
 	root = "proc";
-	len += strlen(root);
+	root_len = strlen(root);
+	len += root_len;
 	name = kmalloc(len + extra + 1, GFP_KERNEL);
 	if (name == NULL)
 		return NULL;
@@ -91,7 +92,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
 	while (parent->d_parent != parent) {
 		if (is_pid(parent)) {
 			seg_name = "pid";
-			seg_len = strlen("pid");
+			seg_len = strlen(seg_name);
 		}
 		else {
 			seg_name = parent->d_name.name;
@@ -100,10 +101,10 @@ static char *dentry_name(struct dentry *dentry, int extra)
 
 		len -= seg_len + 1;
 		name[len] = '/';
-		strncpy(&name[len + 1], seg_name, seg_len);
+		memcpy(&name[len + 1], seg_name, seg_len);
 		parent = parent->d_parent;
 	}
-	strncpy(name, root, strlen(root));
+	memcpy(name, root, root_len);
 	return name;
 }
 
@@ -180,7 +181,7 @@ static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count,
 	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
 	ssize_t n;
 
-	read = file->f_path.dentry->d_inode->i_fop->read;
+	read = file_inode(file)->i_fop->read;
 
 	if (!is_user)
 		set_fs(KERNEL_DS);
@@ -288,7 +289,7 @@ static ssize_t hppfs_write(struct file *file, const char __user *buf,
 	struct file *proc_file = data->proc_file;
 	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
 
-	write = proc_file->f_path.dentry->d_inode->i_fop->write;
+	write = file_inode(proc_file)->i_fop->write;
 	return (*write)(proc_file, buf, len, ppos);
 }
 
@@ -436,7 +437,6 @@ static int hppfs_open(struct inode *inode, struct file *file)
 	path.mnt = inode->i_sb->s_fs_info;
 	path.dentry = HPPFS_I(inode)->proc_dentry;
 
-	/* XXX This isn't closed anywhere */
 	data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred);
 	err = PTR_ERR(data->proc_file);
 	if (IS_ERR(data->proc_file))
@@ -513,7 +513,7 @@ static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
 	loff_t (*llseek)(struct file *, loff_t, int);
 	loff_t ret;
 
-	llseek = proc_file->f_path.dentry->d_inode->i_fop->llseek;
+	llseek = file_inode(proc_file)->i_fop->llseek;
 	if (llseek != NULL) {
 		ret = (*llseek)(proc_file, off, where);
 		if (ret < 0)
@@ -523,17 +523,28 @@ static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
 	return default_llseek(file, off, where);
 }
 
+static int hppfs_release(struct inode *inode, struct file *file)
+{
+	struct hppfs_private *data = file->private_data;
+	struct file *proc_file = data->proc_file;
+	if (proc_file)
+		fput(proc_file);
+	kfree(data);
+	return 0;
+}
+
 static const struct file_operations hppfs_file_fops = {
 	.owner		= NULL,
 	.llseek		= hppfs_llseek,
 	.read		= hppfs_read,
 	.write		= hppfs_write,
 	.open		= hppfs_open,
+	.release	= hppfs_release,
 };
 
 struct hppfs_dirent {
-	void *vfs_dirent;
-	filldir_t filldir;
+	struct dir_context ctx;
+	struct dir_context *caller;
 	struct dentry *dentry;
 };
 
@@ -545,43 +556,32 @@ static int hppfs_filldir(void *d, const char *name, int size,
 	if (file_removed(dirent->dentry, name))
 		return 0;
 
-	return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset,
-				  inode, type);
+	dirent->caller->pos = dirent->ctx.pos;
+	return !dir_emit(dirent->caller, name, size, inode, type);
 }
 
-static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
+static int hppfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct hppfs_private *data = file->private_data;
 	struct file *proc_file = data->proc_file;
-	int (*readdir)(struct file *, void *, filldir_t);
-	struct hppfs_dirent dirent = ((struct hppfs_dirent)
-		                      { .vfs_dirent  	= ent,
-					.filldir 	= filldir,
-					.dentry  	= file->f_path.dentry
-				      });
+	struct hppfs_dirent d = {
+		.ctx.actor	= hppfs_filldir,
+		.caller		= ctx,
+		.dentry  	= file->f_path.dentry
+	};
 	int err;
-
-	readdir = proc_file->f_path.dentry->d_inode->i_fop->readdir;
-
-	proc_file->f_pos = file->f_pos;
-	err = (*readdir)(proc_file, &dirent, hppfs_filldir);
-	file->f_pos = proc_file->f_pos;
-
+	proc_file->f_pos = ctx->pos;
+	err = iterate_dir(proc_file, &d.ctx);
+	ctx->pos = d.ctx.pos;
 	return err;
 }
 
-static int hppfs_fsync(struct file *file, loff_t start, loff_t end,
-		       int datasync)
-{
-	return filemap_write_and_wait_range(file->f_mapping, start, end);
-}
-
 static const struct file_operations hppfs_dir_fops = {
 	.owner		= NULL,
-	.readdir	= hppfs_readdir,
+	.iterate	= hppfs_readdir,
 	.open		= hppfs_dir_open,
-	.fsync		= hppfs_fsync,
 	.llseek		= default_llseek,
+	.release	= hppfs_release,
 };
 
 static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
@@ -748,6 +748,7 @@ static struct file_system_type hppfs_type = {
 	.kill_sb	= kill_anon_super,
 	.fs_flags 	= 0,
 };
+MODULE_ALIAS_FS("hppfs");
 
 static int __init init_hppfs(void)
 {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 78bde32ea95..a3f868ae3fd 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -97,7 +97,7 @@ static void huge_pagevec_release(struct pagevec *pvec)
 
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	loff_t len, vma_len;
 	int ret;
 	struct hstate *h = hstate_file(file);
@@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	 * way when do_mmap_pgoff unwinds (may be important on powerpc
 	 * and ia64).
 	 */
-	vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP;
+	vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
 	vma->vm_ops = &hugetlb_vm_ops;
 
 	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
@@ -896,6 +896,7 @@ static struct file_system_type hugetlbfs_fs_type = {
 	.mount		= hugetlbfs_mount,
 	.kill_sb	= kill_litter_super,
 };
+MODULE_ALIAS_FS("hugetlbfs");
 
 static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
 
@@ -908,29 +909,36 @@ static int can_do_hugetlb_shm(void)
 
 static int get_hstate_idx(int page_size_log)
 {
-	struct hstate *h;
+	struct hstate *h = hstate_sizelog(page_size_log);
 
-	if (!page_size_log)
-		return default_hstate_idx;
-	h = size_to_hstate(1 << page_size_log);
 	if (!h)
 		return -1;
 	return h - hstates;
 }
 
-struct file *hugetlb_file_setup(const char *name, unsigned long addr,
-				size_t size, vm_flags_t acctflag,
-				struct user_struct **user,
+static char *hugetlb_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)",
+				dentry->d_name.name);
+}
+
+static struct dentry_operations anon_ops = {
+	.d_dname = hugetlb_dname
+};
+
+/*
+ * Note that size should be aligned to proper hugepage size in caller side,
+ * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
+ */
+struct file *hugetlb_file_setup(const char *name, size_t size,
+				vm_flags_t acctflag, struct user_struct **user,
 				int creat_flags, int page_size_log)
 {
-	int error = -ENOMEM;
-	struct file *file;
+	struct file *file = ERR_PTR(-ENOMEM);
 	struct inode *inode;
 	struct path path;
-	struct dentry *root;
+	struct super_block *sb;
 	struct qstr quick_string;
-	struct hstate *hstate;
-	unsigned long num_pages;
 	int hstate_idx;
 
 	hstate_idx = get_hstate_idx(page_size_log);
@@ -955,36 +963,34 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
 		}
 	}
 
-	root = hugetlbfs_vfsmount[hstate_idx]->mnt_root;
+	sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;
 	quick_string.name = name;
 	quick_string.len = strlen(quick_string.name);
 	quick_string.hash = 0;
-	path.dentry = d_alloc(root, &quick_string);
+	path.dentry = d_alloc_pseudo(sb, &quick_string);
 	if (!path.dentry)
 		goto out_shm_unlock;
 
+	d_set_d_op(path.dentry, &anon_ops);
 	path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
-	error = -ENOSPC;
-	inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
+	file = ERR_PTR(-ENOSPC);
+	inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
 	if (!inode)
 		goto out_dentry;
 
-	hstate = hstate_inode(inode);
-	size += addr & ~huge_page_mask(hstate);
-	num_pages = ALIGN(size, huge_page_size(hstate)) >>
-			huge_page_shift(hstate);
-	error = -ENOMEM;
-	if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag))
+	file = ERR_PTR(-ENOMEM);
+	if (hugetlb_reserve_pages(inode, 0,
+			size >> huge_page_shift(hstate_inode(inode)), NULL,
+			acctflag))
 		goto out_inode;
 
 	d_instantiate(path.dentry, inode);
 	inode->i_size = size;
 	clear_nlink(inode);
 
-	error = -ENFILE;
 	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
 			&hugetlbfs_file_operations);
-	if (!file)
+	if (IS_ERR(file))
 		goto out_dentry; /* inode is already attached */
 
 	return file;
@@ -998,7 +1004,7 @@ out_shm_unlock:
 		user_shm_unlock(size, *user);
 		*user = NULL;
 	}
-	return ERR_PTR(error);
+	return file;
 }
 
 static int __init init_hugetlbfs_fs(void)
diff --git a/fs/inode.c b/fs/inode.c
index 14084b72b25..d6dfb09c828 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -333,8 +333,10 @@ EXPORT_SYMBOL(set_nlink);
  */
 void inc_nlink(struct inode *inode)
 {
-	if (WARN_ON(inode->i_nlink == 0))
+	if (unlikely(inode->i_nlink == 0)) {
+		WARN_ON(!(inode->i_state & I_LINKABLE));
 		atomic_long_dec(&inode->i_sb->s_remove_count);
+	}
 
 	inode->__i_nlink++;
 }
@@ -725,7 +727,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
 		 * inode to the back of the list so we don't spin on it.
 		 */
 		if (!spin_trylock(&inode->i_lock)) {
-			list_move_tail(&inode->i_lru, &sb->s_inode_lru);
+			list_move(&inode->i_lru, &sb->s_inode_lru);
 			continue;
 		}
 
@@ -798,11 +800,10 @@ static struct inode *find_inode(struct super_block *sb,
 				int (*test)(struct inode *, void *),
 				void *data)
 {
-	struct hlist_node *node;
 	struct inode *inode = NULL;
 
 repeat:
-	hlist_for_each_entry(inode, node, head, i_hash) {
+	hlist_for_each_entry(inode, head, i_hash) {
 		spin_lock(&inode->i_lock);
 		if (inode->i_sb != sb) {
 			spin_unlock(&inode->i_lock);
@@ -830,11 +831,10 @@ repeat:
 static struct inode *find_inode_fast(struct super_block *sb,
 				struct hlist_head *head, unsigned long ino)
 {
-	struct hlist_node *node;
 	struct inode *inode = NULL;
 
 repeat:
-	hlist_for_each_entry(inode, node, head, i_hash) {
+	hlist_for_each_entry(inode, head, i_hash) {
 		spin_lock(&inode->i_lock);
 		if (inode->i_ino != ino) {
 			spin_unlock(&inode->i_lock);
@@ -1132,11 +1132,10 @@ EXPORT_SYMBOL(iget_locked);
 static int test_inode_iunique(struct super_block *sb, unsigned long ino)
 {
 	struct hlist_head *b = inode_hashtable + hash(sb, ino);
-	struct hlist_node *node;
 	struct inode *inode;
 
 	spin_lock(&inode_hash_lock);
-	hlist_for_each_entry(inode, node, b, i_hash) {
+	hlist_for_each_entry(inode, b, i_hash) {
 		if (inode->i_ino == ino && inode->i_sb == sb) {
 			spin_unlock(&inode_hash_lock);
 			return 0;
@@ -1291,10 +1290,9 @@ int insert_inode_locked(struct inode *inode)
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
 
 	while (1) {
-		struct hlist_node *node;
 		struct inode *old = NULL;
 		spin_lock(&inode_hash_lock);
-		hlist_for_each_entry(old, node, head, i_hash) {
+		hlist_for_each_entry(old, head, i_hash) {
 			if (old->i_ino != ino)
 				continue;
 			if (old->i_sb != sb)
@@ -1306,7 +1304,7 @@ int insert_inode_locked(struct inode *inode)
 			}
 			break;
 		}
-		if (likely(!node)) {
+		if (likely(!old)) {
 			spin_lock(&inode->i_lock);
 			inode->i_state |= I_NEW;
 			hlist_add_head(&inode->i_hash, head);
@@ -1334,11 +1332,10 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 
 	while (1) {
-		struct hlist_node *node;
 		struct inode *old = NULL;
 
 		spin_lock(&inode_hash_lock);
-		hlist_for_each_entry(old, node, head, i_hash) {
+		hlist_for_each_entry(old, head, i_hash) {
 			if (old->i_sb != sb)
 				continue;
 			if (!test(old, data))
@@ -1350,7 +1347,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 			}
 			break;
 		}
-		if (likely(!node)) {
+		if (likely(!old)) {
 			spin_lock(&inode->i_lock);
 			inode->i_state |= I_NEW;
 			hlist_add_head(&inode->i_hash, head);
@@ -1655,7 +1652,7 @@ EXPORT_SYMBOL(file_remove_suid);
 
 int file_update_time(struct file *file)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct timespec now;
 	int sync_it = 0;
 	int ret;
@@ -1808,7 +1805,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
 		inode->i_fop = &def_blk_fops;
 		inode->i_rdev = rdev;
 	} else if (S_ISFIFO(mode))
-		inode->i_fop = &def_fifo_fops;
+		inode->i_fop = &pipefifo_fops;
 	else if (S_ISSOCK(mode))
 		inode->i_fop = &bad_sock_fops;
 	else
diff --git a/fs/internal.h b/fs/internal.h
index 2f6af7f645e..7c5f01cf619 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -69,7 +69,7 @@ extern void __mnt_drop_write_file(struct file *);
 /*
  * fs_struct.c
  */
-extern void chroot_fs_refs(struct path *, struct path *);
+extern void chroot_fs_refs(const struct path *, const struct path *);
 
 /*
  * file_table.c
@@ -96,11 +96,12 @@ struct open_flags {
 	umode_t mode;
 	int acc_mode;
 	int intent;
+	int lookup_flags;
 };
 extern struct file *do_filp_open(int dfd, struct filename *pathname,
-		const struct open_flags *op, int flags);
+		const struct open_flags *op);
 extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
-		const char *, const struct open_flags *, int lookup_flags);
+		const char *, const struct open_flags *);
 
 extern long do_handle_open(int mountdirfd,
 			   struct file_handle __user *ufh, int open_flag);
@@ -125,3 +126,20 @@ extern int invalidate_inodes(struct super_block *, bool);
  * dcache.c
  */
 extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
+
+/*
+ * read_write.c
+ */
+extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
+extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
+
+/*
+ * splice.c
+ */
+extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+		loff_t *opos, size_t len, unsigned int flags);
+
+/*
+ * pipe.c
+ */
+extern const struct file_operations pipefifo_fops;
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 3bdad6d1f26..fd507fb460f 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -175,7 +175,7 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
 	struct fiemap fiemap;
 	struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
 	struct fiemap_extent_info fieinfo = { 0, };
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct super_block *sb = inode->i_sb;
 	u64 len;
 	int error;
@@ -424,7 +424,7 @@ EXPORT_SYMBOL(generic_block_fiemap);
  */
 int ioctl_preallocate(struct file *filp, void __user *argp)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct space_resv sr;
 
 	if (copy_from_user(&sr, argp, sizeof(sr)))
@@ -449,7 +449,7 @@ int ioctl_preallocate(struct file *filp, void __user *argp)
 static int file_ioctl(struct file *filp, unsigned int cmd,
 		unsigned long arg)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	int __user *p = (int __user *)arg;
 
 	switch (cmd) {
@@ -512,7 +512,7 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
 
 static int ioctl_fsfreeze(struct file *filp)
 {
-	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+	struct super_block *sb = file_inode(filp)->i_sb;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -527,7 +527,7 @@ static int ioctl_fsfreeze(struct file *filp)
 
 static int ioctl_fsthaw(struct file *filp)
 {
-	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+	struct super_block *sb = file_inode(filp)->i_sb;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -548,7 +548,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 {
 	int error = 0;
 	int __user *argp = (int __user *)arg;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 
 	switch (cmd) {
 	case FIOCLEX:
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 0b3fa7974fa..592e5115a56 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -296,7 +296,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
  */
 static int zisofs_readpage(struct file *file, struct page *page)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct address_space *mapping = inode->i_mapping;
 	int err;
 	int i, pcount, full_page;
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index f20437c068a..b943cbd963b 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -78,8 +78,8 @@ int get_acorn_filename(struct iso_directory_record *de,
 /*
  * This should _really_ be cleaned up some day..
  */
-static int do_isofs_readdir(struct inode *inode, struct file *filp,
-		void *dirent, filldir_t filldir,
+static int do_isofs_readdir(struct inode *inode, struct file *file,
+		struct dir_context *ctx,
 		char *tmpname, struct iso_directory_record *tmpde)
 {
 	unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
@@ -94,10 +94,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
 	struct iso_directory_record *de;
 	struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
 
-	offset = filp->f_pos & (bufsize - 1);
-	block = filp->f_pos >> bufbits;
+	offset = ctx->pos & (bufsize - 1);
+	block = ctx->pos >> bufbits;
 
-	while (filp->f_pos < inode->i_size) {
+	while (ctx->pos < inode->i_size) {
 		int de_len;
 
 		if (!bh) {
@@ -108,7 +108,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
 
 		de = (struct iso_directory_record *) (bh->b_data + offset);
 
-		de_len = *(unsigned char *) de;
+		de_len = *(unsigned char *)de;
 
 		/*
 		 * If the length byte is zero, we should move on to the next
@@ -119,8 +119,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
 		if (de_len == 0) {
 			brelse(bh);
 			bh = NULL;
-			filp->f_pos = (filp->f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1);
-			block = filp->f_pos >> bufbits;
+			ctx->pos = (ctx->pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1);
+			block = ctx->pos >> bufbits;
 			offset = 0;
 			continue;
 		}
@@ -164,16 +164,16 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
 
 		if (de->flags[-sbi->s_high_sierra] & 0x80) {
 			first_de = 0;
-			filp->f_pos += de_len;
+			ctx->pos += de_len;
 			continue;
 		}
 		first_de = 1;
 
 		/* Handle the case of the '.' directory */
 		if (de->name_len[0] == 1 && de->name[0] == 0) {
-			if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0)
+			if (!dir_emit_dot(file, ctx))
 				break;
-			filp->f_pos += de_len;
+			ctx->pos += de_len;
 			continue;
 		}
 
@@ -181,10 +181,9 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
 
 		/* Handle the case of the '..' directory */
 		if (de->name_len[0] == 1 && de->name[0] == 1) {
-			inode_number = parent_ino(filp->f_path.dentry);
-			if (filldir(dirent, "..", 2, filp->f_pos, inode_number, DT_DIR) < 0)
+			if (!dir_emit_dotdot(file, ctx))
 				break;
-			filp->f_pos += de_len;
+			ctx->pos += de_len;
 			continue;
 		}
 
@@ -198,7 +197,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
 		if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) ||
 		    (!sbi->s_showassoc &&
 				(de->flags[-sbi->s_high_sierra] & 4))) {
-			filp->f_pos += de_len;
+			ctx->pos += de_len;
 			continue;
 		}
 
@@ -230,10 +229,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
 			}
 		}
 		if (len > 0) {
-			if (filldir(dirent, p, len, filp->f_pos, inode_number, DT_UNKNOWN) < 0)
+			if (!dir_emit(ctx, p, len, inode_number, DT_UNKNOWN))
 				break;
 		}
-		filp->f_pos += de_len;
+		ctx->pos += de_len;
 
 		continue;
 	}
@@ -247,13 +246,12 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
  * handling split directory entries.. The real work is done by
  * "do_isofs_readdir()".
  */
-static int isofs_readdir(struct file *filp,
-		void *dirent, filldir_t filldir)
+static int isofs_readdir(struct file *file, struct dir_context *ctx)
 {
 	int result;
 	char *tmpname;
 	struct iso_directory_record *tmpde;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 
 	tmpname = (char *)__get_free_page(GFP_KERNEL);
 	if (tmpname == NULL)
@@ -261,7 +259,7 @@ static int isofs_readdir(struct file *filp,
 
 	tmpde = (struct iso_directory_record *) (tmpname+1024);
 
-	result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde);
+	result = do_isofs_readdir(inode, file, ctx, tmpname, tmpde);
 
 	free_page((unsigned long) tmpname);
 	return result;
@@ -271,7 +269,7 @@ const struct file_operations isofs_dir_operations =
 {
 	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
-	.readdir = isofs_readdir,
+	.iterate = isofs_readdir,
 };
 
 /*
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 2b4f2358ead..12088d8de3f 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -125,10 +125,10 @@ isofs_export_encode_fh(struct inode *inode,
 	 */
 	if (parent && (len < 5)) {
 		*max_len = 5;
-		return 255;
+		return FILEID_INVALID;
 	} else if (len < 3) {
 		*max_len = 3;
-		return 255;
+		return FILEID_INVALID;
 	}
 
 	len = 3;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 67ce52507d7..c348d6d8862 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -28,31 +28,23 @@
 
 #define BEQUIET
 
-static int isofs_hashi(const struct dentry *parent, const struct inode *inode,
-		struct qstr *qstr);
-static int isofs_hash(const struct dentry *parent, const struct inode *inode,
-		struct qstr *qstr);
+static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
+static int isofs_hash(const struct dentry *parent, struct qstr *qstr);
 static int isofs_dentry_cmpi(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+		const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 static int isofs_dentry_cmp(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+		const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 
 #ifdef CONFIG_JOLIET
-static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode,
-		struct qstr *qstr);
-static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode,
-		struct qstr *qstr);
+static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr);
+static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr);
 static int isofs_dentry_cmpi_ms(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+		const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 static int isofs_dentry_cmp_ms(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+		const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 #endif
 
@@ -265,30 +257,26 @@ static int isofs_dentry_cmp_common(
 }
 
 static int
-isofs_hash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+isofs_hash(const struct dentry *dentry, struct qstr *qstr)
 {
 	return isofs_hash_common(dentry, qstr, 0);
 }
 
 static int
-isofs_hashi(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
 {
 	return isofs_hashi_common(dentry, qstr, 0);
 }
 
 static int
-isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+isofs_dentry_cmp(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return isofs_dentry_cmp_common(len, str, name, 0, 0);
 }
 
 static int
-isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return isofs_dentry_cmp_common(len, str, name, 0, 1);
@@ -296,30 +284,26 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
 
 #ifdef CONFIG_JOLIET
 static int
-isofs_hash_ms(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
 {
 	return isofs_hash_common(dentry, qstr, 1);
 }
 
 static int
-isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr)
 {
 	return isofs_hashi_common(dentry, qstr, 1);
 }
 
 static int
-isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+isofs_dentry_cmp_ms(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return isofs_dentry_cmp_common(len, str, name, 1, 0);
 }
 
 static int
-isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+isofs_dentry_cmpi_ms(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return isofs_dentry_cmp_common(len, str, name, 1, 1);
@@ -1556,6 +1540,8 @@ static struct file_system_type iso9660_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("iso9660");
+MODULE_ALIAS("iso9660");
 
 static int __init init_iso9660_fs(void)
 {
@@ -1593,5 +1579,3 @@ static void __exit exit_iso9660_fs(void)
 module_init(init_iso9660_fs)
 module_exit(exit_iso9660_fs)
 MODULE_LICENSE("GPL");
-/* Actual filesystem name is iso9660, as requested in filesystems.c */
-MODULE_ALIAS("iso9660");
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index c167028844e..95295640d9c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -37,8 +37,7 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
 
 	qstr.name = compare;
 	qstr.len = dlen;
-	return dentry->d_op->d_compare(NULL, NULL, NULL, NULL,
-			dentry->d_name.len, dentry->d_name.name, &qstr);
+	return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
 }
 
 /*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 86b39b167c2..11bb11f48b3 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -162,8 +162,17 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
 
 	for (i = 0; i < bufs; i++) {
 		wbuf[i]->b_end_io = end_buffer_write_sync;
-		/* We use-up our safety reference in submit_bh() */
-		submit_bh(write_op, wbuf[i]);
+		/*
+		 * Here we write back pagecache data that may be mmaped. Since
+		 * we cannot afford to clean the page and set PageWriteback
+		 * here due to lock ordering (page lock ranks above transaction
+		 * start), the data can change while IO is in flight. Tell the
+		 * block layer it should bounce the bio pages if stable data
+		 * during write is required.
+		 *
+		 * We use up our safety reference in submit_bh().
+		 */
+		_submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE);
 	}
 }
 
@@ -667,7 +676,17 @@ start_journal_io:
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(write_op, bh);
+				/*
+				 * In data=journal mode, here we can end up
+				 * writing pagecache data that might be
+				 * mmapped. Since we can't afford to clean the
+				 * page and set PageWriteback (see the comment
+				 * near the other use of _submit_bh()), the
+				 * data can change while the write is in
+				 * flight.  Tell the block layer to bounce the
+				 * bio pages if stable pages are required.
+				 */
+				_submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE);
 			}
 			cond_resched();
 
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 81cc7eaff86..6510d635572 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -310,8 +310,6 @@ int journal_write_metadata_buffer(transaction_t *transaction,
 
 	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
 	/* keep subsequent assertions sane */
-	new_bh->b_state = 0;
-	init_buffer(new_bh, NULL, NULL);
 	atomic_set(&new_bh->b_count, 1);
 	new_jh = journal_add_journal_head(new_bh);	/* This sleeps */
 
@@ -564,6 +562,16 @@ int log_wait_commit(journal_t *journal, tid_t tid)
 	spin_unlock(&journal->j_state_lock);
 #endif
 	spin_lock(&journal->j_state_lock);
+	/*
+	 * Not running or committing trans? Must be already committed. This
+	 * saves us from waiting for a *long* time when tid overflows.
+	 */
+	if (!((journal->j_running_transaction &&
+	       journal->j_running_transaction->t_tid == tid) ||
+	      (journal->j_committing_transaction &&
+	       journal->j_committing_transaction->t_tid == tid)))
+		goto out_unlock;
+
 	if (!tid_geq(journal->j_commit_waited, tid))
 		journal->j_commit_waited = tid;
 	while (tid_gt(tid, journal->j_commit_sequence)) {
@@ -575,6 +583,7 @@ int log_wait_commit(journal_t *journal, tid_t tid)
 				!tid_gt(tid, journal->j_commit_sequence));
 		spin_lock(&journal->j_state_lock);
 	}
+out_unlock:
 	spin_unlock(&journal->j_state_lock);
 
 	if (unlikely(is_journal_aborted(journal))) {
@@ -1845,7 +1854,7 @@ static struct journal_head *journal_alloc_journal_head(void)
 #ifdef CONFIG_JBD_DEBUG
 	atomic_inc(&nr_journal_heads);
 #endif
-	ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
+	ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS);
 	if (ret == NULL) {
 		jbd_debug(1, "out of memory for journal_head\n");
 		printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n",
@@ -1853,7 +1862,7 @@ static struct journal_head *journal_alloc_journal_head(void)
 
 		while (ret == NULL) {
 			yield();
-			ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
+			ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS);
 		}
 	}
 	return ret;
@@ -1915,10 +1924,8 @@ struct journal_head *journal_add_journal_head(struct buffer_head *bh)
 	struct journal_head *new_jh = NULL;
 
 repeat:
-	if (!buffer_jbd(bh)) {
+	if (!buffer_jbd(bh))
 		new_jh = journal_alloc_journal_head();
-		memset(new_jh, 0, sizeof(*new_jh));
-	}
 
 	jbd_lock_bh_journal_head(bh);
 	if (buffer_jbd(bh)) {
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 071d6905f0d..be0c39b66fe 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -245,7 +245,6 @@ static handle_t *new_handle(int nblocks)
 	handle_t *handle = jbd_alloc_handle(GFP_NOFS);
 	if (!handle)
 		return NULL;
-	memset(handle, 0, sizeof(*handle));
 	handle->h_buffer_credits = nblocks;
 	handle->h_ref = 1;
 
@@ -2020,16 +2019,20 @@ zap_buffer_unlocked:
  * void journal_invalidatepage() - invalidate a journal page
  * @journal: journal to use for flush
  * @page:    page to flush
- * @offset:  length of page to invalidate.
+ * @offset:  offset of the range to invalidate
+ * @length:  length of the range to invalidate
  *
- * Reap page buffers containing data after offset in page.
+ * Reap page buffers containing data in specified range in page.
  */
 void journal_invalidatepage(journal_t *journal,
 		      struct page *page,
-		      unsigned long offset)
+		      unsigned int offset,
+		      unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
+	unsigned int stop = offset + length;
 	unsigned int curr_off = 0;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	int may_free = 1;
 
 	if (!PageLocked(page))
@@ -2037,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal,
 	if (!page_has_buffers(page))
 		return;
 
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	/* We will potentially be playing with lists other than just the
 	 * data lists (especially for journaled data mode), so be
 	 * cautious in our locking. */
@@ -2046,11 +2051,14 @@ void journal_invalidatepage(journal_t *journal,
 		unsigned int next_off = curr_off + bh->b_size;
 		next = bh->b_this_page;
 
+		if (next_off > stop)
+			return;
+
 		if (offset <= curr_off) {
 			/* This block is wholly outside the truncation point */
 			lock_buffer(bh);
 			may_free &= journal_unmap_buffer(journal, bh,
-							 offset > 0);
+							 partial_page);
 			unlock_buffer(bh);
 		}
 		curr_off = next_off;
@@ -2058,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal,
 
 	} while (bh != head);
 
-	if (!offset) {
+	if (!partial_page) {
 		if (may_free && try_to_free_buffers(page))
 			J_ASSERT(!page_has_buffers(page));
 	}
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index 69a48c2944d..5a9f5534d57 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -20,7 +20,7 @@ config JBD2
 
 config JBD2_DEBUG
 	bool "JBD2 (ext4) debugging support"
-	depends on JBD2 && DEBUG_FS
+	depends on JBD2
 	help
 	  If you are using the ext4 journaled file system (or
 	  potentially any other filesystem/device using JBD2), this option
@@ -29,7 +29,7 @@ config JBD2_DEBUG
 	  By default, the debugging output will be turned off.
 
 	  If you select Y here, then you will be able to turn on debugging
-	  with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
+	  with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a
 	  number between 1 and 5. The higher the number, the more debugging
 	  output is generated.  To turn debugging off again, do
-	  "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
+	  "echo 0 > /sys/module/jbd2/parameters/jbd2_debug".
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index c78841ee81c..7f34f471616 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -120,8 +120,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 	int nblocks, space_left;
 	/* assert_spin_locked(&journal->j_state_lock); */
 
-	nblocks = jbd_space_needed(journal);
-	while (__jbd2_log_space_left(journal) < nblocks) {
+	nblocks = jbd2_space_needed(journal);
+	while (jbd2_log_space_left(journal) < nblocks) {
 		if (journal->j_flags & JBD2_ABORT)
 			return;
 		write_unlock(&journal->j_state_lock);
@@ -140,8 +140,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 		 */
 		write_lock(&journal->j_state_lock);
 		spin_lock(&journal->j_list_lock);
-		nblocks = jbd_space_needed(journal);
-		space_left = __jbd2_log_space_left(journal);
+		nblocks = jbd2_space_needed(journal);
+		space_left = jbd2_log_space_left(journal);
 		if (space_left < nblocks) {
 			int chkpt = journal->j_checkpoint_transactions != NULL;
 			tid_t tid = 0;
@@ -156,7 +156,15 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 				/* We were able to recover space; yay! */
 				;
 			} else if (tid) {
+				/*
+				 * jbd2_journal_commit_transaction() may want
+				 * to take the checkpoint_mutex if JBD2_FLUSHED
+				 * is set.  So we need to temporarily drop it.
+				 */
+				mutex_unlock(&journal->j_checkpoint_mutex);
 				jbd2_log_wait_commit(journal, tid);
+				write_lock(&journal->j_state_lock);
+				continue;
 			} else {
 				printk(KERN_ERR "%s: needed %d blocks and "
 				       "only had %d space available\n",
@@ -625,10 +633,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 
 	__jbd2_journal_drop_transaction(journal, transaction);
 	jbd2_journal_free_transaction(transaction);
-
-	/* Just in case anybody was waiting for more transactions to be
-           checkpointed... */
-	wake_up(&journal->j_wait_logspace);
 	ret = 1;
 out:
 	return ret;
@@ -690,9 +694,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
 	J_ASSERT(transaction->t_state == T_FINISHED);
 	J_ASSERT(transaction->t_buffers == NULL);
 	J_ASSERT(transaction->t_forget == NULL);
-	J_ASSERT(transaction->t_iobuf_list == NULL);
 	J_ASSERT(transaction->t_shadow_list == NULL);
-	J_ASSERT(transaction->t_log_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
 	J_ASSERT(atomic_read(&transaction->t_updates) == 0);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 3091d42992f..559bec1a37b 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -30,15 +30,22 @@
 #include <trace/events/jbd2.h>
 
 /*
- * Default IO end handler for temporary BJ_IO buffer_heads.
+ * IO end handler for temporary buffer_heads handling writes to the journal.
  */
 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 {
+	struct buffer_head *orig_bh = bh->b_private;
+
 	BUFFER_TRACE(bh, "");
 	if (uptodate)
 		set_buffer_uptodate(bh);
 	else
 		clear_buffer_uptodate(bh);
+	if (orig_bh) {
+		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
+		smp_mb__after_clear_bit();
+		wake_up_bit(&orig_bh->b_state, BH_Shadow);
+	}
 	unlock_buffer(bh);
 }
 
@@ -85,8 +92,7 @@ nope:
 	__brelse(bh);
 }
 
-static void jbd2_commit_block_csum_set(journal_t *j,
-				       struct journal_head *descriptor)
+static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
 {
 	struct commit_header *h;
 	__u32 csum;
@@ -94,12 +100,11 @@ static void jbd2_commit_block_csum_set(journal_t *j,
 	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 		return;
 
-	h = (struct commit_header *)(jh2bh(descriptor)->b_data);
+	h = (struct commit_header *)(bh->b_data);
 	h->h_chksum_type = 0;
 	h->h_chksum_size = 0;
 	h->h_chksum[0] = 0;
-	csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
-			   j->j_blocksize);
+	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
 	h->h_chksum[0] = cpu_to_be32(csum);
 }
 
@@ -116,7 +121,6 @@ static int journal_submit_commit_record(journal_t *journal,
 					struct buffer_head **cbh,
 					__u32 crc32_sum)
 {
-	struct journal_head *descriptor;
 	struct commit_header *tmp;
 	struct buffer_head *bh;
 	int ret;
@@ -127,12 +131,10 @@ static int journal_submit_commit_record(journal_t *journal,
 	if (is_journal_aborted(journal))
 		return 0;
 
-	descriptor = jbd2_journal_get_descriptor_buffer(journal);
-	if (!descriptor)
+	bh = jbd2_journal_get_descriptor_buffer(journal);
+	if (!bh)
 		return 1;
 
-	bh = jh2bh(descriptor);
-
 	tmp = (struct commit_header *)bh->b_data;
 	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
@@ -146,9 +148,9 @@ static int journal_submit_commit_record(journal_t *journal,
 		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
 		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
 	}
-	jbd2_commit_block_csum_set(journal, descriptor);
+	jbd2_commit_block_csum_set(journal, bh);
 
-	JBUFFER_TRACE(descriptor, "submit commit block");
+	BUFFER_TRACE(bh, "submit commit block");
 	lock_buffer(bh);
 	clear_buffer_dirty(bh);
 	set_buffer_uptodate(bh);
@@ -180,7 +182,6 @@ static int journal_wait_on_commit_record(journal_t *journal,
 	if (unlikely(!buffer_uptodate(bh)))
 		ret = -EIO;
 	put_bh(bh);            /* One for getblk() */
-	jbd2_journal_put_journal_head(bh2jh(bh));
 
 	return ret;
 }
@@ -321,7 +322,7 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 }
 
 static void jbd2_descr_block_csum_set(journal_t *j,
-				      struct journal_head *descriptor)
+				      struct buffer_head *bh)
 {
 	struct jbd2_journal_block_tail *tail;
 	__u32 csum;
@@ -329,12 +330,10 @@ static void jbd2_descr_block_csum_set(journal_t *j,
 	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 		return;
 
-	tail = (struct jbd2_journal_block_tail *)
-			(jh2bh(descriptor)->b_data + j->j_blocksize -
+	tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
 			sizeof(struct jbd2_journal_block_tail));
 	tail->t_checksum = 0;
-	csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
-			   j->j_blocksize);
+	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
 	tail->t_checksum = cpu_to_be32(csum);
 }
 
@@ -343,20 +342,21 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
 {
 	struct page *page = bh->b_page;
 	__u8 *addr;
-	__u32 csum;
+	__u32 csum32;
 
 	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 		return;
 
 	sequence = cpu_to_be32(sequence);
 	addr = kmap_atomic(page);
-	csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
-			  sizeof(sequence));
-	csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data),
-			  bh->b_size);
+	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+			     sizeof(sequence));
+	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
+			     bh->b_size);
 	kunmap_atomic(addr);
 
-	tag->t_checksum = cpu_to_be32(csum);
+	/* We only have space to store the lower 16 bits of the crc32c. */
+	tag->t_checksum = cpu_to_be16(csum32);
 }
 /*
  * jbd2_journal_commit_transaction
@@ -368,7 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 {
 	struct transaction_stats_s stats;
 	transaction_t *commit_transaction;
-	struct journal_head *jh, *new_jh, *descriptor;
+	struct journal_head *jh;
+	struct buffer_head *descriptor;
 	struct buffer_head **wbuf = journal->j_wbuf;
 	int bufs;
 	int flags;
@@ -382,7 +383,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	int space_left = 0;
 	int first_tag = 0;
 	int tag_flag;
-	int i, to_free = 0;
+	int i;
 	int tag_bytes = journal_tag_bytes(journal);
 	struct buffer_head *cbh = NULL; /* For transactional checksums */
 	__u32 crc32_sum = ~0;
@@ -392,6 +393,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	tid_t first_tid;
 	int update_tail;
 	int csum_size = 0;
+	LIST_HEAD(io_bufs);
+	LIST_HEAD(log_bufs);
 
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 		csum_size = sizeof(struct jbd2_journal_block_tail);
@@ -424,18 +427,23 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	J_ASSERT(journal->j_committing_transaction == NULL);
 
 	commit_transaction = journal->j_running_transaction;
-	J_ASSERT(commit_transaction->t_state == T_RUNNING);
 
 	trace_jbd2_start_commit(journal, commit_transaction);
 	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
 			commit_transaction->t_tid);
 
 	write_lock(&journal->j_state_lock);
+	J_ASSERT(commit_transaction->t_state == T_RUNNING);
 	commit_transaction->t_state = T_LOCKED;
 
 	trace_jbd2_commit_locking(journal, commit_transaction);
 	stats.run.rs_wait = commit_transaction->t_max_wait;
+	stats.run.rs_request_delay = 0;
 	stats.run.rs_locked = jiffies;
+	if (commit_transaction->t_requested)
+		stats.run.rs_request_delay =
+			jbd2_time_diff(commit_transaction->t_requested,
+				       stats.run.rs_locked);
 	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 					      stats.run.rs_locked);
 
@@ -515,6 +523,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 */
 	jbd2_journal_switch_revoke_table(journal);
 
+	/*
+	 * Reserved credits cannot be claimed anymore, free them
+	 */
+	atomic_sub(atomic_read(&journal->j_reserved_credits),
+		   &commit_transaction->t_outstanding_credits);
+
 	trace_jbd2_commit_flushing(journal, commit_transaction);
 	stats.run.rs_flushing = jiffies;
 	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -528,7 +542,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	wake_up(&journal->j_wait_transaction_locked);
 	write_unlock(&journal->j_state_lock);
 
-	jbd_debug(3, "JBD2: commit phase 2\n");
+	jbd_debug(3, "JBD2: commit phase 2a\n");
 
 	/*
 	 * Now start flushing things to disk, in the order they appear
@@ -540,10 +554,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
 	blk_start_plug(&plug);
 	jbd2_journal_write_revoke_records(journal, commit_transaction,
-					  WRITE_SYNC);
+					  &log_bufs, WRITE_SYNC);
 	blk_finish_plug(&plug);
 
-	jbd_debug(3, "JBD2: commit phase 2\n");
+	jbd_debug(3, "JBD2: commit phase 2b\n");
 
 	/*
 	 * Way to go: we have now written out all of the data for a
@@ -566,8 +580,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		 atomic_read(&commit_transaction->t_outstanding_credits));
 
 	err = 0;
-	descriptor = NULL;
 	bufs = 0;
+	descriptor = NULL;
 	blk_start_plug(&plug);
 	while (commit_transaction->t_buffers) {
 
@@ -599,8 +613,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		   record the metadata buffer. */
 
 		if (!descriptor) {
-			struct buffer_head *bh;
-
 			J_ASSERT (bufs == 0);
 
 			jbd_debug(4, "JBD2: get descriptor\n");
@@ -611,26 +623,26 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 				continue;
 			}
 
-			bh = jh2bh(descriptor);
 			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
-				(unsigned long long)bh->b_blocknr, bh->b_data);
-			header = (journal_header_t *)&bh->b_data[0];
+				(unsigned long long)descriptor->b_blocknr,
+				descriptor->b_data);
+			header = (journal_header_t *)descriptor->b_data;
 			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 
-			tagp = &bh->b_data[sizeof(journal_header_t)];
-			space_left = bh->b_size - sizeof(journal_header_t);
+			tagp = &descriptor->b_data[sizeof(journal_header_t)];
+			space_left = descriptor->b_size -
+						sizeof(journal_header_t);
 			first_tag = 1;
-			set_buffer_jwrite(bh);
-			set_buffer_dirty(bh);
-			wbuf[bufs++] = bh;
+			set_buffer_jwrite(descriptor);
+			set_buffer_dirty(descriptor);
+			wbuf[bufs++] = descriptor;
 
 			/* Record it so that we can wait for IO
                            completion later */
-			BUFFER_TRACE(bh, "ph3: file as descriptor");
-			jbd2_journal_file_buffer(descriptor, commit_transaction,
-					BJ_LogCtl);
+			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
+			jbd2_file_log_bh(&log_bufs, descriptor);
 		}
 
 		/* Where is the buffer to be written? */
@@ -653,29 +665,22 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
 		/* Bump b_count to prevent truncate from stumbling over
                    the shadowed buffer!  @@@ This can go if we ever get
-                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
+                   rid of the shadow pairing of buffers. */
 		atomic_inc(&jh2bh(jh)->b_count);
 
-		/* Make a temporary IO buffer with which to write it out
-                   (this will requeue both the metadata buffer and the
-                   temporary IO buffer). new_bh goes on BJ_IO*/
-
-		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 		/*
-		 * akpm: jbd2_journal_write_metadata_buffer() sets
-		 * new_bh->b_transaction to commit_transaction.
-		 * We need to clean this up before we release new_bh
-		 * (which is of type BJ_IO)
+		 * Make a temporary IO buffer with which to write it out
+		 * (this will requeue the metadata buffer to BJ_Shadow).
 		 */
+		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 		JBUFFER_TRACE(jh, "ph3: write metadata");
 		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
-						      jh, &new_jh, blocknr);
+						jh, &wbuf[bufs], blocknr);
 		if (flags < 0) {
 			jbd2_journal_abort(journal, flags);
 			continue;
 		}
-		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
-		wbuf[bufs++] = jh2bh(new_jh);
+		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
 
 		/* Record the new block's tag in the current descriptor
                    buffer */
@@ -689,10 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		tag = (journal_block_tag_t *) tagp;
 		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 		tag->t_flags = cpu_to_be16(tag_flag);
-		jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh),
+		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
 					commit_transaction->t_tid);
 		tagp += tag_bytes;
 		space_left -= tag_bytes;
+		bufs++;
 
 		if (first_tag) {
 			memcpy (tagp, journal->j_uuid, 16);
@@ -804,7 +810,7 @@ start_journal_io:
            the log.  Before we can commit it, wait for the IO so far to
            complete.  Control buffers being written are on the
            transaction's t_log_list queue, and metadata buffers are on
-           the t_iobuf_list queue.
+           the io_bufs list.
 
 	   Wait for the buffers in reverse order.  That way we are
 	   less likely to be woken up until all IOs have completed, and
@@ -813,47 +819,33 @@ start_journal_io:
 
 	jbd_debug(3, "JBD2: commit phase 3\n");
 
-	/*
-	 * akpm: these are BJ_IO, and j_list_lock is not needed.
-	 * See __journal_try_to_free_buffer.
-	 */
-wait_for_iobuf:
-	while (commit_transaction->t_iobuf_list != NULL) {
-		struct buffer_head *bh;
+	while (!list_empty(&io_bufs)) {
+		struct buffer_head *bh = list_entry(io_bufs.prev,
+						    struct buffer_head,
+						    b_assoc_buffers);
 
-		jh = commit_transaction->t_iobuf_list->b_tprev;
-		bh = jh2bh(jh);
-		if (buffer_locked(bh)) {
-			wait_on_buffer(bh);
-			goto wait_for_iobuf;
-		}
-		if (cond_resched())
-			goto wait_for_iobuf;
+		wait_on_buffer(bh);
+		cond_resched();
 
 		if (unlikely(!buffer_uptodate(bh)))
 			err = -EIO;
-
-		clear_buffer_jwrite(bh);
-
-		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
-		jbd2_journal_unfile_buffer(journal, jh);
+		jbd2_unfile_log_bh(bh);
 
 		/*
-		 * ->t_iobuf_list should contain only dummy buffer_heads
-		 * which were created by jbd2_journal_write_metadata_buffer().
+		 * The list contains temporary buffer heads created by
+		 * jbd2_journal_write_metadata_buffer().
 		 */
 		BUFFER_TRACE(bh, "dumping temporary bh");
-		jbd2_journal_put_journal_head(jh);
 		__brelse(bh);
 		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 		free_buffer_head(bh);
 
-		/* We also have to unlock and free the corresponding
-                   shadowed buffer */
+		/* We also have to refile the corresponding shadowed buffer */
 		jh = commit_transaction->t_shadow_list->b_tprev;
 		bh = jh2bh(jh);
-		clear_bit(BH_JWrite, &bh->b_state);
+		clear_buffer_jwrite(bh);
 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
+		J_ASSERT_BH(bh, !buffer_shadow(bh));
 
 		/* The metadata is now released for reuse, but we need
                    to remember it against this transaction so that when
@@ -861,14 +853,6 @@ wait_for_iobuf:
                    required. */
 		JBUFFER_TRACE(jh, "file as BJ_Forget");
 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
-		/*
-		 * Wake up any transactions which were waiting for this IO to
-		 * complete. The barrier must be here so that changes by
-		 * jbd2_journal_file_buffer() take effect before wake_up_bit()
-		 * does the waitqueue check.
-		 */
-		smp_mb();
-		wake_up_bit(&bh->b_state, BH_Unshadow);
 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
 		__brelse(bh);
 	}
@@ -878,26 +862,19 @@ wait_for_iobuf:
 	jbd_debug(3, "JBD2: commit phase 4\n");
 
 	/* Here we wait for the revoke record and descriptor record buffers */
- wait_for_ctlbuf:
-	while (commit_transaction->t_log_list != NULL) {
+	while (!list_empty(&log_bufs)) {
 		struct buffer_head *bh;
 
-		jh = commit_transaction->t_log_list->b_tprev;
-		bh = jh2bh(jh);
-		if (buffer_locked(bh)) {
-			wait_on_buffer(bh);
-			goto wait_for_ctlbuf;
-		}
-		if (cond_resched())
-			goto wait_for_ctlbuf;
+		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
+		wait_on_buffer(bh);
+		cond_resched();
 
 		if (unlikely(!buffer_uptodate(bh)))
 			err = -EIO;
 
 		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 		clear_buffer_jwrite(bh);
-		jbd2_journal_unfile_buffer(journal, jh);
-		jbd2_journal_put_journal_head(jh);
+		jbd2_unfile_log_bh(bh);
 		__brelse(bh);		/* One for getblk */
 		/* AKPM: bforget here */
 	}
@@ -947,9 +924,7 @@ wait_for_iobuf:
 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 	J_ASSERT(commit_transaction->t_buffers == NULL);
 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
-	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
-	J_ASSERT(commit_transaction->t_log_list == NULL);
 
 restart_loop:
 	/*
@@ -1116,7 +1091,10 @@ restart_loop:
 	 */
 	spin_lock(&journal->j_history_lock);
 	journal->j_stats.ts_tid++;
+	if (commit_transaction->t_requested)
+		journal->j_stats.ts_requested++;
 	journal->j_stats.run.rs_wait += stats.run.rs_wait;
+	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
 	journal->j_stats.run.rs_running += stats.run.rs_running;
 	journal->j_stats.run.rs_locked += stats.run.rs_locked;
 	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
@@ -1126,7 +1104,7 @@ restart_loop:
 	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
 	spin_unlock(&journal->j_history_lock);
 
-	commit_transaction->t_state = T_FINISHED;
+	commit_transaction->t_state = T_COMMIT_CALLBACK;
 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
 	journal->j_commit_sequence = commit_transaction->t_tid;
 	journal->j_committing_transaction = NULL;
@@ -1141,38 +1119,44 @@ restart_loop:
 				journal->j_average_commit_time*3) / 4;
 	else
 		journal->j_average_commit_time = commit_time;
+
 	write_unlock(&journal->j_state_lock);
 
-	if (commit_transaction->t_checkpoint_list == NULL &&
-	    commit_transaction->t_checkpoint_io_list == NULL) {
-		__jbd2_journal_drop_transaction(journal, commit_transaction);
-		to_free = 1;
+	if (journal->j_checkpoint_transactions == NULL) {
+		journal->j_checkpoint_transactions = commit_transaction;
+		commit_transaction->t_cpnext = commit_transaction;
+		commit_transaction->t_cpprev = commit_transaction;
 	} else {
-		if (journal->j_checkpoint_transactions == NULL) {
-			journal->j_checkpoint_transactions = commit_transaction;
-			commit_transaction->t_cpnext = commit_transaction;
-			commit_transaction->t_cpprev = commit_transaction;
-		} else {
-			commit_transaction->t_cpnext =
-				journal->j_checkpoint_transactions;
-			commit_transaction->t_cpprev =
-				commit_transaction->t_cpnext->t_cpprev;
-			commit_transaction->t_cpnext->t_cpprev =
-				commit_transaction;
-			commit_transaction->t_cpprev->t_cpnext =
+		commit_transaction->t_cpnext =
+			journal->j_checkpoint_transactions;
+		commit_transaction->t_cpprev =
+			commit_transaction->t_cpnext->t_cpprev;
+		commit_transaction->t_cpnext->t_cpprev =
+			commit_transaction;
+		commit_transaction->t_cpprev->t_cpnext =
 				commit_transaction;
-		}
 	}
 	spin_unlock(&journal->j_list_lock);
-
+	/* Drop all spin_locks because commit_callback may be block.
+	 * __journal_remove_checkpoint() can not destroy transaction
+	 * under us because it is not marked as T_FINISHED yet */
 	if (journal->j_commit_callback)
 		journal->j_commit_callback(journal, commit_transaction);
 
 	trace_jbd2_end_commit(journal, commit_transaction);
 	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
-	if (to_free)
-		jbd2_journal_free_transaction(commit_transaction);
 
+	write_lock(&journal->j_state_lock);
+	spin_lock(&journal->j_list_lock);
+	commit_transaction->t_state = T_FINISHED;
+	/* Recheck checkpoint lists after j_list_lock was dropped */
+	if (commit_transaction->t_checkpoint_list == NULL &&
+	    commit_transaction->t_checkpoint_io_list == NULL) {
+		__jbd2_journal_drop_transaction(journal, commit_transaction);
+		jbd2_journal_free_transaction(commit_transaction);
+	}
+	spin_unlock(&journal->j_list_lock);
+	write_unlock(&journal->j_state_lock);
 	wake_up(&journal->j_wait_done_commit);
 }
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index dbf41f9452d..02c7ad9d7a4 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -35,7 +35,6 @@
 #include <linux/kthread.h>
 #include <linux/poison.h>
 #include <linux/proc_fs.h>
-#include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/math64.h>
 #include <linux/hash.h>
@@ -51,6 +50,14 @@
 #include <asm/uaccess.h>
 #include <asm/page.h>
 
+#ifdef CONFIG_JBD2_DEBUG
+ushort jbd2_journal_enable_debug __read_mostly;
+EXPORT_SYMBOL(jbd2_journal_enable_debug);
+
+module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
+MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
+#endif
+
 EXPORT_SYMBOL(jbd2_journal_extend);
 EXPORT_SYMBOL(jbd2_journal_stop);
 EXPORT_SYMBOL(jbd2_journal_lock_updates);
@@ -96,6 +103,24 @@ EXPORT_SYMBOL(jbd2_inode_cache);
 static void __journal_abort_soft (journal_t *journal, int errno);
 static int jbd2_journal_create_slab(size_t slab_size);
 
+#ifdef CONFIG_JBD2_DEBUG
+void __jbd2_debug(int level, const char *file, const char *func,
+		  unsigned int line, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	if (level > jbd2_journal_enable_debug)
+		return;
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
+	va_end(args);
+}
+EXPORT_SYMBOL(__jbd2_debug);
+#endif
+
 /* Checksumming functions */
 int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
 {
@@ -303,14 +328,12 @@ static void journal_kill_thread(journal_t *journal)
  *
  * If the source buffer has already been modified by a new transaction
  * since we took the last commit snapshot, we use the frozen copy of
- * that data for IO.  If we end up using the existing buffer_head's data
- * for the write, then we *have* to lock the buffer to prevent anyone
- * else from using and possibly modifying it while the IO is in
- * progress.
- *
- * The function returns a pointer to the buffer_heads to be used for IO.
+ * that data for IO. If we end up using the existing buffer_head's data
+ * for the write, then we have to make sure nobody modifies it while the
+ * IO is in progress. do_get_write_access() handles this.
  *
- * We assume that the journal has already been locked in this function.
+ * The function returns a pointer to the buffer_head to be used for IO.
+ * 
  *
  * Return value:
  *  <0: Error
@@ -323,15 +346,14 @@ static void journal_kill_thread(journal_t *journal)
 
 int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 				  struct journal_head  *jh_in,
-				  struct journal_head **jh_out,
-				  unsigned long long blocknr)
+				  struct buffer_head **bh_out,
+				  sector_t blocknr)
 {
 	int need_copy_out = 0;
 	int done_copy_out = 0;
 	int do_escape = 0;
 	char *mapped_data;
 	struct buffer_head *new_bh;
-	struct journal_head *new_jh;
 	struct page *new_page;
 	unsigned int new_offset;
 	struct buffer_head *bh_in = jh2bh(jh_in);
@@ -360,17 +382,14 @@ retry_alloc:
 	}
 
 	/* keep subsequent assertions sane */
-	new_bh->b_state = 0;
-	init_buffer(new_bh, NULL, NULL);
 	atomic_set(&new_bh->b_count, 1);
-	new_jh = jbd2_journal_add_journal_head(new_bh);	/* This sleeps */
 
+	jbd_lock_bh_state(bh_in);
+repeat:
 	/*
 	 * If a new transaction has already done a buffer copy-out, then
 	 * we use that version of the data for the commit.
 	 */
-	jbd_lock_bh_state(bh_in);
-repeat:
 	if (jh_in->b_frozen_data) {
 		done_copy_out = 1;
 		new_page = virt_to_page(jh_in->b_frozen_data);
@@ -410,7 +429,7 @@ repeat:
 		jbd_unlock_bh_state(bh_in);
 		tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
 		if (!tmp) {
-			jbd2_journal_put_journal_head(new_jh);
+			brelse(new_bh);
 			return -ENOMEM;
 		}
 		jbd_lock_bh_state(bh_in);
@@ -421,7 +440,7 @@ repeat:
 
 		jh_in->b_frozen_data = tmp;
 		mapped_data = kmap_atomic(new_page);
-		memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
+		memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
 		kunmap_atomic(mapped_data);
 
 		new_page = virt_to_page(tmp);
@@ -447,14 +466,14 @@ repeat:
 	}
 
 	set_bh_page(new_bh, new_page, new_offset);
-	new_jh->b_transaction = NULL;
-	new_bh->b_size = jh2bh(jh_in)->b_size;
-	new_bh->b_bdev = transaction->t_journal->j_dev;
+	new_bh->b_size = bh_in->b_size;
+	new_bh->b_bdev = journal->j_dev;
 	new_bh->b_blocknr = blocknr;
+	new_bh->b_private = bh_in;
 	set_buffer_mapped(new_bh);
 	set_buffer_dirty(new_bh);
 
-	*jh_out = new_jh;
+	*bh_out = new_bh;
 
 	/*
 	 * The to-be-written buffer needs to get moved to the io queue,
@@ -465,11 +484,9 @@ repeat:
 	spin_lock(&journal->j_list_lock);
 	__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
 	spin_unlock(&journal->j_list_lock);
+	set_buffer_shadow(bh_in);
 	jbd_unlock_bh_state(bh_in);
 
-	JBUFFER_TRACE(new_jh, "file as BJ_IO");
-	jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
-
 	return do_escape | (done_copy_out << 1);
 }
 
@@ -479,40 +496,15 @@ repeat:
  */
 
 /*
- * __jbd2_log_space_left: Return the number of free blocks left in the journal.
- *
- * Called with the journal already locked.
- *
- * Called under j_state_lock
- */
-
-int __jbd2_log_space_left(journal_t *journal)
-{
-	int left = journal->j_free;
-
-	/* assert_spin_locked(&journal->j_state_lock); */
-
-	/*
-	 * Be pessimistic here about the number of those free blocks which
-	 * might be required for log descriptor control blocks.
-	 */
-
-#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
-
-	left -= MIN_LOG_RESERVED_BLOCKS;
-
-	if (left <= 0)
-		return 0;
-	left -= (left >> 3);
-	return left;
-}
-
-/*
  * Called with j_state_lock locked for writing.
  * Returns true if a transaction commit was started.
  */
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
+	/* Return if the txn has already requested to be committed */
+	if (journal->j_commit_request == target)
+		return 0;
+
 	/*
 	 * The only transaction we can possibly wait upon is the
 	 * currently running transaction (if it exists).  Otherwise,
@@ -529,6 +521,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 		jbd_debug(1, "JBD2: requesting commit %d/%d\n",
 			  journal->j_commit_request,
 			  journal->j_commit_sequence);
+		journal->j_running_transaction->t_requested = jiffies;
 		wake_up(&journal->j_wait_commit);
 		return 1;
 	} else if (!tid_geq(journal->j_commit_request, target))
@@ -554,20 +547,17 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
 }
 
 /*
- * Force and wait upon a commit if the calling process is not within
- * transaction.  This is used for forcing out undo-protected data which contains
- * bitmaps, when the fs is running out of space.
- *
- * We can only force the running transaction if we don't have an active handle;
- * otherwise, we will deadlock.
- *
- * Returns true if a transaction was started.
+ * Force and wait any uncommitted transactions.  We can only force the running
+ * transaction if we don't have an active handle, otherwise, we will deadlock.
+ * Returns: <0 in case of error,
+ *           0 if nothing to commit,
+ *           1 if transaction was successfully committed.
  */
-int jbd2_journal_force_commit_nested(journal_t *journal)
+static int __jbd2_journal_force_commit(journal_t *journal)
 {
 	transaction_t *transaction = NULL;
 	tid_t tid;
-	int need_to_start = 0;
+	int need_to_start = 0, ret = 0;
 
 	read_lock(&journal->j_state_lock);
 	if (journal->j_running_transaction && !current->journal_info) {
@@ -578,16 +568,53 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 		transaction = journal->j_committing_transaction;
 
 	if (!transaction) {
+		/* Nothing to commit */
 		read_unlock(&journal->j_state_lock);
-		return 0;	/* Nothing to retry */
+		return 0;
 	}
-
 	tid = transaction->t_tid;
 	read_unlock(&journal->j_state_lock);
 	if (need_to_start)
 		jbd2_log_start_commit(journal, tid);
-	jbd2_log_wait_commit(journal, tid);
-	return 1;
+	ret = jbd2_log_wait_commit(journal, tid);
+	if (!ret)
+		ret = 1;
+
+	return ret;
+}
+
+/**
+ * Force and wait upon a commit if the calling process is not within
+ * transaction.  This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
+ *
+ * @journal: journal to force
+ * Returns true if progress was made.
+ */
+int jbd2_journal_force_commit_nested(journal_t *journal)
+{
+	int ret;
+
+	ret = __jbd2_journal_force_commit(journal);
+	return ret > 0;
+}
+
+/**
+ * int journal_force_commit() - force any uncommitted transactions
+ * @journal: journal to force
+ *
+ * Caller want unconditional commit. We can only force the running transaction
+ * if we don't have an active handle, otherwise, we will deadlock.
+ */
+int jbd2_journal_force_commit(journal_t *journal)
+{
+	int ret;
+
+	J_ASSERT(!current->journal_info);
+	ret = __jbd2_journal_force_commit(journal);
+	if (ret > 0)
+		ret = 0;
+	return ret;
 }
 
 /*
@@ -698,6 +725,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 }
 
 /*
+ * When this function returns the transaction corresponding to tid
+ * will be completed.  If the transaction has currently running, start
+ * committing that transaction before waiting for it to complete.  If
+ * the transaction id is stale, it is by definition already completed,
+ * so just return SUCCESS.
+ */
+int jbd2_complete_transaction(journal_t *journal, tid_t tid)
+{
+	int	need_to_wait = 1;
+
+	read_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == tid) {
+		if (journal->j_commit_request != tid) {
+			/* transaction not yet started, so request it */
+			read_unlock(&journal->j_state_lock);
+			jbd2_log_start_commit(journal, tid);
+			goto wait_commit;
+		}
+	} else if (!(journal->j_committing_transaction &&
+		     journal->j_committing_transaction->t_tid == tid))
+		need_to_wait = 0;
+	read_unlock(&journal->j_state_lock);
+	if (!need_to_wait)
+		return 0;
+wait_commit:
+	return jbd2_log_wait_commit(journal, tid);
+}
+EXPORT_SYMBOL(jbd2_complete_transaction);
+
+/*
  * Log buffer allocation routines:
  */
 
@@ -757,7 +815,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
  * But we don't bother doing that, so there will be coherency problems with
  * mmaps of blockdevs which hold live JBD-controlled filesystems.
  */
-struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
+struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
 {
 	struct buffer_head *bh;
 	unsigned long long blocknr;
@@ -776,7 +834,7 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
 	BUFFER_TRACE(bh, "return this buffer");
-	return jbd2_journal_add_journal_head(bh);
+	return bh;
 }
 
 /*
@@ -894,13 +952,18 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
 
 	if (v != SEQ_START_TOKEN)
 		return 0;
-	seq_printf(seq, "%lu transaction, each up to %u blocks\n",
-			s->stats->ts_tid,
-			s->journal->j_max_transaction_buffers);
+	seq_printf(seq, "%lu transactions (%lu requested), "
+		   "each up to %u blocks\n",
+		   s->stats->ts_tid, s->stats->ts_requested,
+		   s->journal->j_max_transaction_buffers);
 	if (s->stats->ts_tid == 0)
 		return 0;
 	seq_printf(seq, "average: \n  %ums waiting for transaction\n",
 	    jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
+	seq_printf(seq, "  %ums request delay\n",
+	    (s->stats->ts_requested == 0) ? 0 :
+	    jiffies_to_msecs(s->stats->run.rs_request_delay /
+			     s->stats->ts_requested));
 	seq_printf(seq, "  %ums running transaction\n",
 	    jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
 	seq_printf(seq, "  %ums transaction was being locked\n",
@@ -933,7 +996,7 @@ static const struct seq_operations jbd2_seq_info_ops = {
 
 static int jbd2_seq_info_open(struct inode *inode, struct file *file)
 {
-	journal_t *journal = PDE(inode)->data;
+	journal_t *journal = PDE_DATA(inode);
 	struct jbd2_stats_proc_session *s;
 	int rc, size;
 
@@ -1016,11 +1079,10 @@ static journal_t * journal_init_common (void)
 		return NULL;
 
 	init_waitqueue_head(&journal->j_wait_transaction_locked);
-	init_waitqueue_head(&journal->j_wait_logspace);
 	init_waitqueue_head(&journal->j_wait_done_commit);
-	init_waitqueue_head(&journal->j_wait_checkpoint);
 	init_waitqueue_head(&journal->j_wait_commit);
 	init_waitqueue_head(&journal->j_wait_updates);
+	init_waitqueue_head(&journal->j_wait_reserved);
 	mutex_init(&journal->j_barrier);
 	mutex_init(&journal->j_checkpoint_mutex);
 	spin_lock_init(&journal->j_revoke_lock);
@@ -1030,6 +1092,7 @@ static journal_t * journal_init_common (void)
 	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
 	journal->j_min_batch_time = 0;
 	journal->j_max_batch_time = 15000; /* 15ms */
+	atomic_set(&journal->j_reserved_credits, 0);
 
 	/* The journal is marked for error until we succeed with recovery! */
 	journal->j_flags = JBD2_ABORT;
@@ -1272,6 +1335,7 @@ static int journal_reset(journal_t *journal)
 static void jbd2_write_superblock(journal_t *journal, int write_op)
 {
 	struct buffer_head *bh = journal->j_sb_buffer;
+	journal_superblock_t *sb = journal->j_superblock;
 	int ret;
 
 	trace_jbd2_write_superblock(journal, write_op);
@@ -1293,6 +1357,7 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
 		clear_buffer_write_io_error(bh);
 		set_buffer_uptodate(bh);
 	}
+	jbd2_superblock_csum_set(journal, sb);
 	get_bh(bh);
 	bh->b_end_io = end_buffer_write_sync;
 	ret = submit_bh(write_op, bh);
@@ -1389,7 +1454,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
 	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
 		  journal->j_errno);
 	sb->s_errno    = cpu_to_be32(journal->j_errno);
-	jbd2_superblock_csum_set(journal, sb);
 	read_unlock(&journal->j_state_lock);
 
 	jbd2_write_superblock(journal, WRITE_SYNC);
@@ -2279,13 +2343,13 @@ static struct journal_head *journal_alloc_journal_head(void)
 #ifdef CONFIG_JBD2_DEBUG
 	atomic_inc(&nr_journal_heads);
 #endif
-	ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+	ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
 	if (!ret) {
 		jbd_debug(1, "out of memory for journal_head\n");
 		pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
 		while (!ret) {
 			yield();
-			ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+			ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
 		}
 	}
 	return ret;
@@ -2347,10 +2411,8 @@ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
 	struct journal_head *new_jh = NULL;
 
 repeat:
-	if (!buffer_jbd(bh)) {
+	if (!buffer_jbd(bh))
 		new_jh = journal_alloc_journal_head();
-		memset(new_jh, 0, sizeof(*new_jh));
-	}
 
 	jbd_lock_bh_journal_head(bh);
 	if (buffer_jbd(bh)) {
@@ -2485,45 +2547,6 @@ restart:
 	spin_unlock(&journal->j_list_lock);
 }
 
-/*
- * debugfs tunables
- */
-#ifdef CONFIG_JBD2_DEBUG
-u8 jbd2_journal_enable_debug __read_mostly;
-EXPORT_SYMBOL(jbd2_journal_enable_debug);
-
-#define JBD2_DEBUG_NAME "jbd2-debug"
-
-static struct dentry *jbd2_debugfs_dir;
-static struct dentry *jbd2_debug;
-
-static void __init jbd2_create_debugfs_entry(void)
-{
-	jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL);
-	if (jbd2_debugfs_dir)
-		jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME,
-					       S_IRUGO | S_IWUSR,
-					       jbd2_debugfs_dir,
-					       &jbd2_journal_enable_debug);
-}
-
-static void __exit jbd2_remove_debugfs_entry(void)
-{
-	debugfs_remove(jbd2_debug);
-	debugfs_remove(jbd2_debugfs_dir);
-}
-
-#else
-
-static void __init jbd2_create_debugfs_entry(void)
-{
-}
-
-static void __exit jbd2_remove_debugfs_entry(void)
-{
-}
-
-#endif
 
 #ifdef CONFIG_PROC_FS
 
@@ -2609,7 +2632,6 @@ static int __init journal_init(void)
 
 	ret = journal_init_caches();
 	if (ret == 0) {
-		jbd2_create_debugfs_entry();
 		jbd2_create_jbd_stats_proc_entry();
 	} else {
 		jbd2_journal_destroy_caches();
@@ -2624,7 +2646,6 @@ static void __exit journal_exit(void)
 	if (n)
 		printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n);
 #endif
-	jbd2_remove_debugfs_entry();
 	jbd2_remove_jbd_stats_proc_entry();
 	jbd2_journal_destroy_caches();
 }
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 626846bac32..d4851464b57 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -399,18 +399,17 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
 static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
 				      void *buf, __u32 sequence)
 {
-	__u32 provided, calculated;
+	__u32 csum32;
 
 	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 		return 1;
 
 	sequence = cpu_to_be32(sequence);
-	calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
-				 sizeof(sequence));
-	calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize);
-	provided = be32_to_cpu(tag->t_checksum);
+	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+			     sizeof(sequence));
+	csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
 
-	return provided == cpu_to_be32(calculated);
+	return tag->t_checksum == cpu_to_be16(csum32);
 }
 
 static int do_one_pass(journal_t *journal,
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f30b80b4ce8..198c9c10276 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,9 +122,10 @@ struct jbd2_revoke_table_s
 
 #ifdef __KERNEL__
 static void write_one_revoke_record(journal_t *, transaction_t *,
-				    struct journal_head **, int *,
+				    struct list_head *,
+				    struct buffer_head **, int *,
 				    struct jbd2_revoke_record_s *, int);
-static void flush_descriptor(journal_t *, struct journal_head *, int, int);
+static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
 #endif
 
 /* Utility functions to maintain the revoke table */
@@ -531,9 +532,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
  */
 void jbd2_journal_write_revoke_records(journal_t *journal,
 				       transaction_t *transaction,
+				       struct list_head *log_bufs,
 				       int write_op)
 {
-	struct journal_head *descriptor;
+	struct buffer_head *descriptor;
 	struct jbd2_revoke_record_s *record;
 	struct jbd2_revoke_table_s *revoke;
 	struct list_head *hash_list;
@@ -553,7 +555,7 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
 		while (!list_empty(hash_list)) {
 			record = (struct jbd2_revoke_record_s *)
 				hash_list->next;
-			write_one_revoke_record(journal, transaction,
+			write_one_revoke_record(journal, transaction, log_bufs,
 						&descriptor, &offset,
 						record, write_op);
 			count++;
@@ -573,13 +575,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
 
 static void write_one_revoke_record(journal_t *journal,
 				    transaction_t *transaction,
-				    struct journal_head **descriptorp,
+				    struct list_head *log_bufs,
+				    struct buffer_head **descriptorp,
 				    int *offsetp,
 				    struct jbd2_revoke_record_s *record,
 				    int write_op)
 {
 	int csum_size = 0;
-	struct journal_head *descriptor;
+	struct buffer_head *descriptor;
 	int offset;
 	journal_header_t *header;
 
@@ -609,26 +612,26 @@ static void write_one_revoke_record(journal_t *journal,
 		descriptor = jbd2_journal_get_descriptor_buffer(journal);
 		if (!descriptor)
 			return;
-		header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
+		header = (journal_header_t *)descriptor->b_data;
 		header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 		header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
 		header->h_sequence  = cpu_to_be32(transaction->t_tid);
 
 		/* Record it so that we can wait for IO completion later */
-		JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
-		jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl);
+		BUFFER_TRACE(descriptor, "file in log_bufs");
+		jbd2_file_log_bh(log_bufs, descriptor);
 
 		offset = sizeof(jbd2_journal_revoke_header_t);
 		*descriptorp = descriptor;
 	}
 
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
-		* ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) =
+		* ((__be64 *)(&descriptor->b_data[offset])) =
 			cpu_to_be64(record->blocknr);
 		offset += 8;
 
 	} else {
-		* ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
+		* ((__be32 *)(&descriptor->b_data[offset])) =
 			cpu_to_be32(record->blocknr);
 		offset += 4;
 	}
@@ -636,8 +639,7 @@ static void write_one_revoke_record(journal_t *journal,
 	*offsetp = offset;
 }
 
-static void jbd2_revoke_csum_set(journal_t *j,
-				 struct journal_head *descriptor)
+static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
 {
 	struct jbd2_journal_revoke_tail *tail;
 	__u32 csum;
@@ -645,12 +647,10 @@ static void jbd2_revoke_csum_set(journal_t *j,
 	if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 		return;
 
-	tail = (struct jbd2_journal_revoke_tail *)
-			(jh2bh(descriptor)->b_data + j->j_blocksize -
+	tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
 			sizeof(struct jbd2_journal_revoke_tail));
 	tail->r_checksum = 0;
-	csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
-			   j->j_blocksize);
+	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
 	tail->r_checksum = cpu_to_be32(csum);
 }
 
@@ -662,25 +662,24 @@ static void jbd2_revoke_csum_set(journal_t *j,
  */
 
 static void flush_descriptor(journal_t *journal,
-			     struct journal_head *descriptor,
+			     struct buffer_head *descriptor,
 			     int offset, int write_op)
 {
 	jbd2_journal_revoke_header_t *header;
-	struct buffer_head *bh = jh2bh(descriptor);
 
 	if (is_journal_aborted(journal)) {
-		put_bh(bh);
+		put_bh(descriptor);
 		return;
 	}
 
-	header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
+	header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
 	header->r_count = cpu_to_be32(offset);
 	jbd2_revoke_csum_set(journal, descriptor);
 
-	set_buffer_jwrite(bh);
-	BUFFER_TRACE(bh, "write");
-	set_buffer_dirty(bh);
-	write_dirty_buffer(bh, write_op);
+	set_buffer_jwrite(descriptor);
+	BUFFER_TRACE(descriptor, "write");
+	set_buffer_dirty(descriptor);
+	write_dirty_buffer(descriptor, write_op);
 }
 #endif
 
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index df9f29760ef..7aa9a32573b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -30,6 +30,8 @@
 #include <linux/bug.h>
 #include <linux/module.h>
 
+#include <trace/events/jbd2.h>
+
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
 static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
 
@@ -87,7 +89,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	spin_lock_init(&transaction->t_handle_lock);
 	atomic_set(&transaction->t_updates, 0);
-	atomic_set(&transaction->t_outstanding_credits, 0);
+	atomic_set(&transaction->t_outstanding_credits,
+		   atomic_read(&journal->j_reserved_credits));
 	atomic_set(&transaction->t_handle_count, 0);
 	INIT_LIST_HEAD(&transaction->t_inode_list);
 	INIT_LIST_HEAD(&transaction->t_private_list);
@@ -100,6 +103,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	journal->j_running_transaction = transaction;
 	transaction->t_max_wait = 0;
 	transaction->t_start = jiffies;
+	transaction->t_requested = 0;
 
 	return transaction;
 }
@@ -138,6 +142,112 @@ static inline void update_t_max_wait(transaction_t *transaction,
 }
 
 /*
+ * Wait until running transaction passes T_LOCKED state. Also starts the commit
+ * if needed. The function expects running transaction to exist and releases
+ * j_state_lock.
+ */
+static void wait_transaction_locked(journal_t *journal)
+	__releases(journal->j_state_lock)
+{
+	DEFINE_WAIT(wait);
+	int need_to_start;
+	tid_t tid = journal->j_running_transaction->t_tid;
+
+	prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+			TASK_UNINTERRUPTIBLE);
+	need_to_start = !tid_geq(journal->j_commit_request, tid);
+	read_unlock(&journal->j_state_lock);
+	if (need_to_start)
+		jbd2_log_start_commit(journal, tid);
+	schedule();
+	finish_wait(&journal->j_wait_transaction_locked, &wait);
+}
+
+static void sub_reserved_credits(journal_t *journal, int blocks)
+{
+	atomic_sub(blocks, &journal->j_reserved_credits);
+	wake_up(&journal->j_wait_reserved);
+}
+
+/*
+ * Wait until we can add credits for handle to the running transaction.  Called
+ * with j_state_lock held for reading. Returns 0 if handle joined the running
+ * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
+ * caller must retry.
+ */
+static int add_transaction_credits(journal_t *journal, int blocks,
+				   int rsv_blocks)
+{
+	transaction_t *t = journal->j_running_transaction;
+	int needed;
+	int total = blocks + rsv_blocks;
+
+	/*
+	 * If the current transaction is locked down for commit, wait
+	 * for the lock to be released.
+	 */
+	if (t->t_state == T_LOCKED) {
+		wait_transaction_locked(journal);
+		return 1;
+	}
+
+	/*
+	 * If there is not enough space left in the log to write all
+	 * potential buffers requested by this operation, we need to
+	 * stall pending a log checkpoint to free some more log space.
+	 */
+	needed = atomic_add_return(total, &t->t_outstanding_credits);
+	if (needed > journal->j_max_transaction_buffers) {
+		/*
+		 * If the current transaction is already too large,
+		 * then start to commit it: we can then go back and
+		 * attach this handle to a new transaction.
+		 */
+		atomic_sub(total, &t->t_outstanding_credits);
+		wait_transaction_locked(journal);
+		return 1;
+	}
+
+	/*
+	 * The commit code assumes that it can get enough log space
+	 * without forcing a checkpoint.  This is *critical* for
+	 * correctness: a checkpoint of a buffer which is also
+	 * associated with a committing transaction creates a deadlock,
+	 * so commit simply cannot force through checkpoints.
+	 *
+	 * We must therefore ensure the necessary space in the journal
+	 * *before* starting to dirty potentially checkpointed buffers
+	 * in the new transaction.
+	 */
+	if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+		atomic_sub(total, &t->t_outstanding_credits);
+		read_unlock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
+		if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+			__jbd2_log_wait_for_space(journal);
+		write_unlock(&journal->j_state_lock);
+		return 1;
+	}
+
+	/* No reservation? We are done... */
+	if (!rsv_blocks)
+		return 0;
+
+	needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
+	/* We allow at most half of a transaction to be reserved */
+	if (needed > journal->j_max_transaction_buffers / 2) {
+		sub_reserved_credits(journal, rsv_blocks);
+		atomic_sub(total, &t->t_outstanding_credits);
+		read_unlock(&journal->j_state_lock);
+		wait_event(journal->j_wait_reserved,
+			 atomic_read(&journal->j_reserved_credits) + rsv_blocks
+			 <= journal->j_max_transaction_buffers / 2);
+		return 1;
+	}
+	return 0;
+}
+
+/*
  * start_this_handle: Given a handle, deal with any locking or stalling
  * needed to make sure that there is enough journal space for the handle
  * to begin.  Attach the handle to a transaction and set up the
@@ -148,18 +258,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
 			     gfp_t gfp_mask)
 {
 	transaction_t	*transaction, *new_transaction = NULL;
-	tid_t		tid;
-	int		needed, need_to_start;
-	int		nblocks = handle->h_buffer_credits;
+	int		blocks = handle->h_buffer_credits;
+	int		rsv_blocks = 0;
 	unsigned long ts = jiffies;
 
-	if (nblocks > journal->j_max_transaction_buffers) {
+	/*
+	 * 1/2 of transaction can be reserved so we can practically handle
+	 * only 1/2 of maximum transaction size per operation
+	 */
+	if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
 		printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
-		       current->comm, nblocks,
-		       journal->j_max_transaction_buffers);
+		       current->comm, blocks,
+		       journal->j_max_transaction_buffers / 2);
 		return -ENOSPC;
 	}
 
+	if (handle->h_rsv_handle)
+		rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+
 alloc_transaction:
 	if (!journal->j_running_transaction) {
 		new_transaction = kmem_cache_zalloc(transaction_cache,
@@ -196,8 +312,12 @@ repeat:
 		return -EROFS;
 	}
 
-	/* Wait on the journal's transaction barrier if necessary */
-	if (journal->j_barrier_count) {
+	/*
+	 * Wait on the journal's transaction barrier if necessary. Specifically
+	 * we allow reserved handles to proceed because otherwise commit could
+	 * deadlock on page writeback not being able to complete.
+	 */
+	if (!handle->h_reserved && journal->j_barrier_count) {
 		read_unlock(&journal->j_state_lock);
 		wait_event(journal->j_wait_transaction_locked,
 				journal->j_barrier_count == 0);
@@ -210,7 +330,7 @@ repeat:
 			goto alloc_transaction;
 		write_lock(&journal->j_state_lock);
 		if (!journal->j_running_transaction &&
-		    !journal->j_barrier_count) {
+		    (handle->h_reserved || !journal->j_barrier_count)) {
 			jbd2_get_transaction(journal, new_transaction);
 			new_transaction = NULL;
 		}
@@ -220,85 +340,18 @@ repeat:
 
 	transaction = journal->j_running_transaction;
 
-	/*
-	 * If the current transaction is locked down for commit, wait for the
-	 * lock to be released.
-	 */
-	if (transaction->t_state == T_LOCKED) {
-		DEFINE_WAIT(wait);
-
-		prepare_to_wait(&journal->j_wait_transaction_locked,
-					&wait, TASK_UNINTERRUPTIBLE);
-		read_unlock(&journal->j_state_lock);
-		schedule();
-		finish_wait(&journal->j_wait_transaction_locked, &wait);
-		goto repeat;
-	}
-
-	/*
-	 * If there is not enough space left in the log to write all potential
-	 * buffers requested by this operation, we need to stall pending a log
-	 * checkpoint to free some more log space.
-	 */
-	needed = atomic_add_return(nblocks,
-				   &transaction->t_outstanding_credits);
-
-	if (needed > journal->j_max_transaction_buffers) {
+	if (!handle->h_reserved) {
+		/* We may have dropped j_state_lock - restart in that case */
+		if (add_transaction_credits(journal, blocks, rsv_blocks))
+			goto repeat;
+	} else {
 		/*
-		 * If the current transaction is already too large, then start
-		 * to commit it: we can then go back and attach this handle to
-		 * a new transaction.
+		 * We have handle reserved so we are allowed to join T_LOCKED
+		 * transaction and we don't have to check for transaction size
+		 * and journal space.
 		 */
-		DEFINE_WAIT(wait);
-
-		jbd_debug(2, "Handle %p starting new commit...\n", handle);
-		atomic_sub(nblocks, &transaction->t_outstanding_credits);
-		prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
-				TASK_UNINTERRUPTIBLE);
-		tid = transaction->t_tid;
-		need_to_start = !tid_geq(journal->j_commit_request, tid);
-		read_unlock(&journal->j_state_lock);
-		if (need_to_start)
-			jbd2_log_start_commit(journal, tid);
-		schedule();
-		finish_wait(&journal->j_wait_transaction_locked, &wait);
-		goto repeat;
-	}
-
-	/*
-	 * The commit code assumes that it can get enough log space
-	 * without forcing a checkpoint.  This is *critical* for
-	 * correctness: a checkpoint of a buffer which is also
-	 * associated with a committing transaction creates a deadlock,
-	 * so commit simply cannot force through checkpoints.
-	 *
-	 * We must therefore ensure the necessary space in the journal
-	 * *before* starting to dirty potentially checkpointed buffers
-	 * in the new transaction.
-	 *
-	 * The worst part is, any transaction currently committing can
-	 * reduce the free space arbitrarily.  Be careful to account for
-	 * those buffers when checkpointing.
-	 */
-
-	/*
-	 * @@@ AKPM: This seems rather over-defensive.  We're giving commit
-	 * a _lot_ of headroom: 1/4 of the journal plus the size of
-	 * the committing transaction.  Really, we only need to give it
-	 * committing_transaction->t_outstanding_credits plus "enough" for
-	 * the log control blocks.
-	 * Also, this test is inconsistent with the matching one in
-	 * jbd2_journal_extend().
-	 */
-	if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
-		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
-		atomic_sub(nblocks, &transaction->t_outstanding_credits);
-		read_unlock(&journal->j_state_lock);
-		write_lock(&journal->j_state_lock);
-		if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
-			__jbd2_log_wait_for_space(journal);
-		write_unlock(&journal->j_state_lock);
-		goto repeat;
+		sub_reserved_credits(journal, blocks);
+		handle->h_reserved = 0;
 	}
 
 	/* OK, account for the buffers that this operation expects to
@@ -306,13 +359,16 @@ repeat:
 	 */
 	update_t_max_wait(transaction, ts);
 	handle->h_transaction = transaction;
+	handle->h_requested_credits = blocks;
+	handle->h_start_jiffies = jiffies;
 	atomic_inc(&transaction->t_updates);
 	atomic_inc(&transaction->t_handle_count);
-	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
-		  handle, nblocks,
+	jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+		  handle, blocks,
 		  atomic_read(&transaction->t_outstanding_credits),
-		  __jbd2_log_space_left(journal));
+		  jbd2_log_space_left(journal));
 	read_unlock(&journal->j_state_lock);
+	current->journal_info = handle;
 
 	lock_map_acquire(&handle->h_lockdep_map);
 	jbd2_journal_free_transaction(new_transaction);
@@ -327,7 +383,6 @@ static handle_t *new_handle(int nblocks)
 	handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
 	if (!handle)
 		return NULL;
-	memset(handle, 0, sizeof(*handle));
 	handle->h_buffer_credits = nblocks;
 	handle->h_ref = 1;
 
@@ -344,15 +399,21 @@ static handle_t *new_handle(int nblocks)
  *
  * We make sure that the transaction can guarantee at least nblocks of
  * modified buffers in the log.  We block until the log can guarantee
- * that much space.
- *
- * This function is visible to journal users (like ext3fs), so is not
- * called with the journal already locked.
+ * that much space. Additionally, if rsv_blocks > 0, we also create another
+ * handle with rsv_blocks reserved blocks in the journal. This handle is
+ * is stored in h_rsv_handle. It is not attached to any particular transaction
+ * and thus doesn't block transaction commit. If the caller uses this reserved
+ * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
+ * on the parent handle will dispose the reserved one. Reserved handle has to
+ * be converted to a normal handle using jbd2_journal_start_reserved() before
+ * it can be used.
  *
  * Return a pointer to a newly allocated handle, or an ERR_PTR() value
  * on failure.
  */
-handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
+			      gfp_t gfp_mask, unsigned int type,
+			      unsigned int line_no)
 {
 	handle_t *handle = journal_current_handle();
 	int err;
@@ -369,15 +430,31 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask)
 	handle = new_handle(nblocks);
 	if (!handle)
 		return ERR_PTR(-ENOMEM);
+	if (rsv_blocks) {
+		handle_t *rsv_handle;
 
-	current->journal_info = handle;
+		rsv_handle = new_handle(rsv_blocks);
+		if (!rsv_handle) {
+			jbd2_free_handle(handle);
+			return ERR_PTR(-ENOMEM);
+		}
+		rsv_handle->h_reserved = 1;
+		rsv_handle->h_journal = journal;
+		handle->h_rsv_handle = rsv_handle;
+	}
 
 	err = start_this_handle(journal, handle, gfp_mask);
 	if (err < 0) {
+		if (handle->h_rsv_handle)
+			jbd2_free_handle(handle->h_rsv_handle);
 		jbd2_free_handle(handle);
-		current->journal_info = NULL;
-		handle = ERR_PTR(err);
+		return ERR_PTR(err);
 	}
+	handle->h_type = type;
+	handle->h_line_no = line_no;
+	trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
+				handle->h_transaction->t_tid, type,
+				line_no, nblocks);
 	return handle;
 }
 EXPORT_SYMBOL(jbd2__journal_start);
@@ -385,10 +462,65 @@ EXPORT_SYMBOL(jbd2__journal_start);
 
 handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 {
-	return jbd2__journal_start(journal, nblocks, GFP_NOFS);
+	return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
 }
 EXPORT_SYMBOL(jbd2_journal_start);
 
+void jbd2_journal_free_reserved(handle_t *handle)
+{
+	journal_t *journal = handle->h_journal;
+
+	WARN_ON(!handle->h_reserved);
+	sub_reserved_credits(journal, handle->h_buffer_credits);
+	jbd2_free_handle(handle);
+}
+EXPORT_SYMBOL(jbd2_journal_free_reserved);
+
+/**
+ * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle
+ * @handle: handle to start
+ *
+ * Start handle that has been previously reserved with jbd2_journal_reserve().
+ * This attaches @handle to the running transaction (or creates one if there's
+ * not transaction running). Unlike jbd2_journal_start() this function cannot
+ * block on journal commit, checkpointing, or similar stuff. It can block on
+ * memory allocation or frozen journal though.
+ *
+ * Return 0 on success, non-zero on error - handle is freed in that case.
+ */
+int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
+				unsigned int line_no)
+{
+	journal_t *journal = handle->h_journal;
+	int ret = -EIO;
+
+	if (WARN_ON(!handle->h_reserved)) {
+		/* Someone passed in normal handle? Just stop it. */
+		jbd2_journal_stop(handle);
+		return ret;
+	}
+	/*
+	 * Usefulness of mixing of reserved and unreserved handles is
+	 * questionable. So far nobody seems to need it so just error out.
+	 */
+	if (WARN_ON(current->journal_info)) {
+		jbd2_journal_free_reserved(handle);
+		return ret;
+	}
+
+	handle->h_journal = NULL;
+	/*
+	 * GFP_NOFS is here because callers are likely from writeback or
+	 * similarly constrained call sites
+	 */
+	ret = start_this_handle(journal, handle, GFP_NOFS);
+	if (ret < 0)
+		jbd2_journal_free_reserved(handle);
+	handle->h_type = type;
+	handle->h_line_no = line_no;
+	return ret;
+}
+EXPORT_SYMBOL(jbd2_journal_start_reserved);
 
 /**
  * int jbd2_journal_extend() - extend buffer credits.
@@ -413,42 +545,53 @@ EXPORT_SYMBOL(jbd2_journal_start);
 int jbd2_journal_extend(handle_t *handle, int nblocks)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 	int result;
 	int wanted;
 
-	result = -EIO;
+	WARN_ON(!transaction);
 	if (is_handle_aborted(handle))
-		goto out;
+		return -EROFS;
+	journal = transaction->t_journal;
 
 	result = 1;
 
 	read_lock(&journal->j_state_lock);
 
 	/* Don't extend a locked-down transaction! */
-	if (handle->h_transaction->t_state != T_RUNNING) {
+	if (transaction->t_state != T_RUNNING) {
 		jbd_debug(3, "denied handle %p %d blocks: "
 			  "transaction not running\n", handle, nblocks);
 		goto error_out;
 	}
 
 	spin_lock(&transaction->t_handle_lock);
-	wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
+	wanted = atomic_add_return(nblocks,
+				   &transaction->t_outstanding_credits);
 
 	if (wanted > journal->j_max_transaction_buffers) {
 		jbd_debug(3, "denied handle %p %d blocks: "
 			  "transaction too large\n", handle, nblocks);
+		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		goto unlock;
 	}
 
-	if (wanted > __jbd2_log_space_left(journal)) {
+	if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
+	    jbd2_log_space_left(journal)) {
 		jbd_debug(3, "denied handle %p %d blocks: "
 			  "insufficient log space\n", handle, nblocks);
+		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		goto unlock;
 	}
 
+	trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
+				 transaction->t_tid,
+				 handle->h_type, handle->h_line_no,
+				 handle->h_buffer_credits,
+				 nblocks);
+
 	handle->h_buffer_credits += nblocks;
-	atomic_add(nblocks, &transaction->t_outstanding_credits);
+	handle->h_requested_credits += nblocks;
 	result = 0;
 
 	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -456,7 +599,6 @@ unlock:
 	spin_unlock(&transaction->t_handle_lock);
 error_out:
 	read_unlock(&journal->j_state_lock);
-out:
 	return result;
 }
 
@@ -473,19 +615,22 @@ out:
  * to a running handle, a call to jbd2_journal_restart will commit the
  * handle's transaction so far and reattach the handle to a new
  * transaction capabable of guaranteeing the requested number of
- * credits.
+ * credits. We preserve reserved handle if there's any attached to the
+ * passed in handle.
  */
 int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 	tid_t		tid;
 	int		need_to_start, ret;
 
+	WARN_ON(!transaction);
 	/* If we've had an abort of any type, don't even think about
 	 * actually doing the restart! */
 	if (is_handle_aborted(handle))
 		return 0;
+	journal = transaction->t_journal;
 
 	/*
 	 * First unlink the handle from its current transaction, and start the
@@ -498,12 +643,18 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
 	spin_lock(&transaction->t_handle_lock);
 	atomic_sub(handle->h_buffer_credits,
 		   &transaction->t_outstanding_credits);
+	if (handle->h_rsv_handle) {
+		sub_reserved_credits(journal,
+				     handle->h_rsv_handle->h_buffer_credits);
+	}
 	if (atomic_dec_and_test(&transaction->t_updates))
 		wake_up(&journal->j_wait_updates);
+	tid = transaction->t_tid;
 	spin_unlock(&transaction->t_handle_lock);
+	handle->h_transaction = NULL;
+	current->journal_info = NULL;
 
 	jbd_debug(2, "restarting handle %p\n", handle);
-	tid = transaction->t_tid;
 	need_to_start = !tid_geq(journal->j_commit_request, tid);
 	read_unlock(&journal->j_state_lock);
 	if (need_to_start)
@@ -540,6 +691,14 @@ void jbd2_journal_lock_updates(journal_t *journal)
 	write_lock(&journal->j_state_lock);
 	++journal->j_barrier_count;
 
+	/* Wait until there are no reserved handles */
+	if (atomic_read(&journal->j_reserved_credits)) {
+		write_unlock(&journal->j_state_lock);
+		wait_event(journal->j_wait_reserved,
+			   atomic_read(&journal->j_reserved_credits) == 0);
+		write_lock(&journal->j_state_lock);
+	}
+
 	/* Wait until there are no running updates */
 	while (1) {
 		transaction_t *transaction = journal->j_running_transaction;
@@ -602,6 +761,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
 	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }
 
+static int sleep_on_shadow_bh(void *word)
+{
+	io_schedule();
+	return 0;
+}
+
 /*
  * If the buffer is already part of the current transaction, then there
  * is nothing we need to do.  If it is already part of a prior
@@ -617,16 +782,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 			int force_copy)
 {
 	struct buffer_head *bh;
-	transaction_t *transaction;
+	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal;
 	int error;
 	char *frozen_buffer = NULL;
 	int need_copy = 0;
+	unsigned long start_lock, time_lock;
 
+	WARN_ON(!transaction);
 	if (is_handle_aborted(handle))
 		return -EROFS;
-
-	transaction = handle->h_transaction;
 	journal = transaction->t_journal;
 
 	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -637,9 +802,16 @@ repeat:
 
 	/* @@@ Need to check for errors here at some point. */
 
+ 	start_lock = jiffies;
 	lock_buffer(bh);
 	jbd_lock_bh_state(bh);
 
+	/* If it takes too long to lock the buffer, trace it */
+	time_lock = jbd2_time_diff(start_lock, jiffies);
+	if (time_lock > HZ/10)
+		trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
+			jiffies_to_msecs(time_lock));
+
 	/* We now hold the buffer lock so it is safe to query the buffer
 	 * state.  Is the buffer dirty?
 	 *
@@ -729,41 +901,29 @@ repeat:
 		 * journaled.  If the primary copy is already going to
 		 * disk then we cannot do copy-out here. */
 
-		if (jh->b_jlist == BJ_Shadow) {
-			DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
-			wait_queue_head_t *wqh;
-
-			wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
-
+		if (buffer_shadow(bh)) {
 			JBUFFER_TRACE(jh, "on shadow: sleep");
 			jbd_unlock_bh_state(bh);
-			/* commit wakes up all shadow buffers after IO */
-			for ( ; ; ) {
-				prepare_to_wait(wqh, &wait.wait,
-						TASK_UNINTERRUPTIBLE);
-				if (jh->b_jlist != BJ_Shadow)
-					break;
-				schedule();
-			}
-			finish_wait(wqh, &wait.wait);
+			wait_on_bit(&bh->b_state, BH_Shadow,
+				    sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
 			goto repeat;
 		}
 
-		/* Only do the copy if the currently-owning transaction
-		 * still needs it.  If it is on the Forget list, the
-		 * committing transaction is past that stage.  The
-		 * buffer had better remain locked during the kmalloc,
-		 * but that should be true --- we hold the journal lock
-		 * still and the buffer is already on the BUF_JOURNAL
-		 * list so won't be flushed.
+		/*
+		 * Only do the copy if the currently-owning transaction still
+		 * needs it. If buffer isn't on BJ_Metadata list, the
+		 * committing transaction is past that stage (here we use the
+		 * fact that BH_Shadow is set under bh_state lock together with
+		 * refiling to BJ_Shadow list and at this point we know the
+		 * buffer doesn't have BH_Shadow set).
 		 *
 		 * Subtle point, though: if this is a get_undo_access,
 		 * then we will be relying on the frozen_data to contain
 		 * the new value of the committed_data record after the
 		 * transaction, so we HAVE to force the frozen_data copy
-		 * in that case. */
-
-		if (jh->b_jlist != BJ_Forget || force_copy) {
+		 * in that case.
+		 */
+		if (jh->b_jlist == BJ_Metadata || force_copy) {
 			JBUFFER_TRACE(jh, "generate frozen data");
 			if (!frozen_buffer) {
 				JBUFFER_TRACE(jh, "allocate memory for buffer");
@@ -890,14 +1050,16 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
 int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
 	int err;
 
 	jbd_debug(5, "journal_head %p\n", jh);
+	WARN_ON(!transaction);
 	err = -EROFS;
 	if (is_handle_aborted(handle))
 		goto out;
+	journal = transaction->t_journal;
 	err = 0;
 
 	JBUFFER_TRACE(jh, "entry");
@@ -1047,9 +1209,12 @@ out:
 void jbd2_journal_set_triggers(struct buffer_head *bh,
 			       struct jbd2_buffer_trigger_type *type)
 {
-	struct journal_head *jh = bh2jh(bh);
+	struct journal_head *jh = jbd2_journal_grab_journal_head(bh);
 
+	if (WARN_ON(!jh))
+		return;
 	jh->b_triggers = type;
+	jbd2_journal_put_journal_head(jh);
 }
 
 void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
@@ -1100,18 +1265,21 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
 int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
-	struct journal_head *jh = bh2jh(bh);
+	journal_t *journal;
+	struct journal_head *jh;
 	int ret = 0;
 
-	jbd_debug(5, "journal_head %p\n", jh);
-	JBUFFER_TRACE(jh, "entry");
+	WARN_ON(!transaction);
 	if (is_handle_aborted(handle))
-		goto out;
-	if (!buffer_jbd(bh)) {
+		return -EROFS;
+	journal = transaction->t_journal;
+	jh = jbd2_journal_grab_journal_head(bh);
+	if (!jh) {
 		ret = -EUCLEAN;
 		goto out;
 	}
+	jbd_debug(5, "journal_head %p\n", jh);
+	JBUFFER_TRACE(jh, "entry");
 
 	jbd_lock_bh_state(bh);
 
@@ -1198,10 +1366,11 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 
 	JBUFFER_TRACE(jh, "file as BJ_Metadata");
 	spin_lock(&journal->j_list_lock);
-	__jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+	__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
 	spin_unlock(&journal->j_list_lock);
 out_unlock_bh:
 	jbd_unlock_bh_state(bh);
+	jbd2_journal_put_journal_head(jh);
 out:
 	JBUFFER_TRACE(jh, "exit");
 	WARN_ON(ret);	/* All errors are bugs, so dump the stack */
@@ -1228,12 +1397,17 @@ out:
 int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 	struct journal_head *jh;
 	int drop_reserve = 0;
 	int err = 0;
 	int was_modified = 0;
 
+	WARN_ON(!transaction);
+	if (is_handle_aborted(handle))
+		return -EROFS;
+	journal = transaction->t_journal;
+
 	BUFFER_TRACE(bh, "entry");
 
 	jbd_lock_bh_state(bh);
@@ -1260,7 +1434,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
 	 */
 	jh->b_modified = 0;
 
-	if (jh->b_transaction == handle->h_transaction) {
+	if (jh->b_transaction == transaction) {
 		J_ASSERT_JH(jh, !jh->b_frozen_data);
 
 		/* If we are forgetting a buffer which is already part
@@ -1355,19 +1529,21 @@ drop:
 int jbd2_journal_stop(handle_t *handle)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
-	int err, wait_for_commit = 0;
+	journal_t *journal;
+	int err = 0, wait_for_commit = 0;
 	tid_t tid;
 	pid_t pid;
 
+	if (!transaction)
+		goto free_and_exit;
+	journal = transaction->t_journal;
+
 	J_ASSERT(journal_current_handle() == handle);
 
 	if (is_handle_aborted(handle))
 		err = -EIO;
-	else {
+	else
 		J_ASSERT(atomic_read(&transaction->t_updates) > 0);
-		err = 0;
-	}
 
 	if (--handle->h_ref > 0) {
 		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
@@ -1376,6 +1552,13 @@ int jbd2_journal_stop(handle_t *handle)
 	}
 
 	jbd_debug(4, "Handle %p going down\n", handle);
+	trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
+				transaction->t_tid,
+				handle->h_type, handle->h_line_no,
+				jiffies - handle->h_start_jiffies,
+				handle->h_sync, handle->h_requested_credits,
+				(handle->h_requested_credits -
+				 handle->h_buffer_credits));
 
 	/*
 	 * Implement synchronous transaction batching.  If the handle
@@ -1481,33 +1664,13 @@ int jbd2_journal_stop(handle_t *handle)
 
 	lock_map_release(&handle->h_lockdep_map);
 
+	if (handle->h_rsv_handle)
+		jbd2_journal_free_reserved(handle->h_rsv_handle);
+free_and_exit:
 	jbd2_free_handle(handle);
 	return err;
 }
 
-/**
- * int jbd2_journal_force_commit() - force any uncommitted transactions
- * @journal: journal to force
- *
- * For synchronous operations: force any uncommitted transactions
- * to disk.  May seem kludgy, but it reuses all the handle batching
- * code in a very simple manner.
- */
-int jbd2_journal_force_commit(journal_t *journal)
-{
-	handle_t *handle;
-	int ret;
-
-	handle = jbd2_journal_start(journal, 1);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-	} else {
-		handle->h_sync = 1;
-		ret = jbd2_journal_stop(handle);
-	}
-	return ret;
-}
-
 /*
  *
  * List management code snippets: various functions for manipulating the
@@ -1564,10 +1727,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
  * Remove a buffer from the appropriate transaction list.
  *
  * Note that this function can *change* the value of
- * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
- * t_log_list or t_reserved_list.  If the caller is holding onto a copy of one
- * of these pointers, it could go bad.  Generally the caller needs to re-read
- * the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
+ * t_reserved_list.  If the caller is holding onto a copy of one of these
+ * pointers, it could go bad.  Generally the caller needs to re-read the
+ * pointer from the transaction_t.
  *
  * Called under j_list_lock.
  */
@@ -1597,15 +1760,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
 	case BJ_Forget:
 		list = &transaction->t_forget;
 		break;
-	case BJ_IO:
-		list = &transaction->t_iobuf_list;
-		break;
 	case BJ_Shadow:
 		list = &transaction->t_shadow_list;
 		break;
-	case BJ_LogCtl:
-		list = &transaction->t_log_list;
-		break;
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
@@ -1997,18 +2154,23 @@ zap_buffer_unlocked:
  * void jbd2_journal_invalidatepage()
  * @journal: journal to use for flush...
  * @page:    page to flush
- * @offset:  length of page to invalidate.
+ * @offset:  start of the range to invalidate
+ * @length:  length of the range to invalidate
  *
- * Reap page buffers containing data after offset in page. Can return -EBUSY
- * if buffers are part of the committing transaction and the page is straddling
- * i_size. Caller then has to wait for current commit and try again.
+ * Reap page buffers containing data after in the specified range in page.
+ * Can return -EBUSY if buffers are part of the committing transaction and
+ * the page is straddling i_size. Caller then has to wait for current commit
+ * and try again.
  */
 int jbd2_journal_invalidatepage(journal_t *journal,
 				struct page *page,
-				unsigned long offset)
+				unsigned int offset,
+				unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
+	unsigned int stop = offset + length;
 	unsigned int curr_off = 0;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	int may_free = 1;
 	int ret = 0;
 
@@ -2017,6 +2179,8 @@ int jbd2_journal_invalidatepage(journal_t *journal,
 	if (!page_has_buffers(page))
 		return 0;
 
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	/* We will potentially be playing with lists other than just the
 	 * data lists (especially for journaled data mode), so be
 	 * cautious in our locking. */
@@ -2026,10 +2190,13 @@ int jbd2_journal_invalidatepage(journal_t *journal,
 		unsigned int next_off = curr_off + bh->b_size;
 		next = bh->b_this_page;
 
+		if (next_off > stop)
+			return 0;
+
 		if (offset <= curr_off) {
 			/* This block is wholly outside the truncation point */
 			lock_buffer(bh);
-			ret = journal_unmap_buffer(journal, bh, offset > 0);
+			ret = journal_unmap_buffer(journal, bh, partial_page);
 			unlock_buffer(bh);
 			if (ret < 0)
 				return ret;
@@ -2040,7 +2207,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
 
 	} while (bh != head);
 
-	if (!offset) {
+	if (!partial_page) {
 		if (may_free && try_to_free_buffers(page))
 			J_ASSERT(!page_has_buffers(page));
 	}
@@ -2101,15 +2268,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
 	case BJ_Forget:
 		list = &transaction->t_forget;
 		break;
-	case BJ_IO:
-		list = &transaction->t_iobuf_list;
-		break;
 	case BJ_Shadow:
 		list = &transaction->t_shadow_list;
 		break;
-	case BJ_LogCtl:
-		list = &transaction->t_log_list;
-		break;
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
@@ -2211,10 +2372,12 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
 int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
 {
 	transaction_t *transaction = handle->h_transaction;
-	journal_t *journal = transaction->t_journal;
+	journal_t *journal;
 
+	WARN_ON(!transaction);
 	if (is_handle_aborted(handle))
-		return -EIO;
+		return -EROFS;
+	journal = transaction->t_journal;
 
 	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
 			transaction->t_tid);
diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig
index 6ae169cd8fa..d8bb6c411e9 100644
--- a/fs/jffs2/Kconfig
+++ b/fs/jffs2/Kconfig
@@ -50,8 +50,8 @@ config JFFS2_FS_WBUF_VERIFY
 	  write-buffer, and check for errors.
 
 config JFFS2_SUMMARY
-	bool "JFFS2 summary support (EXPERIMENTAL)"
-	depends on JFFS2_FS && EXPERIMENTAL
+	bool "JFFS2 summary support"
+	depends on JFFS2_FS
 	default n
 	help
 	  This feature makes it possible to use summary information
@@ -63,8 +63,8 @@ config JFFS2_SUMMARY
 	  If unsure, say 'N'.
 
 config JFFS2_FS_XATTR
-	bool "JFFS2 XATTR support (EXPERIMENTAL)"
-	depends on JFFS2_FS && EXPERIMENTAL
+	bool "JFFS2 XATTR support"
+	depends on JFFS2_FS
 	default n
 	help
 	  Extended attributes are name:value pairs associated with inodes by
@@ -173,7 +173,7 @@ config JFFS2_CMODE_PRIORITY
 	  successful one.
 
 config JFFS2_CMODE_SIZE
-	bool "size (EXPERIMENTAL)"
+	bool "size"
 	help
 	  Tries all compressors and chooses the one which has the smallest
 	  result.
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index ad7774d3209..e3aac222472 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -22,7 +22,7 @@
 #include <linux/time.h>
 #include "nodelist.h"
 
-static int jffs2_readdir (struct file *, void *, filldir_t);
+static int jffs2_readdir (struct file *, struct dir_context *);
 
 static int jffs2_create (struct inode *,struct dentry *,umode_t,
 			 bool);
@@ -40,7 +40,7 @@ static int jffs2_rename (struct inode *, struct dentry *,
 const struct file_operations jffs2_dir_operations =
 {
 	.read =		generic_read_dir,
-	.readdir =	jffs2_readdir,
+	.iterate =	jffs2_readdir,
 	.unlocked_ioctl=jffs2_ioctl,
 	.fsync =	jffs2_fsync,
 	.llseek =	generic_file_llseek,
@@ -114,60 +114,40 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
 /***********************************************************************/
 
 
-static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int jffs2_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct jffs2_inode_info *f;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	struct jffs2_full_dirent *fd;
-	unsigned long offset, curofs;
+	unsigned long curofs = 1;
 
-	jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n",
-		  filp->f_path.dentry->d_inode->i_ino);
+	jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", inode->i_ino);
 
-	f = JFFS2_INODE_INFO(inode);
-
-	offset = filp->f_pos;
-
-	if (offset == 0) {
-		jffs2_dbg(1, "Dirent 0: \".\", ino #%lu\n", inode->i_ino);
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
-			goto out;
-		offset++;
-	}
-	if (offset == 1) {
-		unsigned long pino = parent_ino(filp->f_path.dentry);
-		jffs2_dbg(1, "Dirent 1: \"..\", ino #%lu\n", pino);
-		if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0)
-			goto out;
-		offset++;
-	}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
 
-	curofs=1;
 	mutex_lock(&f->sem);
 	for (fd = f->dents; fd; fd = fd->next) {
-
 		curofs++;
-		/* First loop: curofs = 2; offset = 2 */
-		if (curofs < offset) {
+		/* First loop: curofs = 2; pos = 2 */
+		if (curofs < ctx->pos) {
 			jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n",
-				  fd->name, fd->ino, fd->type, curofs, offset);
+				  fd->name, fd->ino, fd->type, curofs, (unsigned long)ctx->pos);
 			continue;
 		}
 		if (!fd->ino) {
 			jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n",
 				  fd->name);
-			offset++;
+			ctx->pos++;
 			continue;
 		}
 		jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n",
-			  offset, fd->name, fd->ino, fd->type);
-		if (filldir(dirent, fd->name, strlen(fd->name), offset, fd->ino, fd->type) < 0)
+			  (unsigned long)ctx->pos, fd->name, fd->ino, fd->type);
+		if (!dir_emit(ctx, fd->name, strlen(fd->name), fd->ino, fd->type))
 			break;
-		offset++;
+		ctx->pos++;
 	}
 	mutex_unlock(&f->sem);
- out:
-	filp->f_pos = offset;
 	return 0;
 }
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index d3d8799e218..0defb1cc2a3 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -356,6 +356,7 @@ static struct file_system_type jffs2_fs_type = {
 	.mount =	jffs2_mount,
 	.kill_sb =	jffs2_kill_sb,
 };
+MODULE_ALIAS_FS("jffs2");
 
 static int __init init_jffs2_fs(void)
 {
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b7dc47ba675..730f24e282a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -23,6 +23,7 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/writeback.h>
+#include <linux/aio.h>
 #include "jfs_incore.h"
 #include "jfs_inode.h"
 #include "jfs_filsys.h"
@@ -125,7 +126,7 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int wait = wbc->sync_mode == WB_SYNC_ALL;
 
-	if (test_cflag(COMMIT_Nolink, inode))
+	if (inode->i_nlink == 0)
 		return 0;
 	/*
 	 * If COMMIT_DIRTY is not set, the inode isn't really dirty.
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index bc555ff417e..93a1232894f 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -58,7 +58,7 @@ static long jfs_map_ext2(unsigned long flags, int from)
 
 long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct jfs_inode_info *jfs_inode = JFS_IP(inode);
 	unsigned int flags;
 
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 9197a1b0d02..9f4ed13d9f1 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -3002,9 +3002,9 @@ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
  * return: offset = (pn, index) of start entry
  *	of next jfs_readdir()/dtRead()
  */
-int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+int jfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *ip = filp->f_path.dentry->d_inode;
+	struct inode *ip = file_inode(file);
 	struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
 	int rc = 0;
 	loff_t dtpos;	/* legacy OS/2 style position */
@@ -3033,7 +3033,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int overflow, fix_page, page_fixed = 0;
 	static int unique_pos = 2;	/* If we can't fix broken index */
 
-	if (filp->f_pos == DIREND)
+	if (ctx->pos == DIREND)
 		return 0;
 
 	if (DO_INDEX(ip)) {
@@ -3045,7 +3045,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		 */
 		do_index = 1;
 
-		dir_index = (u32) filp->f_pos;
+		dir_index = (u32) ctx->pos;
 
 		if (dir_index > 1) {
 			struct dir_table_slot dirtab_slot;
@@ -3053,25 +3053,25 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			if (dtEmpty(ip) ||
 			    (dir_index >= JFS_IP(ip)->next_index)) {
 				/* Stale position.  Directory has shrunk */
-				filp->f_pos = DIREND;
+				ctx->pos = DIREND;
 				return 0;
 			}
 		      repeat:
 			rc = read_index(ip, dir_index, &dirtab_slot);
 			if (rc) {
-				filp->f_pos = DIREND;
+				ctx->pos = DIREND;
 				return rc;
 			}
 			if (dirtab_slot.flag == DIR_INDEX_FREE) {
 				if (loop_count++ > JFS_IP(ip)->next_index) {
 					jfs_err("jfs_readdir detected "
 						   "infinite loop!");
-					filp->f_pos = DIREND;
+					ctx->pos = DIREND;
 					return 0;
 				}
 				dir_index = le32_to_cpu(dirtab_slot.addr2);
 				if (dir_index == -1) {
-					filp->f_pos = DIREND;
+					ctx->pos = DIREND;
 					return 0;
 				}
 				goto repeat;
@@ -3080,13 +3080,13 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			index = dirtab_slot.slot;
 			DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
 			if (rc) {
-				filp->f_pos = DIREND;
+				ctx->pos = DIREND;
 				return 0;
 			}
 			if (p->header.flag & BT_INTERNAL) {
 				jfs_err("jfs_readdir: bad index table");
 				DT_PUTPAGE(mp);
-				filp->f_pos = -1;
+				ctx->pos = -1;
 				return 0;
 			}
 		} else {
@@ -3094,23 +3094,22 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				/*
 				 * self "."
 				 */
-				filp->f_pos = 0;
-				if (filldir(dirent, ".", 1, 0, ip->i_ino,
-					    DT_DIR))
+				ctx->pos = 0;
+				if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
 					return 0;
 			}
 			/*
 			 * parent ".."
 			 */
-			filp->f_pos = 1;
-			if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR))
+			ctx->pos = 1;
+			if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
 				return 0;
 
 			/*
 			 * Find first entry of left-most leaf
 			 */
 			if (dtEmpty(ip)) {
-				filp->f_pos = DIREND;
+				ctx->pos = DIREND;
 				return 0;
 			}
 
@@ -3128,23 +3127,19 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		 * pn > 0:		Real entries, pn=1 -> leftmost page
 		 * pn = index = -1:	No more entries
 		 */
-		dtpos = filp->f_pos;
+		dtpos = ctx->pos;
 		if (dtpos == 0) {
 			/* build "." entry */
-
-			if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino,
-				    DT_DIR))
+			if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
 				return 0;
 			dtoffset->index = 1;
-			filp->f_pos = dtpos;
+			ctx->pos = dtpos;
 		}
 
 		if (dtoffset->pn == 0) {
 			if (dtoffset->index == 1) {
 				/* build ".." entry */
-
-				if (filldir(dirent, "..", 2, filp->f_pos,
-					    PARENT(ip), DT_DIR))
+				if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
 					return 0;
 			} else {
 				jfs_err("jfs_readdir called with "
@@ -3152,18 +3147,18 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			}
 			dtoffset->pn = 1;
 			dtoffset->index = 0;
-			filp->f_pos = dtpos;
+			ctx->pos = dtpos;
 		}
 
 		if (dtEmpty(ip)) {
-			filp->f_pos = DIREND;
+			ctx->pos = DIREND;
 			return 0;
 		}
 
-		if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) {
+		if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) {
 			jfs_err("jfs_readdir: unexpected rc = %d "
 				"from dtReadNext", rc);
-			filp->f_pos = DIREND;
+			ctx->pos = DIREND;
 			return 0;
 		}
 		/* get start leaf page and index */
@@ -3171,7 +3166,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 		/* offset beyond directory eof ? */
 		if (bn < 0) {
-			filp->f_pos = DIREND;
+			ctx->pos = DIREND;
 			return 0;
 		}
 	}
@@ -3180,7 +3175,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (dirent_buf == 0) {
 		DT_PUTPAGE(mp);
 		jfs_warn("jfs_readdir: __get_free_page failed!");
-		filp->f_pos = DIREND;
+		ctx->pos = DIREND;
 		return -ENOMEM;
 	}
 
@@ -3295,9 +3290,9 @@ skip_one:
 
 		jfs_dirent = (struct jfs_dirent *) dirent_buf;
 		while (jfs_dirents--) {
-			filp->f_pos = jfs_dirent->position;
-			if (filldir(dirent, jfs_dirent->name,
-				    jfs_dirent->name_len, filp->f_pos,
+			ctx->pos = jfs_dirent->position;
+			if (!dir_emit(ctx, jfs_dirent->name,
+				    jfs_dirent->name_len,
 				    jfs_dirent->ino, DT_UNKNOWN))
 				goto out;
 			jfs_dirent = next_jfs_dirent(jfs_dirent);
@@ -3309,7 +3304,7 @@ skip_one:
 		}
 
 		if (!overflow && (bn == 0)) {
-			filp->f_pos = DIREND;
+			ctx->pos = DIREND;
 			break;
 		}
 
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index 2545bb31723..fd4169e6e69 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -265,5 +265,5 @@ extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key,
 extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key,
 		    ino_t * orig_ino, ino_t new_ino, int flag);
 
-extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir);
+extern int jfs_readdir(struct file *file, struct dir_context *ctx);
 #endif				/* !_H_JFS_DTREE */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 6ba4006e011..f7e042b63dd 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1493,7 +1493,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
 		/* mask any prior bits for the starting words of the
 		 * summary map.
 		 */
-		mask = ONES << (EXTSPERSUM - bitno);
+		mask = (bitno == 0) ? 0 : (ONES << (EXTSPERSUM - bitno));
 		inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask;
 		extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask;
 
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 2eb952c41a6..360d27c4888 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1058,7 +1058,8 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
  */
 void jfs_syncpt(struct jfs_log *log, int hard_sync)
 {	LOG_LOCK(log);
-	lmLogSync(log, hard_sync);
+	if (!test_bit(log_QUIESCE, &log->flag))
+		lmLogSync(log, hard_sync);
 	LOG_UNLOCK(log);
 }
 
@@ -2004,12 +2005,17 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
 	bio->bi_io_vec[0].bv_offset = bp->l_offset;
 
 	bio->bi_vcnt = 1;
-	bio->bi_idx = 0;
 	bio->bi_size = LOGPSIZE;
 
 	bio->bi_end_io = lbmIODone;
 	bio->bi_private = bp;
-	submit_bio(READ_SYNC, bio);
+	/*check if journaling to disk has been disabled*/
+	if (log->no_integrity) {
+		bio->bi_size = 0;
+		lbmIODone(bio, 0);
+	} else {
+		submit_bio(READ_SYNC, bio);
+	}
 
 	wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
 
@@ -2145,7 +2151,6 @@ static void lbmStartIO(struct lbuf * bp)
 	bio->bi_io_vec[0].bv_offset = bp->l_offset;
 
 	bio->bi_vcnt = 1;
-	bio->bi_idx = 0;
 	bio->bi_size = LOGPSIZE;
 
 	bio->bi_end_io = lbmIODone;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 6740d34cd82..9e3aaff11f8 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -571,9 +571,10 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
 	return ret;
 }
 
-static void metapage_invalidatepage(struct page *page, unsigned long offset)
+static void metapage_invalidatepage(struct page *page, unsigned int offset,
+				    unsigned int length)
 {
-	BUG_ON(offset);
+	BUG_ON(offset || length < PAGE_CACHE_SIZE);
 
 	BUG_ON(PageWriteback(page));
 
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 3b91a7ad608..8b19027291d 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1529,7 +1529,7 @@ const struct inode_operations jfs_dir_inode_operations = {
 
 const struct file_operations jfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= jfs_readdir,
+	.iterate	= jfs_readdir,
 	.fsync		= jfs_fsync,
 	.unlocked_ioctl = jfs_ioctl,
 #ifdef CONFIG_COMPAT
@@ -1538,8 +1538,7 @@ const struct file_operations jfs_dir_operations = {
 	.llseek		= generic_file_llseek,
 };
 
-static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
-		struct qstr *this)
+static int jfs_ci_hash(const struct dentry *dir, struct qstr *this)
 {
 	unsigned long hash;
 	int i;
@@ -1552,9 +1551,7 @@ static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
 	return 0;
 }
 
-static int jfs_ci_compare(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int jfs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	int i, result = 1;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 1a543be09c7..788e0a9c1fb 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -154,7 +154,7 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	/*
 	 * If we really return the number of allocated & free inodes, some
 	 * applications will fail because they won't see enough free inodes.
-	 * We'll try to calculate some guess as to how may inodes we can
+	 * We'll try to calculate some guess as to how many inodes we can
 	 * really allocate
 	 *
 	 * buf->f_files = atomic_read(&imap->im_numinos);
@@ -611,11 +611,28 @@ static int jfs_freeze(struct super_block *sb)
 {
 	struct jfs_sb_info *sbi = JFS_SBI(sb);
 	struct jfs_log *log = sbi->log;
+	int rc = 0;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		txQuiesce(sb);
-		lmLogShutdown(log);
-		updateSuper(sb, FM_CLEAN);
+		rc = lmLogShutdown(log);
+		if (rc) {
+			jfs_error(sb, "jfs_freeze: lmLogShutdown failed");
+
+			/* let operations fail rather than hang */
+			txResume(sb);
+
+			return rc;
+		}
+		rc = updateSuper(sb, FM_CLEAN);
+		if (rc) {
+			jfs_err("jfs_freeze: updateSuper failed\n");
+			/*
+			 * Don't fail here. Everything succeeded except
+			 * marking the superblock clean, so there's really
+			 * no harm in leaving it frozen for now.
+			 */
+		}
 	}
 	return 0;
 }
@@ -627,13 +644,18 @@ static int jfs_unfreeze(struct super_block *sb)
 	int rc = 0;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		updateSuper(sb, FM_MOUNT);
-		if ((rc = lmLogInit(log)))
-			jfs_err("jfs_unlock failed with return code %d", rc);
-		else
-			txResume(sb);
+		rc = updateSuper(sb, FM_MOUNT);
+		if (rc) {
+			jfs_error(sb, "jfs_unfreeze: updateSuper failed");
+			goto out;
+		}
+		rc = lmLogInit(log);
+		if (rc)
+			jfs_error(sb, "jfs_unfreeze: lmLogInit failed");
+out:
+		txResume(sb);
 	}
-	return 0;
+	return rc;
 }
 
 static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
@@ -833,6 +855,7 @@ static struct file_system_type jfs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("jfs");
 
 static void init_once(void *foo)
 {
diff --git a/fs/libfs.c b/fs/libfs.c
index 916da8c4158..c3a0837fb86 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -135,60 +135,40 @@ static inline unsigned char dt_type(struct inode *inode)
  * both impossible due to the lock on directory.
  */
 
-int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int dcache_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct dentry *cursor = filp->private_data;
+	struct dentry *dentry = file->f_path.dentry;
+	struct dentry *cursor = file->private_data;
 	struct list_head *p, *q = &cursor->d_u.d_child;
-	ino_t ino;
-	int i = filp->f_pos;
 
-	switch (i) {
-		case 0:
-			ino = dentry->d_inode->i_ino;
-			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		case 1:
-			ino = parent_ino(dentry);
-			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		default:
-			spin_lock(&dentry->d_lock);
-			if (filp->f_pos == 2)
-				list_move(q, &dentry->d_subdirs);
-
-			for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
-				struct dentry *next;
-				next = list_entry(p, struct dentry, d_u.d_child);
-				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
-				if (!simple_positive(next)) {
-					spin_unlock(&next->d_lock);
-					continue;
-				}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+	spin_lock(&dentry->d_lock);
+	if (ctx->pos == 2)
+		list_move(q, &dentry->d_subdirs);
+
+	for (p = q->next; p != &dentry->d_subdirs; p = p->next) {
+		struct dentry *next = list_entry(p, struct dentry, d_u.d_child);
+		spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+		if (!simple_positive(next)) {
+			spin_unlock(&next->d_lock);
+			continue;
+		}
 
-				spin_unlock(&next->d_lock);
-				spin_unlock(&dentry->d_lock);
-				if (filldir(dirent, next->d_name.name, 
-					    next->d_name.len, filp->f_pos, 
-					    next->d_inode->i_ino, 
-					    dt_type(next->d_inode)) < 0)
-					return 0;
-				spin_lock(&dentry->d_lock);
-				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
-				/* next is still alive */
-				list_move(q, p);
-				spin_unlock(&next->d_lock);
-				p = q;
-				filp->f_pos++;
-			}
-			spin_unlock(&dentry->d_lock);
+		spin_unlock(&next->d_lock);
+		spin_unlock(&dentry->d_lock);
+		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
+			      next->d_inode->i_ino, dt_type(next->d_inode)))
+			return 0;
+		spin_lock(&dentry->d_lock);
+		spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+		/* next is still alive */
+		list_move(q, p);
+		spin_unlock(&next->d_lock);
+		p = q;
+		ctx->pos++;
 	}
+	spin_unlock(&dentry->d_lock);
 	return 0;
 }
 
@@ -202,7 +182,7 @@ const struct file_operations simple_dir_operations = {
 	.release	= dcache_dir_close,
 	.llseek		= dcache_dir_lseek,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.fsync		= noop_fsync,
 };
 
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index ca0a0800144..01bfe766275 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -11,7 +11,7 @@
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/nfs_fs.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
 #include <linux/kthread.h>
@@ -144,6 +144,9 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
 			timeout);
 	if (ret < 0)
 		return -ERESTARTSYS;
+	/* Reset the lock status after a server reboot so we resend */
+	if (block->b_status == nlm_lck_denied_grace_period)
+		block->b_status = nlm_lck_blocked;
 	req->a_res.status = block->b_status;
 	return 0;
 }
@@ -178,7 +181,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
 			continue;
 		if (!rpc_cmp_addr(nlm_addr(block->b_host), addr))
 			continue;
-		if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
+		if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)) ,fh) != 0)
 			continue;
 		/* Alright, we found a lock. Set the return status
 		 * and wake up the caller
@@ -220,10 +223,19 @@ reclaimer(void *ptr)
 {
 	struct nlm_host	  *host = (struct nlm_host *) ptr;
 	struct nlm_wait	  *block;
+	struct nlm_rqst   *req;
 	struct file_lock *fl, *next;
 	u32 nsmstate;
 	struct net *net = host->net;
 
+	req = kmalloc(sizeof(*req), GFP_KERNEL);
+	if (!req) {
+		printk(KERN_ERR "lockd: reclaimer unable to alloc memory."
+				" Locks for %s won't be reclaimed!\n",
+				host->h_name);
+		return 0;
+	}
+
 	allow_signal(SIGKILL);
 
 	down_write(&host->h_rwsem);
@@ -253,7 +265,7 @@ restart:
 		 */
 		if (signalled())
 			continue;
-		if (nlmclnt_reclaim(host, fl) != 0)
+		if (nlmclnt_reclaim(host, fl, req) != 0)
 			continue;
 		list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
 		if (host->h_nsmstate != nsmstate) {
@@ -279,5 +291,6 @@ restart:
 	/* Release host handle after use */
 	nlmclnt_release_host(host);
 	lockd_down(net);
+	kfree(req);
 	return 0;
 }
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 54f9e6ce043..9760ecb9b60 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -127,7 +127,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
 	struct nlm_lock	*lock = &argp->lock;
 
 	nlmclnt_next_cookie(&argp->cookie);
-	memcpy(&lock->fh, NFS_FH(fl->fl_file->f_path.dentry->d_inode), sizeof(struct nfs_fh));
+	memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh));
 	lock->caller  = utsname()->nodename;
 	lock->oh.data = req->a_owner;
 	lock->oh.len  = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s",
@@ -615,17 +615,15 @@ out_unlock:
  * RECLAIM: Try to reclaim a lock
  */
 int
-nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl)
+nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl,
+		struct nlm_rqst *req)
 {
-	struct nlm_rqst reqst, *req;
 	int		status;
 
-	req = &reqst;
 	memset(req, 0, sizeof(*req));
 	locks_init_lock(&req->a_args.lock.fl);
 	locks_init_lock(&req->a_res.lock.fl);
 	req->a_host  = host;
-	req->a_flags = 0;
 
 	/* Set up the argument struct */
 	nlmclnt_setlockargs(req, fl);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 0e17090c310..969d589c848 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -13,6 +13,7 @@
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
 #include <linux/mutex.h>
@@ -32,15 +33,15 @@
 static struct hlist_head	nlm_server_hosts[NLM_HOST_NRHASH];
 static struct hlist_head	nlm_client_hosts[NLM_HOST_NRHASH];
 
-#define for_each_host(host, pos, chain, table) \
+#define for_each_host(host, chain, table) \
 	for ((chain) = (table); \
 	     (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
-		hlist_for_each_entry((host), (pos), (chain), h_hash)
+		hlist_for_each_entry((host), (chain), h_hash)
 
-#define for_each_host_safe(host, pos, next, chain, table) \
+#define for_each_host_safe(host, next, chain, table) \
 	for ((chain) = (table); \
 	     (chain) < (table) + NLM_HOST_NRHASH; ++(chain)) \
-		hlist_for_each_entry_safe((host), (pos), (next), \
+		hlist_for_each_entry_safe((host), (next), \
 						(chain), h_hash)
 
 static unsigned long		nrhosts;
@@ -225,7 +226,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
 		.net		= net,
 	};
 	struct hlist_head *chain;
-	struct hlist_node *pos;
 	struct nlm_host	*host;
 	struct nsm_handle *nsm = NULL;
 	struct lockd_net *ln = net_generic(net, lockd_net_id);
@@ -237,7 +237,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
 	mutex_lock(&nlm_host_mutex);
 
 	chain = &nlm_client_hosts[nlm_hash_address(sap)];
-	hlist_for_each_entry(host, pos, chain, h_hash) {
+	hlist_for_each_entry(host, chain, h_hash) {
 		if (host->net != net)
 			continue;
 		if (!rpc_cmp_addr(nlm_addr(host), sap))
@@ -322,7 +322,6 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
 				    const size_t hostname_len)
 {
 	struct hlist_head *chain;
-	struct hlist_node *pos;
 	struct nlm_host	*host = NULL;
 	struct nsm_handle *nsm = NULL;
 	struct sockaddr *src_sap = svc_daddr(rqstp);
@@ -350,7 +349,7 @@ struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
 		nlm_gc_hosts(net);
 
 	chain = &nlm_server_hosts[nlm_hash_address(ni.sap)];
-	hlist_for_each_entry(host, pos, chain, h_hash) {
+	hlist_for_each_entry(host, chain, h_hash) {
 		if (host->net != net)
 			continue;
 		if (!rpc_cmp_addr(nlm_addr(host), ni.sap))
@@ -515,10 +514,9 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
 {
 	struct nlm_host *host;
 	struct hlist_head *chain;
-	struct hlist_node *pos;
 
 	mutex_lock(&nlm_host_mutex);
-	for_each_host(host, pos, chain, cache) {
+	for_each_host(host, chain, cache) {
 		if (host->h_nsmhandle == nsm
 		    && host->h_nsmstate != info->state) {
 			host->h_nsmstate = info->state;
@@ -570,7 +568,6 @@ void nlm_host_rebooted(const struct nlm_reboot *info)
 static void nlm_complain_hosts(struct net *net)
 {
 	struct hlist_head *chain;
-	struct hlist_node *pos;
 	struct nlm_host	*host;
 
 	if (net) {
@@ -587,7 +584,7 @@ static void nlm_complain_hosts(struct net *net)
 		dprintk("lockd: %lu hosts left:\n", nrhosts);
 	}
 
-	for_each_host(host, pos, chain, nlm_server_hosts) {
+	for_each_host(host, chain, nlm_server_hosts) {
 		if (net && host->net != net)
 			continue;
 		dprintk("       %s (cnt %d use %d exp %ld net %p)\n",
@@ -600,14 +597,13 @@ void
 nlm_shutdown_hosts_net(struct net *net)
 {
 	struct hlist_head *chain;
-	struct hlist_node *pos;
 	struct nlm_host	*host;
 
 	mutex_lock(&nlm_host_mutex);
 
 	/* First, make all hosts eligible for gc */
 	dprintk("lockd: nuking all hosts in net %p...\n", net);
-	for_each_host(host, pos, chain, nlm_server_hosts) {
+	for_each_host(host, chain, nlm_server_hosts) {
 		if (net && host->net != net)
 			continue;
 		host->h_expires = jiffies - 1;
@@ -644,11 +640,11 @@ static void
 nlm_gc_hosts(struct net *net)
 {
 	struct hlist_head *chain;
-	struct hlist_node *pos, *next;
+	struct hlist_node *next;
 	struct nlm_host	*host;
 
 	dprintk("lockd: host garbage collection for net %p\n", net);
-	for_each_host(host, pos, chain, nlm_server_hosts) {
+	for_each_host(host, chain, nlm_server_hosts) {
 		if (net && host->net != net)
 			continue;
 		host->h_inuse = 0;
@@ -657,7 +653,7 @@ nlm_gc_hosts(struct net *net)
 	/* Mark all hosts that hold locks, blocks or shares */
 	nlmsvc_mark_resources(net);
 
-	for_each_host_safe(host, pos, next, chain, nlm_server_hosts) {
+	for_each_host_safe(host, next, chain, nlm_server_hosts) {
 		if (net && host->net != net)
 			continue;
 		if (atomic_read(&host->h_count) || host->h_inuse
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 3c2cfc68363..1812f026960 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -12,6 +12,7 @@
 #include <linux/slab.h>
 
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/xprtsock.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index a2aa97d4567..10d6c41aeca 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -305,7 +305,7 @@ static int lockd_start_svc(struct svc_serv *serv)
 	svc_sock_update_bufs(serv);
 	serv->sv_maxconn = nlm_max_connections;
 
-	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
+	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name);
 	if (IS_ERR(nlmsvc_task)) {
 		error = PTR_ERR(nlmsvc_task);
 		printk(KERN_WARNING
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 8d80c990dff..067778b0ccc 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -276,7 +276,7 @@ static int nlmsvc_unlink_block(struct nlm_block *block)
 	dprintk("lockd: unlinking block %p...\n", block);
 
 	/* Remove block from list */
-	status = posix_unblock_lock(block->b_file->f_file, &block->b_call->a_args.lock.fl);
+	status = posix_unblock_lock(&block->b_call->a_args.lock.fl);
 	nlmsvc_remove_block(block);
 	return status;
 }
@@ -406,8 +406,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 	__be32			ret;
 
 	dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n",
-				file->f_file->f_path.dentry->d_inode->i_sb->s_id,
-				file->f_file->f_path.dentry->d_inode->i_ino,
+				file_inode(file->f_file)->i_sb->s_id,
+				file_inode(file->f_file)->i_ino,
 				lock->fl.fl_type, lock->fl.fl_pid,
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end,
@@ -513,8 +513,8 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 	__be32			ret;
 
 	dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
-				file->f_file->f_path.dentry->d_inode->i_sb->s_id,
-				file->f_file->f_path.dentry->d_inode->i_ino,
+				file_inode(file->f_file)->i_sb->s_id,
+				file_inode(file->f_file)->i_ino,
 				lock->fl.fl_type,
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
@@ -606,8 +606,8 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock)
 	int	error;
 
 	dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n",
-				file->f_file->f_path.dentry->d_inode->i_sb->s_id,
-				file->f_file->f_path.dentry->d_inode->i_ino,
+				file_inode(file->f_file)->i_sb->s_id,
+				file_inode(file->f_file)->i_ino,
 				lock->fl.fl_pid,
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
@@ -635,8 +635,8 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
 	int status = 0;
 
 	dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n",
-				file->f_file->f_path.dentry->d_inode->i_sb->s_id,
-				file->f_file->f_path.dentry->d_inode->i_ino,
+				file_inode(file->f_file)->i_sb->s_id,
+				file_inode(file->f_file)->i_ino,
 				lock->fl.fl_pid,
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
@@ -744,8 +744,20 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 	return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
 }
 
+/*
+ * Since NLM uses two "keys" for tracking locks, we need to hash them down
+ * to one for the blocked_hash. Here, we're just xor'ing the host address
+ * with the pid in order to create a key value for picking a hash bucket.
+ */
+static unsigned long
+nlmsvc_owner_key(struct file_lock *fl)
+{
+	return (unsigned long)fl->fl_owner ^ (unsigned long)fl->fl_pid;
+}
+
 const struct lock_manager_operations nlmsvc_lock_operations = {
 	.lm_compare_owner = nlmsvc_same_owner,
+	.lm_owner_key = nlmsvc_owner_key,
 	.lm_notify = nlmsvc_notify_blocked,
 	.lm_grant = nlmsvc_grant_deferred,
 };
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 0deb5f6c9dd..dc5c75930f0 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -13,7 +13,7 @@
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/sunrpc/svc.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/nfsd/nfsfh.h>
 #include <linux/nfsd/export.h>
 #include <linux/lockd/lockd.h>
@@ -45,7 +45,7 @@ static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f)
 
 static inline void nlm_debug_print_file(char *msg, struct nlm_file *file)
 {
-	struct inode *inode = file->f_file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file->f_file);
 
 	dprintk("lockd: %s %s/%ld\n",
 		msg, inode->i_sb->s_id, inode->i_ino);
@@ -84,7 +84,6 @@ __be32
 nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
 					struct nfs_fh *f)
 {
-	struct hlist_node *pos;
 	struct nlm_file	*file;
 	unsigned int	hash;
 	__be32		nfserr;
@@ -96,7 +95,7 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
 	/* Lock file table */
 	mutex_lock(&nlm_file_mutex);
 
-	hlist_for_each_entry(file, pos, &nlm_files[hash], f_list)
+	hlist_for_each_entry(file, &nlm_files[hash], f_list)
 		if (!nfs_compare_fh(&file->f_handle, f))
 			goto found;
 
@@ -170,7 +169,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
 
 again:
 	file->f_locks = 0;
-	lock_flocks(); /* protects i_flock list */
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
 		if (fl->fl_lmops != &nlmsvc_lock_operations)
 			continue;
@@ -182,7 +181,7 @@ again:
 		if (match(lockhost, host)) {
 			struct file_lock lock = *fl;
 
-			unlock_flocks();
+			spin_unlock(&inode->i_lock);
 			lock.fl_type  = F_UNLCK;
 			lock.fl_start = 0;
 			lock.fl_end   = OFFSET_MAX;
@@ -194,7 +193,7 @@ again:
 			goto again;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	return 0;
 }
@@ -229,14 +228,14 @@ nlm_file_inuse(struct nlm_file *file)
 	if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
 		return 1;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
 		if (fl->fl_lmops == &nlmsvc_lock_operations) {
-			unlock_flocks();
+			spin_unlock(&inode->i_lock);
 			return 1;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	file->f_locks = 0;
 	return 0;
 }
@@ -248,13 +247,13 @@ static int
 nlm_traverse_files(void *data, nlm_host_match_fn_t match,
 		int (*is_failover_file)(void *data, struct nlm_file *file))
 {
-	struct hlist_node *pos, *next;
+	struct hlist_node *next;
 	struct nlm_file	*file;
 	int i, ret = 0;
 
 	mutex_lock(&nlm_file_mutex);
 	for (i = 0; i < FILE_NRHASH; i++) {
-		hlist_for_each_entry_safe(file, pos, next, &nlm_files[i], f_list) {
+		hlist_for_each_entry_safe(file, next, &nlm_files[i], f_list) {
 			if (is_failover_file && !is_failover_file(data, file))
 				continue;
 			file->f_count++;
diff --git a/fs/locks.c b/fs/locks.c
index a94e331a52a..b27a3005d78 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -126,6 +126,9 @@
 #include <linux/time.h>
 #include <linux/rcupdate.h>
 #include <linux/pid_namespace.h>
+#include <linux/hashtable.h>
+#include <linux/percpu.h>
+#include <linux/lglock.h>
 
 #include <asm/uaccess.h>
 
@@ -153,30 +156,53 @@ int lease_break_time = 45;
 #define for_each_lock(inode, lockp) \
 	for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
 
-static LIST_HEAD(file_lock_list);
-static LIST_HEAD(blocked_list);
-static DEFINE_SPINLOCK(file_lock_lock);
+/*
+ * The global file_lock_list is only used for displaying /proc/locks, so we
+ * keep a list on each CPU, with each list protected by its own spinlock via
+ * the file_lock_lglock. Note that alterations to the list also require that
+ * the relevant i_lock is held.
+ */
+DEFINE_STATIC_LGLOCK(file_lock_lglock);
+static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
 
 /*
- * Protects the two list heads above, plus the inode->i_flock list
+ * The blocked_hash is used to find POSIX lock loops for deadlock detection.
+ * It is protected by blocked_lock_lock.
+ *
+ * We hash locks by lockowner in order to optimize searching for the lock a
+ * particular lockowner is waiting on.
+ *
+ * FIXME: make this value scale via some heuristic? We generally will want more
+ * buckets when we have more lockowners holding locks, but that's a little
+ * difficult to determine without knowing what the workload will look like.
  */
-void lock_flocks(void)
-{
-	spin_lock(&file_lock_lock);
-}
-EXPORT_SYMBOL_GPL(lock_flocks);
+#define BLOCKED_HASH_BITS	7
+static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
 
-void unlock_flocks(void)
-{
-	spin_unlock(&file_lock_lock);
-}
-EXPORT_SYMBOL_GPL(unlock_flocks);
+/*
+ * This lock protects the blocked_hash. Generally, if you're accessing it, you
+ * want to be holding this lock.
+ *
+ * In addition, it also protects the fl->fl_block list, and the fl->fl_next
+ * pointer for file_lock structures that are acting as lock requests (in
+ * contrast to those that are acting as records of acquired locks).
+ *
+ * Note that when we acquire this lock in order to change the above fields,
+ * we often hold the i_lock as well. In certain cases, when reading the fields
+ * protected by this lock, we can skip acquiring it iff we already hold the
+ * i_lock.
+ *
+ * In particular, adding an entry to the fl_block list requires that you hold
+ * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting
+ * an entry from the list however only requires the file_lock_lock.
+ */
+static DEFINE_SPINLOCK(blocked_lock_lock);
 
 static struct kmem_cache *filelock_cache __read_mostly;
 
 static void locks_init_lock_heads(struct file_lock *fl)
 {
-	INIT_LIST_HEAD(&fl->fl_link);
+	INIT_HLIST_NODE(&fl->fl_link);
 	INIT_LIST_HEAD(&fl->fl_block);
 	init_waitqueue_head(&fl->fl_wait);
 }
@@ -210,7 +236,7 @@ void locks_free_lock(struct file_lock *fl)
 {
 	BUG_ON(waitqueue_active(&fl->fl_wait));
 	BUG_ON(!list_empty(&fl->fl_block));
-	BUG_ON(!list_empty(&fl->fl_link));
+	BUG_ON(!hlist_unhashed(&fl->fl_link));
 
 	locks_release_private(fl);
 	kmem_cache_free(filelock_cache, fl);
@@ -334,7 +360,7 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
 		start = filp->f_pos;
 		break;
 	case SEEK_END:
-		start = i_size_read(filp->f_path.dentry->d_inode);
+		start = i_size_read(file_inode(filp));
 		break;
 	default:
 		return -EINVAL;
@@ -384,7 +410,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
 		start = filp->f_pos;
 		break;
 	case SEEK_END:
-		start = i_size_read(filp->f_path.dentry->d_inode);
+		start = i_size_read(file_inode(filp));
 		break;
 	default:
 		return -EINVAL;
@@ -484,47 +510,118 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 	return fl1->fl_owner == fl2->fl_owner;
 }
 
+/* Must be called with the i_lock held! */
+static inline void
+locks_insert_global_locks(struct file_lock *fl)
+{
+	lg_local_lock(&file_lock_lglock);
+	fl->fl_link_cpu = smp_processor_id();
+	hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
+	lg_local_unlock(&file_lock_lglock);
+}
+
+/* Must be called with the i_lock held! */
+static inline void
+locks_delete_global_locks(struct file_lock *fl)
+{
+	/*
+	 * Avoid taking lock if already unhashed. This is safe since this check
+	 * is done while holding the i_lock, and new insertions into the list
+	 * also require that it be held.
+	 */
+	if (hlist_unhashed(&fl->fl_link))
+		return;
+	lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+	hlist_del_init(&fl->fl_link);
+	lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+}
+
+static unsigned long
+posix_owner_key(struct file_lock *fl)
+{
+	if (fl->fl_lmops && fl->fl_lmops->lm_owner_key)
+		return fl->fl_lmops->lm_owner_key(fl);
+	return (unsigned long)fl->fl_owner;
+}
+
+static inline void
+locks_insert_global_blocked(struct file_lock *waiter)
+{
+	hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
+}
+
+static inline void
+locks_delete_global_blocked(struct file_lock *waiter)
+{
+	hash_del(&waiter->fl_link);
+}
+
 /* Remove waiter from blocker's block list.
  * When blocker ends up pointing to itself then the list is empty.
+ *
+ * Must be called with blocked_lock_lock held.
  */
 static void __locks_delete_block(struct file_lock *waiter)
 {
+	locks_delete_global_blocked(waiter);
 	list_del_init(&waiter->fl_block);
-	list_del_init(&waiter->fl_link);
 	waiter->fl_next = NULL;
 }
 
-/*
- */
-void locks_delete_block(struct file_lock *waiter)
+static void locks_delete_block(struct file_lock *waiter)
 {
-	lock_flocks();
+	spin_lock(&blocked_lock_lock);
 	__locks_delete_block(waiter);
-	unlock_flocks();
+	spin_unlock(&blocked_lock_lock);
 }
-EXPORT_SYMBOL(locks_delete_block);
 
 /* Insert waiter into blocker's block list.
  * We use a circular list so that processes can be easily woken up in
  * the order they blocked. The documentation doesn't require this but
  * it seems like the reasonable thing to do.
+ *
+ * Must be called with both the i_lock and blocked_lock_lock held. The fl_block
+ * list itself is protected by the file_lock_list, but by ensuring that the
+ * i_lock is also held on insertions we can avoid taking the blocked_lock_lock
+ * in some cases when we see that the fl_block list is empty.
  */
-static void locks_insert_block(struct file_lock *blocker, 
-			       struct file_lock *waiter)
+static void __locks_insert_block(struct file_lock *blocker,
+					struct file_lock *waiter)
 {
 	BUG_ON(!list_empty(&waiter->fl_block));
-	list_add_tail(&waiter->fl_block, &blocker->fl_block);
 	waiter->fl_next = blocker;
+	list_add_tail(&waiter->fl_block, &blocker->fl_block);
 	if (IS_POSIX(blocker))
-		list_add(&waiter->fl_link, &blocked_list);
+		locks_insert_global_blocked(waiter);
 }
 
-/* Wake up processes blocked waiting for blocker.
- * If told to wait then schedule the processes until the block list
- * is empty, otherwise empty the block list ourselves.
+/* Must be called with i_lock held. */
+static void locks_insert_block(struct file_lock *blocker,
+					struct file_lock *waiter)
+{
+	spin_lock(&blocked_lock_lock);
+	__locks_insert_block(blocker, waiter);
+	spin_unlock(&blocked_lock_lock);
+}
+
+/*
+ * Wake up processes blocked waiting for blocker.
+ *
+ * Must be called with the inode->i_lock held!
  */
 static void locks_wake_up_blocks(struct file_lock *blocker)
 {
+	/*
+	 * Avoid taking global lock if list is empty. This is safe since new
+	 * blocked requests are only added to the list under the i_lock, and
+	 * the i_lock is always held here. Note that removal from the fl_block
+	 * list does not require the i_lock, so we must recheck list_empty()
+	 * after acquiring the blocked_lock_lock.
+	 */
+	if (list_empty(&blocker->fl_block))
+		return;
+
+	spin_lock(&blocked_lock_lock);
 	while (!list_empty(&blocker->fl_block)) {
 		struct file_lock *waiter;
 
@@ -536,20 +633,23 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
 		else
 			wake_up(&waiter->fl_wait);
 	}
+	spin_unlock(&blocked_lock_lock);
 }
 
 /* Insert file lock fl into an inode's lock list at the position indicated
  * by pos. At the same time add the lock to the global file lock list.
+ *
+ * Must be called with the i_lock held!
  */
 static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
 {
-	list_add(&fl->fl_link, &file_lock_list);
-
 	fl->fl_nspid = get_pid(task_tgid(current));
 
 	/* insert into file's list */
 	fl->fl_next = *pos;
 	*pos = fl;
+
+	locks_insert_global_locks(fl);
 }
 
 /*
@@ -557,14 +657,17 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
  * Wake up processes that are blocked waiting for this lock,
  * notify the FS that the lock has been cleared and
  * finally free the lock.
+ *
+ * Must be called with the i_lock held!
  */
 static void locks_delete_lock(struct file_lock **thisfl_p)
 {
 	struct file_lock *fl = *thisfl_p;
 
+	locks_delete_global_locks(fl);
+
 	*thisfl_p = fl->fl_next;
 	fl->fl_next = NULL;
-	list_del_init(&fl->fl_link);
 
 	if (fl->fl_nspid) {
 		put_pid(fl->fl_nspid);
@@ -625,9 +728,10 @@ void
 posix_test_lock(struct file *filp, struct file_lock *fl)
 {
 	struct file_lock *cfl;
+	struct inode *inode = file_inode(filp);
 
-	lock_flocks();
-	for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) {
+	spin_lock(&inode->i_lock);
+	for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) {
 		if (!IS_POSIX(cfl))
 			continue;
 		if (posix_locks_conflict(fl, cfl))
@@ -639,7 +743,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 			fl->fl_pid = pid_vnr(cfl->fl_nspid);
 	} else
 		fl->fl_type = F_UNLCK;
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return;
 }
 EXPORT_SYMBOL(posix_test_lock);
@@ -676,13 +780,14 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
 {
 	struct file_lock *fl;
 
-	list_for_each_entry(fl, &blocked_list, fl_link) {
+	hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
 		if (posix_same_owner(fl, block_fl))
 			return fl->fl_next;
 	}
 	return NULL;
 }
 
+/* Must be called with the blocked_lock_lock held! */
 static int posix_locks_deadlock(struct file_lock *caller_fl,
 				struct file_lock *block_fl)
 {
@@ -708,7 +813,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 {
 	struct file_lock *new_fl = NULL;
 	struct file_lock **before;
-	struct inode * inode = filp->f_path.dentry->d_inode;
+	struct inode * inode = file_inode(filp);
 	int error = 0;
 	int found = 0;
 
@@ -718,7 +823,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 			return -ENOMEM;
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	if (request->fl_flags & FL_ACCESS)
 		goto find_conflict;
 
@@ -748,9 +853,9 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	 * give it the opportunity to lock the file.
 	 */
 	if (found) {
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		cond_resched();
-		lock_flocks();
+		spin_lock(&inode->i_lock);
 	}
 
 find_conflict:
@@ -777,7 +882,7 @@ find_conflict:
 	error = 0;
 
 out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	if (new_fl)
 		locks_free_lock(new_fl);
 	return error;
@@ -791,7 +896,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	struct file_lock *left = NULL;
 	struct file_lock *right = NULL;
 	struct file_lock **before;
-	int error, added = 0;
+	int error;
+	bool added = false;
 
 	/*
 	 * We may need two file_lock structures for this operation,
@@ -806,7 +912,12 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		new_fl2 = locks_alloc_lock();
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
+	/*
+	 * New lock request. Walk all POSIX locks and look for conflicts. If
+	 * there are any, either return error or put the request on the
+	 * blocker's list of waiters and the global blocked_hash.
+	 */
 	if (request->fl_type != F_UNLCK) {
 		for_each_lock(inode, before) {
 			fl = *before;
@@ -819,11 +930,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			error = -EAGAIN;
 			if (!(request->fl_flags & FL_SLEEP))
 				goto out;
+			/*
+			 * Deadlock detection and insertion into the blocked
+			 * locks list must be done while holding the same lock!
+			 */
 			error = -EDEADLK;
-			if (posix_locks_deadlock(request, fl))
-				goto out;
-			error = FILE_LOCK_DEFERRED;
-			locks_insert_block(fl, request);
+			spin_lock(&blocked_lock_lock);
+			if (likely(!posix_locks_deadlock(request, fl))) {
+				error = FILE_LOCK_DEFERRED;
+				__locks_insert_block(fl, request);
+			}
+			spin_unlock(&blocked_lock_lock);
 			goto out;
   		}
   	}
@@ -845,7 +962,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		before = &fl->fl_next;
 	}
 
-	/* Process locks with this owner.  */
+	/* Process locks with this owner. */
 	while ((fl = *before) && posix_same_owner(request, fl)) {
 		/* Detect adjacent or overlapping regions (if same lock type)
 		 */
@@ -880,7 +997,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 				continue;
 			}
 			request = fl;
-			added = 1;
+			added = true;
 		}
 		else {
 			/* Processing for different lock types is a bit
@@ -891,7 +1008,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			if (fl->fl_start > request->fl_end)
 				break;
 			if (request->fl_type == F_UNLCK)
-				added = 1;
+				added = true;
 			if (fl->fl_start < request->fl_start)
 				left = fl;
 			/* If the next lock in the list has a higher end
@@ -921,7 +1038,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 				locks_release_private(fl);
 				locks_copy_private(fl, request);
 				request = fl;
-				added = 1;
+				added = true;
 			}
 		}
 		/* Go on to next lock.
@@ -931,10 +1048,9 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	}
 
 	/*
-	 * The above code only modifies existing locks in case of
-	 * merging or replacing.  If new lock(s) need to be inserted
-	 * all modifications are done bellow this, so it's safe yet to
-	 * bail out.
+	 * The above code only modifies existing locks in case of merging or
+	 * replacing. If new lock(s) need to be inserted all modifications are
+	 * done below this, so it's safe yet to bail out.
 	 */
 	error = -ENOLCK; /* "no luck" */
 	if (right && left == right && !new_fl2)
@@ -974,7 +1090,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		locks_wake_up_blocks(left);
 	}
  out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	/*
 	 * Free any unused locks.
 	 */
@@ -1002,7 +1118,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 int posix_lock_file(struct file *filp, struct file_lock *fl,
 			struct file_lock *conflock)
 {
-	return __posix_lock_file(filp->f_path.dentry->d_inode, fl, conflock);
+	return __posix_lock_file(file_inode(filp), fl, conflock);
 }
 EXPORT_SYMBOL(posix_lock_file);
 
@@ -1049,14 +1165,14 @@ int locks_mandatory_locked(struct inode *inode)
 	/*
 	 * Search the lock list for this inode for any POSIX locks.
 	 */
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!IS_POSIX(fl))
 			continue;
 		if (fl->fl_owner != owner)
 			break;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return fl ? -EAGAIN : 0;
 }
 
@@ -1199,7 +1315,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
 	if (IS_ERR(new_fl))
 		return PTR_ERR(new_fl);
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 
 	time_out_leases(inode);
 
@@ -1249,11 +1365,11 @@ restart:
 			break_time++;
 	}
 	locks_insert_block(flock, new_fl);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	error = wait_event_interruptible_timeout(new_fl->fl_wait,
 						!new_fl->fl_next, break_time);
-	lock_flocks();
-	__locks_delete_block(new_fl);
+	spin_lock(&inode->i_lock);
+	locks_delete_block(new_fl);
 	if (error >= 0) {
 		if (error == 0)
 			time_out_leases(inode);
@@ -1270,7 +1386,7 @@ restart:
 	}
 
 out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	locks_free_lock(new_fl);
 	return error;
 }
@@ -1323,22 +1439,23 @@ EXPORT_SYMBOL(lease_get_mtime);
 int fcntl_getlease(struct file *filp)
 {
 	struct file_lock *fl;
+	struct inode *inode = file_inode(filp);
 	int type = F_UNLCK;
 
-	lock_flocks();
-	time_out_leases(filp->f_path.dentry->d_inode);
-	for (fl = filp->f_path.dentry->d_inode->i_flock; fl && IS_LEASE(fl);
+	spin_lock(&inode->i_lock);
+	time_out_leases(file_inode(filp));
+	for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl);
 			fl = fl->fl_next) {
 		if (fl->fl_file == filp) {
 			type = target_leasetype(fl);
 			break;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return type;
 }
 
-int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
+static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
 	struct dentry *dentry = filp->f_path.dentry;
@@ -1351,7 +1468,7 @@ int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
 	if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
 		goto out;
 	if ((arg == F_WRLCK)
-	    && ((dentry->d_count > 1)
+	    && ((d_count(dentry) > 1)
 		|| (atomic_read(&inode->i_count) > 1)))
 		goto out;
 
@@ -1403,7 +1520,7 @@ out:
 	return error;
 }
 
-int generic_delete_lease(struct file *filp, struct file_lock **flp)
+static int generic_delete_lease(struct file *filp, struct file_lock **flp)
 {
 	struct file_lock *fl, **before;
 	struct dentry *dentry = filp->f_path.dentry;
@@ -1428,7 +1545,7 @@ int generic_delete_lease(struct file *filp, struct file_lock **flp)
  *	The (input) flp->fl_lmops->lm_break function is required
  *	by break_lease().
  *
- *	Called with file_lock_lock held.
+ *	Called with inode->i_lock held.
  */
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 {
@@ -1497,11 +1614,12 @@ static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 
 int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 {
+	struct inode *inode = file_inode(filp);
 	int error;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	error = __vfs_setlease(filp, arg, lease);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	return error;
 }
@@ -1519,6 +1637,7 @@ static int do_fcntl_delete_lease(struct file *filp)
 static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 {
 	struct file_lock *fl, *ret;
+	struct inode *inode = file_inode(filp);
 	struct fasync_struct *new;
 	int error;
 
@@ -1532,10 +1651,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 		return -ENOMEM;
 	}
 	ret = fl;
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	error = __vfs_setlease(filp, arg, &ret);
 	if (error) {
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		locks_free_lock(fl);
 		goto out_free_fasync;
 	}
@@ -1552,7 +1671,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 		new = NULL;
 
 	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 out_free_fasync:
 	if (new)
@@ -1843,7 +1962,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
 	if (copy_from_user(&flock, l, sizeof(flock)))
 		goto out;
 
-	inode = filp->f_path.dentry->d_inode;
+	inode = file_inode(filp);
 
 	/* Don't allow mandatory locks on files that may be memory mapped
 	 * and shared.
@@ -1961,7 +2080,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
 	if (copy_from_user(&flock, l, sizeof(flock)))
 		goto out;
 
-	inode = filp->f_path.dentry->d_inode;
+	inode = file_inode(filp);
 
 	/* Don't allow mandatory locks on files that may be memory mapped
 	 * and shared.
@@ -2030,7 +2149,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
 	 * posix_lock_file().  Another process could be setting a lock on this
 	 * file at the same time, but we wouldn't remove that lock anyway.
 	 */
-	if (!filp->f_path.dentry->d_inode->i_flock)
+	if (!file_inode(filp)->i_flock)
 		return;
 
 	lock.fl_type = F_UNLCK;
@@ -2056,7 +2175,7 @@ EXPORT_SYMBOL(locks_remove_posix);
  */
 void locks_remove_flock(struct file *filp)
 {
-	struct inode * inode = filp->f_path.dentry->d_inode;
+	struct inode * inode = file_inode(filp);
 	struct file_lock *fl;
 	struct file_lock **before;
 
@@ -2076,7 +2195,7 @@ void locks_remove_flock(struct file *filp)
 			fl.fl_ops->fl_release_private(&fl);
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	before = &inode->i_flock;
 
 	while ((fl = *before) != NULL) {
@@ -2094,30 +2213,28 @@ void locks_remove_flock(struct file *filp)
  		}
 		before = &fl->fl_next;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 }
 
 /**
  *	posix_unblock_lock - stop waiting for a file lock
- *      @filp:   how the file was opened
  *	@waiter: the lock which was waiting
  *
  *	lockd needs to block waiting for locks.
  */
 int
-posix_unblock_lock(struct file *filp, struct file_lock *waiter)
+posix_unblock_lock(struct file_lock *waiter)
 {
 	int status = 0;
 
-	lock_flocks();
+	spin_lock(&blocked_lock_lock);
 	if (waiter->fl_next)
 		__locks_delete_block(waiter);
 	else
 		status = -ENOENT;
-	unlock_flocks();
+	spin_unlock(&blocked_lock_lock);
 	return status;
 }
-
 EXPORT_SYMBOL(posix_unblock_lock);
 
 /**
@@ -2140,6 +2257,11 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
+struct locks_iterator {
+	int	li_cpu;
+	loff_t	li_pos;
+};
+
 static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 			    loff_t id, char *pfx)
 {
@@ -2152,7 +2274,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 		fl_pid = fl->fl_pid;
 
 	if (fl->fl_file != NULL)
-		inode = fl->fl_file->f_path.dentry->d_inode;
+		inode = file_inode(fl->fl_file);
 
 	seq_printf(f, "%lld:%s ", id, pfx);
 	if (IS_POSIX(fl)) {
@@ -2213,37 +2335,41 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 
 static int locks_show(struct seq_file *f, void *v)
 {
+	struct locks_iterator *iter = f->private;
 	struct file_lock *fl, *bfl;
 
-	fl = list_entry(v, struct file_lock, fl_link);
+	fl = hlist_entry(v, struct file_lock, fl_link);
 
-	lock_get_status(f, fl, *((loff_t *)f->private), "");
+	lock_get_status(f, fl, iter->li_pos, "");
 
 	list_for_each_entry(bfl, &fl->fl_block, fl_block)
-		lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
+		lock_get_status(f, bfl, iter->li_pos, " ->");
 
 	return 0;
 }
 
 static void *locks_start(struct seq_file *f, loff_t *pos)
 {
-	loff_t *p = f->private;
+	struct locks_iterator *iter = f->private;
 
-	lock_flocks();
-	*p = (*pos + 1);
-	return seq_list_start(&file_lock_list, *pos);
+	iter->li_pos = *pos + 1;
+	lg_global_lock(&file_lock_lglock);
+	spin_lock(&blocked_lock_lock);
+	return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
 }
 
 static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 {
-	loff_t *p = f->private;
-	++*p;
-	return seq_list_next(v, &file_lock_list, pos);
+	struct locks_iterator *iter = f->private;
+
+	++iter->li_pos;
+	return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
 }
 
 static void locks_stop(struct seq_file *f, void *v)
 {
-	unlock_flocks();
+	spin_unlock(&blocked_lock_lock);
+	lg_global_unlock(&file_lock_lglock);
 }
 
 static const struct seq_operations locks_seq_operations = {
@@ -2255,7 +2381,8 @@ static const struct seq_operations locks_seq_operations = {
 
 static int locks_open(struct inode *inode, struct file *filp)
 {
-	return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
+	return seq_open_private(filp, &locks_seq_operations,
+					sizeof(struct locks_iterator));
 }
 
 static const struct file_operations proc_locks_operations = {
@@ -2290,7 +2417,8 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_flocks();
+
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if (fl->fl_type == F_RDLCK)
@@ -2307,7 +2435,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 		result = 0;
 		break;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return result;
 }
 
@@ -2330,7 +2458,8 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_flocks();
+
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2345,7 +2474,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 		result = 0;
 		break;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return result;
 }
 
@@ -2353,9 +2482,16 @@ EXPORT_SYMBOL(lock_may_write);
 
 static int __init filelock_init(void)
 {
+	int i;
+
 	filelock_cache = kmem_cache_create("file_lock_cache",
 			sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
 
+	lg_lock_init(&file_lock_lglock, "file_lock_lglock");
+
+	for_each_possible_cpu(i)
+		INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
+
 	return 0;
 }
 
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
index daf9a9b32dd..09ed066c022 100644
--- a/fs/logfs/Kconfig
+++ b/fs/logfs/Kconfig
@@ -1,6 +1,6 @@
 config LOGFS
-	tristate "LogFS file system (EXPERIMENTAL)"
-	depends on (MTD || BLOCK) && EXPERIMENTAL
+	tristate "LogFS file system"
+	depends on (MTD || BLOCK)
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
 	select CRC32
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index e784a217b50..550475ca6a0 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -32,7 +32,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
 	bio_vec.bv_len = PAGE_SIZE;
 	bio_vec.bv_offset = 0;
 	bio.bi_vcnt = 1;
-	bio.bi_idx = 0;
 	bio.bi_size = PAGE_SIZE;
 	bio.bi_bdev = bdev;
 	bio.bi_sector = page->index * (PAGE_SIZE >> 9);
@@ -108,7 +107,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
 		if (i >= max_pages) {
 			/* Block layer cannot split bios :( */
 			bio->bi_vcnt = i;
-			bio->bi_idx = 0;
 			bio->bi_size = i * PAGE_SIZE;
 			bio->bi_bdev = super->s_bdev;
 			bio->bi_sector = ofs >> 9;
@@ -136,7 +134,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
 		unlock_page(page);
 	}
 	bio->bi_vcnt = nr_pages;
-	bio->bi_idx = 0;
 	bio->bi_size = nr_pages * PAGE_SIZE;
 	bio->bi_bdev = super->s_bdev;
 	bio->bi_sector = ofs >> 9;
@@ -202,7 +199,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
 		if (i >= max_pages) {
 			/* Block layer cannot split bios :( */
 			bio->bi_vcnt = i;
-			bio->bi_idx = 0;
 			bio->bi_size = i * PAGE_SIZE;
 			bio->bi_bdev = super->s_bdev;
 			bio->bi_sector = ofs >> 9;
@@ -224,7 +220,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
 		bio->bi_io_vec[i].bv_offset = 0;
 	}
 	bio->bi_vcnt = nr_pages;
-	bio->bi_idx = 0;
 	bio->bi_size = nr_pages * PAGE_SIZE;
 	bio->bi_bdev = super->s_bdev;
 	bio->bi_sector = ofs >> 9;
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 26e4a941532..6bdc347008f 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -281,17 +281,23 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
 
 /* FIXME: readdir currently has it's own dir_walk code.  I don't see a good
  * way to combine the two copies */
-#define IMPLICIT_NODES 2
-static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
+static int logfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *dir = file->f_dentry->d_inode;
-	loff_t pos = file->f_pos - IMPLICIT_NODES;
+	struct inode *dir = file_inode(file);
+	loff_t pos;
 	struct page *page;
 	struct logfs_disk_dentry *dd;
-	int full;
 
+	if (ctx->pos < 0)
+		return -EINVAL;
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	pos = ctx->pos - 2;
 	BUG_ON(pos < 0);
-	for (;; pos++) {
+	for (;; pos++, ctx->pos++) {
+		bool full;
 		if (beyond_eof(dir, pos))
 			break;
 		if (!logfs_exist_block(dir, pos)) {
@@ -306,42 +312,17 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
 		dd = kmap(page);
 		BUG_ON(dd->namelen == 0);
 
-		full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
-				pos, be64_to_cpu(dd->ino), dd->type);
+		full = !dir_emit(ctx, (char *)dd->name,
+				be16_to_cpu(dd->namelen),
+				be64_to_cpu(dd->ino), dd->type);
 		kunmap(page);
 		page_cache_release(page);
 		if (full)
 			break;
 	}
-
-	file->f_pos = pos + IMPLICIT_NODES;
 	return 0;
 }
 
-static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	ino_t pino = parent_ino(file->f_dentry);
-	int err;
-
-	if (file->f_pos < 0)
-		return -EINVAL;
-
-	if (file->f_pos == 0) {
-		if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
-			return 0;
-		file->f_pos++;
-	}
-	if (file->f_pos == 1) {
-		if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
-			return 0;
-		file->f_pos++;
-	}
-
-	err = __logfs_readdir(file, buf, filldir);
-	return err;
-}
-
 static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
 {
 	dd->namelen = cpu_to_be16(name->len);
@@ -814,7 +795,7 @@ const struct inode_operations logfs_dir_iops = {
 const struct file_operations logfs_dir_fops = {
 	.fsync		= logfs_fsync,
 	.unlocked_ioctl	= logfs_ioctl,
-	.readdir	= logfs_readdir,
+	.iterate	= logfs_readdir,
 	.read		= generic_read_dir,
 	.llseek		= default_llseek,
 };
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 3886cded283..57914fc32b6 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -159,7 +159,8 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
 	return __logfs_writepage(page);
 }
 
-static void logfs_invalidatepage(struct page *page, unsigned long offset)
+static void logfs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct logfs_block *block = logfs_block(page);
 
@@ -183,7 +184,7 @@ static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
 
 long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct logfs_inode *li = logfs_inode(inode);
 	unsigned int oldflags, flags;
 	int err;
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 038da099179..d448a777166 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -884,7 +884,8 @@ static struct logfs_area *alloc_area(struct super_block *sb)
 	return area;
 }
 
-static void map_invalidatepage(struct page *page, unsigned long l)
+static void map_invalidatepage(struct page *page, unsigned int o,
+			       unsigned int l)
 {
 	return;
 }
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 345c24b8a6f..54360293bcb 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -608,6 +608,7 @@ static struct file_system_type logfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 
 };
+MODULE_ALIAS_FS("logfs");
 
 static int __init logfs_init(void)
 {
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 685b2d981b8..dfaf6fa9b7b 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -16,12 +16,12 @@
 typedef struct minix_dir_entry minix_dirent;
 typedef struct minix3_dir_entry minix3_dirent;
 
-static int minix_readdir(struct file *, void *, filldir_t);
+static int minix_readdir(struct file *, struct dir_context *);
 
 const struct file_operations minix_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= minix_readdir,
+	.iterate	= minix_readdir,
 	.fsync		= generic_file_fsync,
 };
 
@@ -82,22 +82,23 @@ static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
 	return (void*)((char*)de + sbi->s_dirsize);
 }
 
-static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int minix_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned long pos = filp->f_pos;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	unsigned offset = pos & ~PAGE_CACHE_MASK;
-	unsigned long n = pos >> PAGE_CACHE_SHIFT;
-	unsigned long npages = dir_pages(inode);
 	struct minix_sb_info *sbi = minix_sb(sb);
 	unsigned chunk_size = sbi->s_dirsize;
-	char *name;
-	__u32 inumber;
+	unsigned long npages = dir_pages(inode);
+	unsigned long pos = ctx->pos;
+	unsigned offset;
+	unsigned long n;
 
-	pos = (pos + chunk_size-1) & ~(chunk_size-1);
+	ctx->pos = pos = ALIGN(pos, chunk_size);
 	if (pos >= inode->i_size)
-		goto done;
+		return 0;
+
+	offset = pos & ~PAGE_CACHE_MASK;
+	n = pos >> PAGE_CACHE_SHIFT;
 
 	for ( ; n < npages; n++, offset = 0) {
 		char *p, *kaddr, *limit;
@@ -109,6 +110,8 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
 		p = kaddr+offset;
 		limit = kaddr + minix_last_byte(inode, n) - chunk_size;
 		for ( ; p <= limit; p = minix_next_entry(p, sbi)) {
+			const char *name;
+			__u32 inumber;
 			if (sbi->s_version == MINIX_V3) {
 				minix3_dirent *de3 = (minix3_dirent *)p;
 				name = de3->name;
@@ -119,24 +122,17 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
 				inumber = de->inode;
 			}
 			if (inumber) {
-				int over;
-
 				unsigned l = strnlen(name, sbi->s_namelen);
-				offset = p - kaddr;
-				over = filldir(dirent, name, l,
-					(n << PAGE_CACHE_SHIFT) | offset,
-					inumber, DT_UNKNOWN);
-				if (over) {
+				if (!dir_emit(ctx, name, l,
+					      inumber, DT_UNKNOWN)) {
 					dir_put_page(page);
-					goto done;
+					return 0;
 				}
 			}
+			ctx->pos += chunk_size;
 		}
 		dir_put_page(page);
 	}
-
-done:
-	filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
 	return 0;
 }
 
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 99541cceb58..df122496f32 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -660,6 +660,7 @@ static struct file_system_type minix_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("minix");
 
 static int __init init_minix_fs(void)
 {
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 0db73d9dd66..cd950e2331b 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -54,6 +54,18 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode,
 	return error;
 }
 
+static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int error;
+	struct inode *inode = minix_new_inode(dir, mode, &error);
+	if (inode) {
+		minix_set_inode(inode, 0);
+		mark_inode_dirty(inode);
+		d_tmpfile(dentry, inode);
+	}
+	return error;
+}
+
 static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		bool excl)
 {
@@ -254,4 +266,5 @@ const struct inode_operations minix_dir_inode_operations = {
 	.mknod		= minix_mknod,
 	.rename		= minix_rename,
 	.getattr	= minix_getattr,
+	.tmpfile	= minix_tmpfile,
 };
diff --git a/fs/mount.h b/fs/mount.h
index cd500798040..64a858143ff 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -18,6 +18,12 @@ struct mnt_pcp {
 	int mnt_writers;
 };
 
+struct mountpoint {
+	struct list_head m_hash;
+	struct dentry *m_dentry;
+	int m_count;
+};
+
 struct mount {
 	struct list_head mnt_hash;
 	struct mount *mnt_parent;
@@ -40,6 +46,7 @@ struct mount {
 	struct list_head mnt_slave;	/* slave list entry */
 	struct mount *mnt_master;	/* slave is on master->mnt_slave_list */
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
+	struct mountpoint *mnt_mp;	/* where is it mounted */
 #ifdef CONFIG_FSNOTIFY
 	struct hlist_head mnt_fsnotify_marks;
 	__u32 mnt_fsnotify_mask;
diff --git a/fs/namei.c b/fs/namei.c
index 43a97ee1d4c..b2beee7a733 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -451,7 +451,7 @@ int inode_permission(struct inode *inode, int mask)
  *
  * Given a path increment the reference count to the dentry and the vfsmount.
  */
-void path_get(struct path *path)
+void path_get(const struct path *path)
 {
 	mntget(path->mnt);
 	dget(path->dentry);
@@ -464,7 +464,7 @@ EXPORT_SYMBOL(path_get);
  *
  * Given a path decrement the reference count to the dentry and the vfsmount.
  */
-void path_put(struct path *path)
+void path_put(const struct path *path)
 {
 	dput(path->dentry);
 	mntput(path->mnt);
@@ -600,14 +600,10 @@ static int complete_walk(struct nameidata *nd)
 	if (likely(!(nd->flags & LOOKUP_JUMPED)))
 		return 0;
 
-	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
+	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
 		return 0;
 
-	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
-		return 0;
-
-	/* Note: we do not d_invalidate() */
-	status = d_revalidate(dentry, nd->flags);
+	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
 	if (status > 0)
 		return 0;
 
@@ -693,8 +689,6 @@ void nd_jump_link(struct nameidata *nd, struct path *path)
 	nd->path = *path;
 	nd->inode = nd->path.dentry->d_inode;
 	nd->flags |= LOOKUP_JUMPED;
-
-	BUG_ON(nd->inode->i_op->follow_link);
 }
 
 static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
@@ -1342,7 +1336,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
  *  small and for now I'd prefer to have fast path as straight as possible.
  *  It _is_ time-critical.
  */
-static int lookup_fast(struct nameidata *nd, struct qstr *name,
+static int lookup_fast(struct nameidata *nd,
 		       struct path *path, struct inode **inode)
 {
 	struct vfsmount *mnt = nd->path.mnt;
@@ -1358,7 +1352,7 @@ static int lookup_fast(struct nameidata *nd, struct qstr *name,
 	 */
 	if (nd->flags & LOOKUP_RCU) {
 		unsigned seq;
-		dentry = __d_lookup_rcu(parent, name, &seq, nd->inode);
+		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
 		if (!dentry)
 			goto unlazy;
 
@@ -1400,7 +1394,7 @@ unlazy:
 		if (unlazy_walk(nd, dentry))
 			return -ECHILD;
 	} else {
-		dentry = __d_lookup(parent, name);
+		dentry = __d_lookup(parent, &nd->last);
 	}
 
 	if (unlikely(!dentry))
@@ -1436,8 +1430,7 @@ need_lookup:
 }
 
 /* Fast lookup failed, do it the slow way */
-static int lookup_slow(struct nameidata *nd, struct qstr *name,
-		       struct path *path)
+static int lookup_slow(struct nameidata *nd, struct path *path)
 {
 	struct dentry *dentry, *parent;
 	int err;
@@ -1446,7 +1439,7 @@ static int lookup_slow(struct nameidata *nd, struct qstr *name,
 	BUG_ON(nd->inode != parent->d_inode);
 
 	mutex_lock(&parent->d_inode->i_mutex);
-	dentry = __lookup_hash(name, parent, nd->flags);
+	dentry = __lookup_hash(&nd->last, parent, nd->flags);
 	mutex_unlock(&parent->d_inode->i_mutex);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
@@ -1519,7 +1512,7 @@ static inline int should_follow_link(struct inode *inode, int follow)
 }
 
 static inline int walk_component(struct nameidata *nd, struct path *path,
-		struct qstr *name, int type, int follow)
+		int follow)
 {
 	struct inode *inode;
 	int err;
@@ -1528,14 +1521,14 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
 	 * to be able to know about the current root directory and
 	 * parent relationships.
 	 */
-	if (unlikely(type != LAST_NORM))
-		return handle_dots(nd, type);
-	err = lookup_fast(nd, name, path, &inode);
+	if (unlikely(nd->last_type != LAST_NORM))
+		return handle_dots(nd, nd->last_type);
+	err = lookup_fast(nd, path, &inode);
 	if (unlikely(err)) {
 		if (err < 0)
 			goto out_err;
 
-		err = lookup_slow(nd, name, path);
+		err = lookup_slow(nd, path);
 		if (err < 0)
 			goto out_err;
 
@@ -1594,8 +1587,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
 		res = follow_link(&link, nd, &cookie);
 		if (res)
 			break;
-		res = walk_component(nd, path, &nd->last,
-				     nd->last_type, LOOKUP_FOLLOW);
+		res = walk_component(nd, path, LOOKUP_FOLLOW);
 		put_link(nd, &link, cookie);
 	} while (res > 0);
 
@@ -1795,15 +1787,17 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			struct dentry *parent = nd->path.dentry;
 			nd->flags &= ~LOOKUP_JUMPED;
 			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
-				err = parent->d_op->d_hash(parent, nd->inode,
-							   &this);
+				err = parent->d_op->d_hash(parent, &this);
 				if (err < 0)
 					break;
 			}
 		}
 
+		nd->last = this;
+		nd->last_type = type;
+
 		if (!name[len])
-			goto last_component;
+			return 0;
 		/*
 		 * If it wasn't NUL, we know it was '/'. Skip that
 		 * slash, and continue until no more slashes.
@@ -1812,10 +1806,11 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			len++;
 		} while (unlikely(name[len] == '/'));
 		if (!name[len])
-			goto last_component;
+			return 0;
+
 		name += len;
 
-		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
+		err = walk_component(nd, &next, LOOKUP_FOLLOW);
 		if (err < 0)
 			return err;
 
@@ -1824,16 +1819,10 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			if (err)
 				return err;
 		}
-		if (can_lookup(nd->inode))
-			continue;
-		err = -ENOTDIR; 
-		break;
-		/* here ends the main loop */
-
-last_component:
-		nd->last = this;
-		nd->last_type = type;
-		return 0;
+		if (!can_lookup(nd->inode)) {
+			err = -ENOTDIR; 
+			break;
+		}
 	}
 	terminate_walk(nd);
 	return err;
@@ -1932,8 +1921,7 @@ static inline int lookup_last(struct nameidata *nd, struct path *path)
 		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 
 	nd->flags &= ~LOOKUP_PARENT;
-	return walk_component(nd, path, &nd->last, nd->last_type,
-					nd->flags & LOOKUP_FOLLOW);
+	return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW);
 }
 
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
@@ -1987,7 +1975,7 @@ static int path_lookupat(int dfd, const char *name,
 		err = complete_walk(nd);
 
 	if (!err && nd->flags & LOOKUP_DIRECTORY) {
-		if (!nd->inode->i_op->lookup) {
+		if (!can_lookup(nd->inode)) {
 			path_put(&nd->path);
 			err = -ENOTDIR;
 		}
@@ -2132,7 +2120,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 	 * to use its own hash..
 	 */
 	if (base->d_flags & DCACHE_OP_HASH) {
-		int err = base->d_op->d_hash(base, base->d_inode, &this);
+		int err = base->d_op->d_hash(base, &this);
 		if (err < 0)
 			return ERR_PTR(err);
 	}
@@ -2701,28 +2689,10 @@ static int do_last(struct nameidata *nd, struct path *path,
 	nd->flags &= ~LOOKUP_PARENT;
 	nd->flags |= op->intent;
 
-	switch (nd->last_type) {
-	case LAST_DOTDOT:
-	case LAST_DOT:
+	if (nd->last_type != LAST_NORM) {
 		error = handle_dots(nd, nd->last_type);
 		if (error)
 			return error;
-		/* fallthrough */
-	case LAST_ROOT:
-		error = complete_walk(nd);
-		if (error)
-			return error;
-		audit_inode(name, nd->path.dentry, 0);
-		if (open_flag & O_CREAT) {
-			error = -EISDIR;
-			goto out;
-		}
-		goto finish_open;
-	case LAST_BIND:
-		error = complete_walk(nd);
-		if (error)
-			return error;
-		audit_inode(name, dir, 0);
 		goto finish_open;
 	}
 
@@ -2732,7 +2702,7 @@ static int do_last(struct nameidata *nd, struct path *path,
 		if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
 			symlink_ok = true;
 		/* we _can_ be in RCU mode here */
-		error = lookup_fast(nd, &nd->last, path, &inode);
+		error = lookup_fast(nd, path, &inode);
 		if (likely(!error))
 			goto finish_lookup;
 
@@ -2751,7 +2721,7 @@ static int do_last(struct nameidata *nd, struct path *path,
 		if (error)
 			return error;
 
-		audit_inode(name, dir, 0);
+		audit_inode(name, dir, LOOKUP_PARENT);
 		error = -EISDIR;
 		/* trailing slashes? */
 		if (nd->last.name[nd->last.len])
@@ -2778,7 +2748,7 @@ retry_lookup:
 			goto out;
 
 		if ((*opened & FILE_CREATED) ||
-		    !S_ISREG(file->f_path.dentry->d_inode->i_mode))
+		    !S_ISREG(file_inode(file)->i_mode))
 			will_truncate = false;
 
 		audit_inode(name, file->f_path.dentry, 0);
@@ -2852,19 +2822,19 @@ finish_lookup:
 	}
 	nd->inode = inode;
 	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
+finish_open:
 	error = complete_walk(nd);
 	if (error) {
 		path_put(&save_parent);
 		return error;
 	}
+	audit_inode(name, nd->path.dentry, 0);
 	error = -EISDIR;
 	if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
 		goto out;
 	error = -ENOTDIR;
-	if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup)
+	if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode))
 		goto out;
-	audit_inode(name, nd->path.dentry, 0);
-finish_open:
 	if (!S_ISREG(nd->inode->i_mode))
 		will_truncate = false;
 
@@ -2931,6 +2901,67 @@ stale_open:
 	goto retry_lookup;
 }
 
+static int do_tmpfile(int dfd, struct filename *pathname,
+		struct nameidata *nd, int flags,
+		const struct open_flags *op,
+		struct file *file, int *opened)
+{
+	static const struct qstr name = QSTR_INIT("/", 1);
+	struct dentry *dentry, *child;
+	struct inode *dir;
+	int error = path_lookupat(dfd, pathname->name,
+				  flags | LOOKUP_DIRECTORY, nd);
+	if (unlikely(error))
+		return error;
+	error = mnt_want_write(nd->path.mnt);
+	if (unlikely(error))
+		goto out;
+	/* we want directory to be writable */
+	error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC);
+	if (error)
+		goto out2;
+	dentry = nd->path.dentry;
+	dir = dentry->d_inode;
+	if (!dir->i_op->tmpfile) {
+		error = -EOPNOTSUPP;
+		goto out2;
+	}
+	child = d_alloc(dentry, &name);
+	if (unlikely(!child)) {
+		error = -ENOMEM;
+		goto out2;
+	}
+	nd->flags &= ~LOOKUP_DIRECTORY;
+	nd->flags |= op->intent;
+	dput(nd->path.dentry);
+	nd->path.dentry = child;
+	error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode);
+	if (error)
+		goto out2;
+	audit_inode(pathname, nd->path.dentry, 0);
+	error = may_open(&nd->path, op->acc_mode, op->open_flag);
+	if (error)
+		goto out2;
+	file->f_path.mnt = nd->path.mnt;
+	error = finish_open(file, nd->path.dentry, NULL, opened);
+	if (error)
+		goto out2;
+	error = open_check_o_direct(file);
+	if (error) {
+		fput(file);
+	} else if (!(op->open_flag & O_EXCL)) {
+		struct inode *inode = file_inode(file);
+		spin_lock(&inode->i_lock);
+		inode->i_state |= I_LINKABLE;
+		spin_unlock(&inode->i_lock);
+	}
+out2:
+	mnt_drop_write(nd->path.mnt);
+out:
+	path_put(&nd->path);
+	return error;
+}
+
 static struct file *path_openat(int dfd, struct filename *pathname,
 		struct nameidata *nd, const struct open_flags *op, int flags)
 {
@@ -2941,11 +2972,16 @@ static struct file *path_openat(int dfd, struct filename *pathname,
 	int error;
 
 	file = get_empty_filp();
-	if (!file)
-		return ERR_PTR(-ENFILE);
+	if (IS_ERR(file))
+		return file;
 
 	file->f_flags = op->open_flag;
 
+	if (unlikely(file->f_flags & O_TMPFILE)) {
+		error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
+		goto out;
+	}
+
 	error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
 	if (unlikely(error))
 		goto out;
@@ -2998,9 +3034,10 @@ out:
 }
 
 struct file *do_filp_open(int dfd, struct filename *pathname,
-		const struct open_flags *op, int flags)
+		const struct open_flags *op)
 {
 	struct nameidata nd;
+	int flags = op->lookup_flags;
 	struct file *filp;
 
 	filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
@@ -3012,17 +3049,16 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
 }
 
 struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
-		const char *name, const struct open_flags *op, int flags)
+		const char *name, const struct open_flags *op)
 {
 	struct nameidata nd;
 	struct file *file;
 	struct filename filename = { .name = name };
+	int flags = op->lookup_flags | LOOKUP_ROOT;
 
 	nd.root.mnt = mnt;
 	nd.root.dentry = dentry;
 
-	flags |= LOOKUP_ROOT;
-
 	if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
 		return ERR_PTR(-ELOOP);
 
@@ -3597,12 +3633,18 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 
 	mutex_lock(&inode->i_mutex);
 	/* Make sure we don't allow creating hardlink to an unlinked file */
-	if (inode->i_nlink == 0)
+	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
 		error =  -ENOENT;
 	else if (max_links && inode->i_nlink >= max_links)
 		error = -EMLINK;
 	else
 		error = dir->i_op->link(old_dentry, dir, new_dentry);
+
+	if (!error && (inode->i_state & I_LINKABLE)) {
+		spin_lock(&inode->i_lock);
+		inode->i_state &= ~I_LINKABLE;
+		spin_unlock(&inode->i_lock);
+	}
 	mutex_unlock(&inode->i_mutex);
 	if (!error)
 		fsnotify_link(dir, inode, new_dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index 55605c55278..7b1ca9ba0b0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -21,7 +21,8 @@
 #include <linux/fs_struct.h>	/* get_fs_root et.al. */
 #include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
 #include <linux/uaccess.h>
-#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
+#include <linux/magic.h>
 #include "pnode.h"
 #include "internal.h"
 
@@ -36,6 +37,7 @@ static int mnt_id_start = 0;
 static int mnt_group_start = 1;
 
 static struct list_head *mount_hashtable __read_mostly;
+static struct list_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
 static struct rw_semaphore namespace_sem;
 
@@ -384,7 +386,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write);
  */
 int __mnt_want_write_file(struct file *file)
 {
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 
 	if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
 		return __mnt_want_write(file->f_path.mnt);
@@ -605,6 +607,51 @@ struct vfsmount *lookup_mnt(struct path *path)
 	}
 }
 
+static struct mountpoint *new_mountpoint(struct dentry *dentry)
+{
+	struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry);
+	struct mountpoint *mp;
+
+	list_for_each_entry(mp, chain, m_hash) {
+		if (mp->m_dentry == dentry) {
+			/* might be worth a WARN_ON() */
+			if (d_unlinked(dentry))
+				return ERR_PTR(-ENOENT);
+			mp->m_count++;
+			return mp;
+		}
+	}
+
+	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+	if (!mp)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock(&dentry->d_lock);
+	if (d_unlinked(dentry)) {
+		spin_unlock(&dentry->d_lock);
+		kfree(mp);
+		return ERR_PTR(-ENOENT);
+	}
+	dentry->d_flags |= DCACHE_MOUNTED;
+	spin_unlock(&dentry->d_lock);
+	mp->m_dentry = dentry;
+	mp->m_count = 1;
+	list_add(&mp->m_hash, chain);
+	return mp;
+}
+
+static void put_mountpoint(struct mountpoint *mp)
+{
+	if (!--mp->m_count) {
+		struct dentry *dentry = mp->m_dentry;
+		spin_lock(&dentry->d_lock);
+		dentry->d_flags &= ~DCACHE_MOUNTED;
+		spin_unlock(&dentry->d_lock);
+		list_del(&mp->m_hash);
+		kfree(mp);
+	}
+}
+
 static inline int check_mnt(struct mount *mnt)
 {
 	return mnt->mnt_ns == current->nsproxy->mnt_ns;
@@ -633,27 +680,6 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
 }
 
 /*
- * Clear dentry's mounted state if it has no remaining mounts.
- * vfsmount_lock must be held for write.
- */
-static void dentry_reset_mounted(struct dentry *dentry)
-{
-	unsigned u;
-
-	for (u = 0; u < HASH_SIZE; u++) {
-		struct mount *p;
-
-		list_for_each_entry(p, &mount_hashtable[u], mnt_hash) {
-			if (p->mnt_mountpoint == dentry)
-				return;
-		}
-	}
-	spin_lock(&dentry->d_lock);
-	dentry->d_flags &= ~DCACHE_MOUNTED;
-	spin_unlock(&dentry->d_lock);
-}
-
-/*
  * vfsmount lock must be held for write
  */
 static void detach_mnt(struct mount *mnt, struct path *old_path)
@@ -664,32 +690,35 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 	list_del_init(&mnt->mnt_child);
 	list_del_init(&mnt->mnt_hash);
-	dentry_reset_mounted(old_path->dentry);
+	put_mountpoint(mnt->mnt_mp);
+	mnt->mnt_mp = NULL;
 }
 
 /*
  * vfsmount lock must be held for write
  */
-void mnt_set_mountpoint(struct mount *mnt, struct dentry *dentry,
+void mnt_set_mountpoint(struct mount *mnt,
+			struct mountpoint *mp,
 			struct mount *child_mnt)
 {
+	mp->m_count++;
 	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
-	child_mnt->mnt_mountpoint = dget(dentry);
+	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
 	child_mnt->mnt_parent = mnt;
-	spin_lock(&dentry->d_lock);
-	dentry->d_flags |= DCACHE_MOUNTED;
-	spin_unlock(&dentry->d_lock);
+	child_mnt->mnt_mp = mp;
 }
 
 /*
  * vfsmount lock must be held for write
  */
-static void attach_mnt(struct mount *mnt, struct path *path)
+static void attach_mnt(struct mount *mnt,
+			struct mount *parent,
+			struct mountpoint *mp)
 {
-	mnt_set_mountpoint(real_mount(path->mnt), path->dentry, mnt);
+	mnt_set_mountpoint(parent, mp, mnt);
 	list_add_tail(&mnt->mnt_hash, mount_hashtable +
-			hash(path->mnt, path->dentry));
-	list_add_tail(&mnt->mnt_child, &real_mount(path->mnt)->mnt_mounts);
+			hash(&parent->mnt, mp->m_dentry));
+	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
 }
 
 /*
@@ -798,6 +827,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	}
 
 	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD;
+	/* Don't allow unprivileged users to change mount flags */
+	if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY))
+		mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
+
 	atomic_inc(&sb->s_active);
 	mnt->mnt.mnt_sb = sb;
 	mnt->mnt.mnt_root = dget(root);
@@ -1091,11 +1124,23 @@ int may_umount(struct vfsmount *mnt)
 
 EXPORT_SYMBOL(may_umount);
 
-void release_mounts(struct list_head *head)
+static LIST_HEAD(unmounted);	/* protected by namespace_sem */
+
+static void namespace_unlock(void)
 {
 	struct mount *mnt;
-	while (!list_empty(head)) {
-		mnt = list_first_entry(head, struct mount, mnt_hash);
+	LIST_HEAD(head);
+
+	if (likely(list_empty(&unmounted))) {
+		up_write(&namespace_sem);
+		return;
+	}
+
+	list_splice_init(&unmounted, &head);
+	up_write(&namespace_sem);
+
+	while (!list_empty(&head)) {
+		mnt = list_first_entry(&head, struct mount, mnt_hash);
 		list_del_init(&mnt->mnt_hash);
 		if (mnt_has_parent(mnt)) {
 			struct dentry *dentry;
@@ -1115,11 +1160,16 @@ void release_mounts(struct list_head *head)
 	}
 }
 
+static inline void namespace_lock(void)
+{
+	down_write(&namespace_sem);
+}
+
 /*
  * vfsmount lock must be held for write
  * namespace_sem must be held for write
  */
-void umount_tree(struct mount *mnt, int propagate, struct list_head *kill)
+void umount_tree(struct mount *mnt, int propagate)
 {
 	LIST_HEAD(tmp_list);
 	struct mount *p;
@@ -1138,20 +1188,20 @@ void umount_tree(struct mount *mnt, int propagate, struct list_head *kill)
 		list_del_init(&p->mnt_child);
 		if (mnt_has_parent(p)) {
 			p->mnt_parent->mnt_ghosts++;
-			dentry_reset_mounted(p->mnt_mountpoint);
+			put_mountpoint(p->mnt_mp);
+			p->mnt_mp = NULL;
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
 	}
-	list_splice(&tmp_list, kill);
+	list_splice(&tmp_list, &unmounted);
 }
 
-static void shrink_submounts(struct mount *mnt, struct list_head *umounts);
+static void shrink_submounts(struct mount *mnt);
 
 static int do_umount(struct mount *mnt, int flags)
 {
 	struct super_block *sb = mnt->mnt.mnt_sb;
 	int retval;
-	LIST_HEAD(umount_list);
 
 	retval = security_sb_umount(&mnt->mnt, flags);
 	if (retval)
@@ -1218,25 +1268,32 @@ static int do_umount(struct mount *mnt, int flags)
 		return retval;
 	}
 
-	down_write(&namespace_sem);
+	namespace_lock();
 	br_write_lock(&vfsmount_lock);
 	event++;
 
 	if (!(flags & MNT_DETACH))
-		shrink_submounts(mnt, &umount_list);
+		shrink_submounts(mnt);
 
 	retval = -EBUSY;
 	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
 		if (!list_empty(&mnt->mnt_list))
-			umount_tree(mnt, 1, &umount_list);
+			umount_tree(mnt, 1);
 		retval = 0;
 	}
 	br_write_unlock(&vfsmount_lock);
-	up_write(&namespace_sem);
-	release_mounts(&umount_list);
+	namespace_unlock();
 	return retval;
 }
 
+/* 
+ * Is the caller allowed to modify his namespace?
+ */
+static inline bool may_mount(void)
+{
+	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
+}
+
 /*
  * Now umount can handle mount points as well as block devices.
  * This is important for filesystems which use unnamed block devices.
@@ -1255,6 +1312,9 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
 	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
 		return -EINVAL;
 
+	if (!may_mount())
+		return -EPERM;
+
 	if (!(flags & UMOUNT_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
 
@@ -1268,10 +1328,6 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
 	if (!check_mnt(mnt))
 		goto dput_and_out;
 
-	retval = -EPERM;
-	if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
-		goto dput_and_out;
-
 	retval = do_umount(mnt, flags);
 dput_and_out:
 	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
@@ -1293,37 +1349,19 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
 
 #endif
 
-static int mount_is_safe(struct path *path)
-{
-	if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
-		return 0;
-	return -EPERM;
-#ifdef notyet
-	if (S_ISLNK(path->dentry->d_inode->i_mode))
-		return -EPERM;
-	if (path->dentry->d_inode->i_mode & S_ISVTX) {
-		if (current_uid() != path->dentry->d_inode->i_uid)
-			return -EPERM;
-	}
-	if (inode_permission(path->dentry->d_inode, MAY_WRITE))
-		return -EPERM;
-	return 0;
-#endif
-}
-
 static bool mnt_ns_loop(struct path *path)
 {
 	/* Could bind mounting the mount namespace inode cause a
 	 * mount namespace loop?
 	 */
 	struct inode *inode = path->dentry->d_inode;
-	struct proc_inode *ei;
+	struct proc_ns *ei;
 	struct mnt_namespace *mnt_ns;
 
 	if (!proc_ns_inode(inode))
 		return false;
 
-	ei = PROC_I(inode);
+	ei = get_proc_ns(inode);
 	if (ei->ns_ops != &mntns_operations)
 		return false;
 
@@ -1334,8 +1372,7 @@ static bool mnt_ns_loop(struct path *path)
 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 					int flag)
 {
-	struct mount *res, *p, *q, *r;
-	struct path path;
+	struct mount *res, *p, *q, *r, *parent;
 
 	if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
 		return ERR_PTR(-EINVAL);
@@ -1362,25 +1399,22 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 				q = q->mnt_parent;
 			}
 			p = s;
-			path.mnt = &q->mnt;
-			path.dentry = p->mnt_mountpoint;
+			parent = q;
 			q = clone_mnt(p, p->mnt.mnt_root, flag);
 			if (IS_ERR(q))
 				goto out;
 			br_write_lock(&vfsmount_lock);
 			list_add_tail(&q->mnt_list, &res->mnt_list);
-			attach_mnt(q, &path);
+			attach_mnt(q, parent, p->mnt_mp);
 			br_write_unlock(&vfsmount_lock);
 		}
 	}
 	return res;
 out:
 	if (res) {
-		LIST_HEAD(umount_list);
 		br_write_lock(&vfsmount_lock);
-		umount_tree(res, 0, &umount_list);
+		umount_tree(res, 0);
 		br_write_unlock(&vfsmount_lock);
-		release_mounts(&umount_list);
 	}
 	return q;
 }
@@ -1390,10 +1424,10 @@ out:
 struct vfsmount *collect_mounts(struct path *path)
 {
 	struct mount *tree;
-	down_write(&namespace_sem);
+	namespace_lock();
 	tree = copy_tree(real_mount(path->mnt), path->dentry,
 			 CL_COPY_ALL | CL_PRIVATE);
-	up_write(&namespace_sem);
+	namespace_unlock();
 	if (IS_ERR(tree))
 		return NULL;
 	return &tree->mnt;
@@ -1401,13 +1435,11 @@ struct vfsmount *collect_mounts(struct path *path)
 
 void drop_collected_mounts(struct vfsmount *mnt)
 {
-	LIST_HEAD(umount_list);
-	down_write(&namespace_sem);
+	namespace_lock();
 	br_write_lock(&vfsmount_lock);
-	umount_tree(real_mount(mnt), 0, &umount_list);
+	umount_tree(real_mount(mnt), 0);
 	br_write_unlock(&vfsmount_lock);
-	up_write(&namespace_sem);
-	release_mounts(&umount_list);
+	namespace_unlock();
 }
 
 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
@@ -1516,11 +1548,11 @@ static int invent_group_ids(struct mount *mnt, bool recurse)
  * in allocations.
  */
 static int attach_recursive_mnt(struct mount *source_mnt,
-			struct path *path, struct path *parent_path)
+			struct mount *dest_mnt,
+			struct mountpoint *dest_mp,
+			struct path *parent_path)
 {
 	LIST_HEAD(tree_list);
-	struct mount *dest_mnt = real_mount(path->mnt);
-	struct dentry *dest_dentry = path->dentry;
 	struct mount *child, *p;
 	int err;
 
@@ -1529,7 +1561,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		if (err)
 			goto out;
 	}
-	err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
+	err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
 	if (err)
 		goto out_cleanup_ids;
 
@@ -1541,10 +1573,10 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	}
 	if (parent_path) {
 		detach_mnt(source_mnt, parent_path);
-		attach_mnt(source_mnt, path);
+		attach_mnt(source_mnt, dest_mnt, dest_mp);
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
-		mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
+		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
 		commit_tree(source_mnt);
 	}
 
@@ -1563,46 +1595,53 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	return err;
 }
 
-static int lock_mount(struct path *path)
+static struct mountpoint *lock_mount(struct path *path)
 {
 	struct vfsmount *mnt;
+	struct dentry *dentry = path->dentry;
 retry:
-	mutex_lock(&path->dentry->d_inode->i_mutex);
-	if (unlikely(cant_mount(path->dentry))) {
-		mutex_unlock(&path->dentry->d_inode->i_mutex);
-		return -ENOENT;
+	mutex_lock(&dentry->d_inode->i_mutex);
+	if (unlikely(cant_mount(dentry))) {
+		mutex_unlock(&dentry->d_inode->i_mutex);
+		return ERR_PTR(-ENOENT);
 	}
-	down_write(&namespace_sem);
+	namespace_lock();
 	mnt = lookup_mnt(path);
-	if (likely(!mnt))
-		return 0;
-	up_write(&namespace_sem);
+	if (likely(!mnt)) {
+		struct mountpoint *mp = new_mountpoint(dentry);
+		if (IS_ERR(mp)) {
+			namespace_unlock();
+			mutex_unlock(&dentry->d_inode->i_mutex);
+			return mp;
+		}
+		return mp;
+	}
+	namespace_unlock();
 	mutex_unlock(&path->dentry->d_inode->i_mutex);
 	path_put(path);
 	path->mnt = mnt;
-	path->dentry = dget(mnt->mnt_root);
+	dentry = path->dentry = dget(mnt->mnt_root);
 	goto retry;
 }
 
-static void unlock_mount(struct path *path)
+static void unlock_mount(struct mountpoint *where)
 {
-	up_write(&namespace_sem);
-	mutex_unlock(&path->dentry->d_inode->i_mutex);
+	struct dentry *dentry = where->m_dentry;
+	put_mountpoint(where);
+	namespace_unlock();
+	mutex_unlock(&dentry->d_inode->i_mutex);
 }
 
-static int graft_tree(struct mount *mnt, struct path *path)
+static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
 {
 	if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
 		return -EINVAL;
 
-	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
+	if (S_ISDIR(mp->m_dentry->d_inode->i_mode) !=
 	      S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
 		return -ENOTDIR;
 
-	if (d_unlinked(path->dentry))
-		return -ENOENT;
-
-	return attach_recursive_mnt(mnt, path, NULL);
+	return attach_recursive_mnt(mnt, p, mp, NULL);
 }
 
 /*
@@ -1633,9 +1672,6 @@ static int do_change_type(struct path *path, int flag)
 	int type;
 	int err = 0;
 
-	if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
-		return -EPERM;
-
 	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
@@ -1643,7 +1679,7 @@ static int do_change_type(struct path *path, int flag)
 	if (!type)
 		return -EINVAL;
 
-	down_write(&namespace_sem);
+	namespace_lock();
 	if (type == MS_SHARED) {
 		err = invent_group_ids(mnt, recurse);
 		if (err)
@@ -1656,7 +1692,7 @@ static int do_change_type(struct path *path, int flag)
 	br_write_unlock(&vfsmount_lock);
 
  out_unlock:
-	up_write(&namespace_sem);
+	namespace_unlock();
 	return err;
 }
 
@@ -1666,12 +1702,10 @@ static int do_change_type(struct path *path, int flag)
 static int do_loopback(struct path *path, const char *old_name,
 				int recurse)
 {
-	LIST_HEAD(umount_list);
 	struct path old_path;
-	struct mount *mnt = NULL, *old;
-	int err = mount_is_safe(path);
-	if (err)
-		return err;
+	struct mount *mnt = NULL, *old, *parent;
+	struct mountpoint *mp;
+	int err;
 	if (!old_name || !*old_name)
 		return -EINVAL;
 	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
@@ -1682,17 +1716,19 @@ static int do_loopback(struct path *path, const char *old_name,
 	if (mnt_ns_loop(&old_path))
 		goto out; 
 
-	err = lock_mount(path);
-	if (err)
+	mp = lock_mount(path);
+	err = PTR_ERR(mp);
+	if (IS_ERR(mp))
 		goto out;
 
 	old = real_mount(old_path.mnt);
+	parent = real_mount(path->mnt);
 
 	err = -EINVAL;
 	if (IS_MNT_UNBINDABLE(old))
 		goto out2;
 
-	if (!check_mnt(real_mount(path->mnt)) || !check_mnt(old))
+	if (!check_mnt(parent) || !check_mnt(old))
 		goto out2;
 
 	if (recurse)
@@ -1702,18 +1738,17 @@ static int do_loopback(struct path *path, const char *old_name,
 
 	if (IS_ERR(mnt)) {
 		err = PTR_ERR(mnt);
-		goto out;
+		goto out2;
 	}
 
-	err = graft_tree(mnt, path);
+	err = graft_tree(mnt, parent, mp);
 	if (err) {
 		br_write_lock(&vfsmount_lock);
-		umount_tree(mnt, 0, &umount_list);
+		umount_tree(mnt, 0);
 		br_write_unlock(&vfsmount_lock);
 	}
 out2:
-	unlock_mount(path);
-	release_mounts(&umount_list);
+	unlock_mount(mp);
 out:
 	path_put(&old_path);
 	return err;
@@ -1729,6 +1764,9 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
 	if (readonly_request == __mnt_is_readonly(mnt))
 		return 0;
 
+	if (mnt->mnt_flags & MNT_LOCK_READONLY)
+		return -EPERM;
+
 	if (readonly_request)
 		error = mnt_make_readonly(real_mount(mnt));
 	else
@@ -1748,9 +1786,6 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	struct super_block *sb = path->mnt->mnt_sb;
 	struct mount *mnt = real_mount(path->mnt);
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	if (!check_mnt(mnt))
 		return -EINVAL;
 
@@ -1764,6 +1799,8 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	down_write(&sb->s_umount);
 	if (flags & MS_BIND)
 		err = change_mount_flags(path->mnt, flags);
+	else if (!capable(CAP_SYS_ADMIN))
+		err = -EPERM;
 	else
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err) {
@@ -1796,17 +1833,17 @@ static int do_move_mount(struct path *path, const char *old_name)
 	struct path old_path, parent_path;
 	struct mount *p;
 	struct mount *old;
-	int err = 0;
-	if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
-		return -EPERM;
+	struct mountpoint *mp;
+	int err;
 	if (!old_name || !*old_name)
 		return -EINVAL;
 	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
 	if (err)
 		return err;
 
-	err = lock_mount(path);
-	if (err < 0)
+	mp = lock_mount(path);
+	err = PTR_ERR(mp);
+	if (IS_ERR(mp))
 		goto out;
 
 	old = real_mount(old_path.mnt);
@@ -1816,9 +1853,6 @@ static int do_move_mount(struct path *path, const char *old_name)
 	if (!check_mnt(p) || !check_mnt(old))
 		goto out1;
 
-	if (d_unlinked(path->dentry))
-		goto out1;
-
 	err = -EINVAL;
 	if (old_path.dentry != old_path.mnt->mnt_root)
 		goto out1;
@@ -1845,7 +1879,7 @@ static int do_move_mount(struct path *path, const char *old_name)
 		if (p == old)
 			goto out1;
 
-	err = attach_recursive_mnt(old, path, &parent_path);
+	err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
 	if (err)
 		goto out1;
 
@@ -1853,7 +1887,7 @@ static int do_move_mount(struct path *path, const char *old_name)
 	 * automatically */
 	list_del_init(&old->mnt_expire);
 out1:
-	unlock_mount(path);
+	unlock_mount(mp);
 out:
 	if (!err)
 		path_put(&parent_path);
@@ -1889,21 +1923,24 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
  */
 static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 {
+	struct mountpoint *mp;
+	struct mount *parent;
 	int err;
 
 	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
 
-	err = lock_mount(path);
-	if (err)
-		return err;
+	mp = lock_mount(path);
+	if (IS_ERR(mp))
+		return PTR_ERR(mp);
 
+	parent = real_mount(path->mnt);
 	err = -EINVAL;
-	if (unlikely(!check_mnt(real_mount(path->mnt)))) {
+	if (unlikely(!check_mnt(parent))) {
 		/* that's acceptable only for automounts done in private ns */
 		if (!(mnt_flags & MNT_SHRINKABLE))
 			goto unlock;
 		/* ... and for those we'd better have mountpoint still alive */
-		if (!real_mount(path->mnt)->mnt_ns)
+		if (!parent->mnt_ns)
 			goto unlock;
 	}
 
@@ -1918,10 +1955,10 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 		goto unlock;
 
 	newmnt->mnt.mnt_flags = mnt_flags;
-	err = graft_tree(newmnt, path);
+	err = graft_tree(newmnt, parent, mp);
 
 unlock:
-	unlock_mount(path);
+	unlock_mount(mp);
 	return err;
 }
 
@@ -1933,18 +1970,13 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 			int mnt_flags, const char *name, void *data)
 {
 	struct file_system_type *type;
-	struct user_namespace *user_ns;
+	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
 	struct vfsmount *mnt;
 	int err;
 
 	if (!fstype)
 		return -EINVAL;
 
-	/* we need capabilities... */
-	user_ns = real_mount(path->mnt)->mnt_ns->user_ns;
-	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
-		return -EPERM;
-
 	type = get_fs_type(fstype);
 	if (!type)
 		return -ENODEV;
@@ -1999,11 +2031,11 @@ int finish_automount(struct vfsmount *m, struct path *path)
 fail:
 	/* remove m from any expiration list it may be on */
 	if (!list_empty(&mnt->mnt_expire)) {
-		down_write(&namespace_sem);
+		namespace_lock();
 		br_write_lock(&vfsmount_lock);
 		list_del_init(&mnt->mnt_expire);
 		br_write_unlock(&vfsmount_lock);
-		up_write(&namespace_sem);
+		namespace_unlock();
 	}
 	mntput(m);
 	mntput(m);
@@ -2017,13 +2049,13 @@ fail:
  */
 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
 {
-	down_write(&namespace_sem);
+	namespace_lock();
 	br_write_lock(&vfsmount_lock);
 
 	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
 
 	br_write_unlock(&vfsmount_lock);
-	up_write(&namespace_sem);
+	namespace_unlock();
 }
 EXPORT_SYMBOL(mnt_set_expiry);
 
@@ -2036,12 +2068,11 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 {
 	struct mount *mnt, *next;
 	LIST_HEAD(graveyard);
-	LIST_HEAD(umounts);
 
 	if (list_empty(mounts))
 		return;
 
-	down_write(&namespace_sem);
+	namespace_lock();
 	br_write_lock(&vfsmount_lock);
 
 	/* extract from the expiration list every vfsmount that matches the
@@ -2059,12 +2090,10 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 	while (!list_empty(&graveyard)) {
 		mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
 		touch_mnt_namespace(mnt->mnt_ns);
-		umount_tree(mnt, 1, &umounts);
+		umount_tree(mnt, 1);
 	}
 	br_write_unlock(&vfsmount_lock);
-	up_write(&namespace_sem);
-
-	release_mounts(&umounts);
+	namespace_unlock();
 }
 
 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
@@ -2121,7 +2150,7 @@ resume:
  *
  * vfsmount_lock must be held for write
  */
-static void shrink_submounts(struct mount *mnt, struct list_head *umounts)
+static void shrink_submounts(struct mount *mnt)
 {
 	LIST_HEAD(graveyard);
 	struct mount *m;
@@ -2132,7 +2161,7 @@ static void shrink_submounts(struct mount *mnt, struct list_head *umounts)
 			m = list_first_entry(&graveyard, struct mount,
 						mnt_expire);
 			touch_mnt_namespace(m->mnt_ns);
-			umount_tree(m, 1, umounts);
+			umount_tree(m, 1);
 		}
 	}
 }
@@ -2255,6 +2284,8 @@ long do_mount(const char *dev_name, const char *dir_name,
 
 	retval = security_sb_mount(dev_name, &path,
 				   type_page, flags, data_page);
+	if (!retval && !may_mount())
+		retval = -EPERM;
 	if (retval)
 		goto dput_out;
 
@@ -2356,14 +2387,14 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	if (IS_ERR(new_ns))
 		return new_ns;
 
-	down_write(&namespace_sem);
+	namespace_lock();
 	/* First pass: copy the tree topology */
 	copy_flags = CL_COPY_ALL | CL_EXPIRE;
 	if (user_ns != mnt_ns->user_ns)
-		copy_flags |= CL_SHARED_TO_SLAVE;
+		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
 	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
 	if (IS_ERR(new)) {
-		up_write(&namespace_sem);
+		namespace_unlock();
 		free_mnt_ns(new_ns);
 		return ERR_CAST(new);
 	}
@@ -2394,7 +2425,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 		p = next_mnt(p, old);
 		q = next_mnt(q, new);
 	}
-	up_write(&namespace_sem);
+	namespace_unlock();
 
 	if (rootmnt)
 		mntput(rootmnt);
@@ -2432,7 +2463,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
 		struct mount *mnt = real_mount(m);
 		mnt->mnt_ns = new_ns;
 		new_ns->root = mnt;
-		list_add(&new_ns->list, &mnt->mnt_list);
+		list_add(&mnt->mnt_list, &new_ns->list);
 	} else {
 		mntput(m);
 	}
@@ -2564,10 +2595,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		const char __user *, put_old)
 {
 	struct path new, old, parent_path, root_parent, root;
-	struct mount *new_mnt, *root_mnt;
+	struct mount *new_mnt, *root_mnt, *old_mnt;
+	struct mountpoint *old_mp, *root_mp;
 	int error;
 
-	if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
+	if (!may_mount())
 		return -EPERM;
 
 	error = user_path_dir(new_root, &new);
@@ -2583,14 +2615,16 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		goto out2;
 
 	get_fs_root(current->fs, &root);
-	error = lock_mount(&old);
-	if (error)
+	old_mp = lock_mount(&old);
+	error = PTR_ERR(old_mp);
+	if (IS_ERR(old_mp))
 		goto out3;
 
 	error = -EINVAL;
 	new_mnt = real_mount(new.mnt);
 	root_mnt = real_mount(root.mnt);
-	if (IS_MNT_SHARED(real_mount(old.mnt)) ||
+	old_mnt = real_mount(old.mnt);
+	if (IS_MNT_SHARED(old_mnt) ||
 		IS_MNT_SHARED(new_mnt->mnt_parent) ||
 		IS_MNT_SHARED(root_mnt->mnt_parent))
 		goto out4;
@@ -2599,37 +2633,37 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	error = -ENOENT;
 	if (d_unlinked(new.dentry))
 		goto out4;
-	if (d_unlinked(old.dentry))
-		goto out4;
 	error = -EBUSY;
-	if (new.mnt == root.mnt ||
-	    old.mnt == root.mnt)
+	if (new_mnt == root_mnt || old_mnt == root_mnt)
 		goto out4; /* loop, on the same file system  */
 	error = -EINVAL;
 	if (root.mnt->mnt_root != root.dentry)
 		goto out4; /* not a mountpoint */
 	if (!mnt_has_parent(root_mnt))
 		goto out4; /* not attached */
+	root_mp = root_mnt->mnt_mp;
 	if (new.mnt->mnt_root != new.dentry)
 		goto out4; /* not a mountpoint */
 	if (!mnt_has_parent(new_mnt))
 		goto out4; /* not attached */
 	/* make sure we can reach put_old from new_root */
-	if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new))
+	if (!is_path_reachable(old_mnt, old.dentry, &new))
 		goto out4;
+	root_mp->m_count++; /* pin it so it won't go away */
 	br_write_lock(&vfsmount_lock);
 	detach_mnt(new_mnt, &parent_path);
 	detach_mnt(root_mnt, &root_parent);
 	/* mount old root on put_old */
-	attach_mnt(root_mnt, &old);
+	attach_mnt(root_mnt, old_mnt, old_mp);
 	/* mount new_root on / */
-	attach_mnt(new_mnt, &root_parent);
+	attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	br_write_unlock(&vfsmount_lock);
 	chroot_fs_refs(&root, &new);
+	put_mountpoint(root_mp);
 	error = 0;
 out4:
-	unlock_mount(&old);
+	unlock_mount(old_mp);
 	if (!error) {
 		path_put(&root_parent);
 		path_put(&parent_path);
@@ -2684,14 +2718,17 @@ void __init mnt_init(void)
 			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
 
 	mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
+	mountpoint_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
 
-	if (!mount_hashtable)
+	if (!mount_hashtable || !mountpoint_hashtable)
 		panic("Failed to allocate mount hash table\n");
 
 	printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE);
 
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mount_hashtable[u]);
+	for (u = 0; u < HASH_SIZE; u++)
+		INIT_LIST_HEAD(&mountpoint_hashtable[u]);
 
 	br_lock_init(&vfsmount_lock);
 
@@ -2708,16 +2745,13 @@ void __init mnt_init(void)
 
 void put_mnt_ns(struct mnt_namespace *ns)
 {
-	LIST_HEAD(umount_list);
-
 	if (!atomic_dec_and_test(&ns->count))
 		return;
-	down_write(&namespace_sem);
+	namespace_lock();
 	br_write_lock(&vfsmount_lock);
-	umount_tree(ns->root, 0, &umount_list);
+	umount_tree(ns->root, 0);
 	br_write_unlock(&vfsmount_lock);
-	up_write(&namespace_sem);
-	release_mounts(&umount_list);
+	namespace_unlock();
 	free_mnt_ns(ns);
 }
 
@@ -2753,6 +2787,51 @@ bool our_mnt(struct vfsmount *mnt)
 	return check_mnt(real_mount(mnt));
 }
 
+bool current_chrooted(void)
+{
+	/* Does the current process have a non-standard root */
+	struct path ns_root;
+	struct path fs_root;
+	bool chrooted;
+
+	/* Find the namespace root */
+	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
+	ns_root.dentry = ns_root.mnt->mnt_root;
+	path_get(&ns_root);
+	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
+		;
+
+	get_fs_root(current->fs, &fs_root);
+
+	chrooted = !path_equal(&fs_root, &ns_root);
+
+	path_put(&fs_root);
+	path_put(&ns_root);
+
+	return chrooted;
+}
+
+void update_mnt_policy(struct user_namespace *userns)
+{
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	struct mount *mnt;
+
+	down_read(&namespace_sem);
+	list_for_each_entry(mnt, &ns->list, mnt_list) {
+		switch (mnt->mnt.mnt_sb->s_magic) {
+		case SYSFS_MAGIC:
+			userns->may_mount_sysfs = true;
+			break;
+		case PROC_SUPER_MAGIC:
+			userns->may_mount_proc = true;
+			break;
+		}
+		if (userns->may_mount_sysfs && userns->may_mount_proc)
+			break;
+	}
+	up_read(&namespace_sem);
+}
+
 static void *mntns_get(struct task_struct *task)
 {
 	struct mnt_namespace *ns = NULL;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 4117e7b377b..3be047474bf 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -23,12 +23,12 @@
 
 #include "ncp_fs.h"
 
-static void ncp_read_volume_list(struct file *, void *, filldir_t,
+static void ncp_read_volume_list(struct file *, struct dir_context *,
 				struct ncp_cache_control *);
-static void ncp_do_readdir(struct file *, void *, filldir_t,
+static void ncp_do_readdir(struct file *, struct dir_context *,
 				struct ncp_cache_control *);
 
-static int ncp_readdir(struct file *, void *, filldir_t);
+static int ncp_readdir(struct file *, struct dir_context *);
 
 static int ncp_create(struct inode *, struct dentry *, umode_t, bool);
 static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int);
@@ -49,7 +49,7 @@ const struct file_operations ncp_dir_operations =
 {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ncp_readdir,
+	.iterate	= ncp_readdir,
 	.unlocked_ioctl	= ncp_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ncp_compat_ioctl,
@@ -73,10 +73,8 @@ const struct inode_operations ncp_dir_inode_operations =
  * Dentry operations routines
  */
 static int ncp_lookup_validate(struct dentry *, unsigned int);
-static int ncp_hash_dentry(const struct dentry *, const struct inode *,
-		struct qstr *);
-static int ncp_compare_dentry(const struct dentry *, const struct inode *,
-		const struct dentry *, const struct inode *,
+static int ncp_hash_dentry(const struct dentry *, struct qstr *);
+static int ncp_compare_dentry(const struct dentry *, const struct dentry *,
 		unsigned int, const char *, const struct qstr *);
 static int ncp_delete_dentry(const struct dentry *);
 
@@ -119,11 +117,19 @@ static inline int ncp_case_sensitive(const struct inode *i)
 /*
  * Note: leave the hash unchanged if the directory
  * is case-sensitive.
+ *
+ * Accessing the parent inode can be racy under RCU pathwalking.
+ * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
+ * the callers will handle races.
  */
 static int 
-ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *this)
+ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
 {
+	struct inode *inode = ACCESS_ONCE(dentry->d_inode);
+
+	if (!inode)
+		return 0;
+
 	if (!ncp_case_sensitive(inode)) {
 		struct super_block *sb = dentry->d_sb;
 		struct nls_table *t;
@@ -140,14 +146,24 @@ ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
 	return 0;
 }
 
+/*
+ * Accessing the parent inode can be racy under RCU pathwalking.
+ * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
+ * the callers will handle races.
+ */
 static int
-ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
+	struct inode *pinode;
+
 	if (len != name->len)
 		return 1;
 
+	pinode = ACCESS_ONCE(parent->d_inode);
+	if (!pinode)
+		return 1;
+
 	if (ncp_case_sensitive(pinode))
 		return strncmp(str, name->name, len);
 
@@ -424,9 +440,9 @@ static time_t ncp_obtain_mtime(struct dentry *dentry)
 	return ncp_date_dos2unix(i.modifyTime, i.modifyDate);
 }
 
-static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ncp_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
+	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	struct page *page = NULL;
 	struct ncp_server *server = NCP_SERVER(inode);
@@ -440,7 +456,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
-		(int) filp->f_pos);
+		(int) ctx->pos);
 
 	result = -EIO;
 	/* Do not generate '.' and '..' when server is dead. */
@@ -448,16 +464,8 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		goto out;
 
 	result = 0;
-	if (filp->f_pos == 0) {
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR))
-			goto out;
-		filp->f_pos = 1;
-	}
-	if (filp->f_pos == 1) {
-		if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR))
-			goto out;
-		filp->f_pos = 2;
-	}
+	if (!dir_emit_dots(file, ctx))
+		goto out;
 
 	page = grab_cache_page(&inode->i_data, 0);
 	if (!page)
@@ -469,7 +477,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (!PageUptodate(page) || !ctl.head.eof)
 		goto init_cache;
 
-	if (filp->f_pos == 2) {
+	if (ctx->pos == 2) {
 		if (jiffies - ctl.head.time >= NCP_MAX_AGE(server))
 			goto init_cache;
 
@@ -479,10 +487,10 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			goto init_cache;
 	}
 
-	if (filp->f_pos > ctl.head.end)
+	if (ctx->pos > ctl.head.end)
 		goto finished;
 
-	ctl.fpos = filp->f_pos + (NCP_DIRCACHE_START - 2);
+	ctl.fpos = ctx->pos + (NCP_DIRCACHE_START - 2);
 	ctl.ofs  = ctl.fpos / NCP_DIRCACHE_SIZE;
 	ctl.idx  = ctl.fpos % NCP_DIRCACHE_SIZE;
 
@@ -497,21 +505,21 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		while (ctl.idx < NCP_DIRCACHE_SIZE) {
 			struct dentry *dent;
-			int res;
+			bool over;
 
 			dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx],
-						dentry, filp->f_pos);
+						dentry, ctx->pos);
 			if (!dent)
 				goto invalid_cache;
-			res = filldir(dirent, dent->d_name.name,
-					dent->d_name.len, filp->f_pos,
+			over = !dir_emit(ctx, dent->d_name.name,
+					dent->d_name.len,
 					dent->d_inode->i_ino, DT_UNKNOWN);
 			dput(dent);
-			if (res)
+			if (over)
 				goto finished;
-			filp->f_pos += 1;
+			ctx->pos += 1;
 			ctl.idx += 1;
-			if (filp->f_pos > ctl.head.end)
+			if (ctx->pos > ctl.head.end)
 				goto finished;
 		}
 		if (ctl.page) {
@@ -548,9 +556,9 @@ init_cache:
 	ctl.valid  = 1;
 read_really:
 	if (ncp_is_server_root(inode)) {
-		ncp_read_volume_list(filp, dirent, filldir, &ctl);
+		ncp_read_volume_list(file, ctx, &ctl);
 	} else {
-		ncp_do_readdir(filp, dirent, filldir, &ctl);
+		ncp_do_readdir(file, ctx, &ctl);
 	}
 	ctl.head.end = ctl.fpos - 1;
 	ctl.head.eof = ctl.valid;
@@ -573,11 +581,11 @@ out:
 }
 
 static int
-ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+ncp_fill_cache(struct file *file, struct dir_context *ctx,
 		struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
 		int inval_childs)
 {
-	struct dentry *newdent, *dentry = filp->f_path.dentry;
+	struct dentry *newdent, *dentry = file->f_path.dentry;
 	struct inode *dir = dentry->d_inode;
 	struct ncp_cache_control ctl = *ctrl;
 	struct qstr qname;
@@ -593,14 +601,10 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 		return 1; /* I'm not sure */
 
 	qname.name = __name;
-	qname.hash = full_name_hash(qname.name, qname.len);
-
-	if (dentry->d_op && dentry->d_op->d_hash)
-		if (dentry->d_op->d_hash(dentry, dentry->d_inode, &qname) != 0)
-			goto end_advance;
-
-	newdent = d_lookup(dentry, &qname);
 
+	newdent = d_hash_and_lookup(dentry, &qname);
+	if (unlikely(IS_ERR(newdent)))
+		goto end_advance;
 	if (!newdent) {
 		newdent = d_alloc(dentry, &qname);
 		if (!newdent)
@@ -670,15 +674,13 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
 end_advance:
 	if (!valid)
 		ctl.valid = 0;
-	if (!ctl.filled && (ctl.fpos == filp->f_pos)) {
-		if (!ino)
-			ino = find_inode_number(dentry, &qname);
+	if (!ctl.filled && (ctl.fpos == ctx->pos)) {
 		if (!ino)
 			ino = iunique(dir->i_sb, 2);
-		ctl.filled = filldir(dirent, qname.name, qname.len,
-				     filp->f_pos, ino, DT_UNKNOWN);
+		ctl.filled = !dir_emit(ctx, qname.name, qname.len,
+				     ino, DT_UNKNOWN);
 		if (!ctl.filled)
-			filp->f_pos += 1;
+			ctx->pos += 1;
 	}
 	ctl.fpos += 1;
 	ctl.idx  += 1;
@@ -687,10 +689,10 @@ end_advance:
 }
 
 static void
-ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
+ncp_read_volume_list(struct file *file, struct dir_context *ctx,
 			struct ncp_cache_control *ctl)
 {
-	struct dentry *dentry = filp->f_path.dentry;
+	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	struct ncp_server *server = NCP_SERVER(inode);
 	struct ncp_volume_info info;
@@ -698,7 +700,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
 	int i;
 
 	DPRINTK("ncp_read_volume_list: pos=%ld\n",
-			(unsigned long) filp->f_pos);
+			(unsigned long) ctx->pos);
 
 	for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
 		int inval_dentry;
@@ -719,16 +721,16 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
 		}
 		inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
 		entry.volume = entry.i.volNumber;
-		if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry))
+		if (!ncp_fill_cache(file, ctx, ctl, &entry, inval_dentry))
 			return;
 	}
 }
 
 static void
-ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
+ncp_do_readdir(struct file *file, struct dir_context *ctx,
 						struct ncp_cache_control *ctl)
 {
-	struct dentry *dentry = filp->f_path.dentry;
+	struct dentry *dentry = file->f_path.dentry;
 	struct inode *dir = dentry->d_inode;
 	struct ncp_server *server = NCP_SERVER(dir);
 	struct nw_search_sequence seq;
@@ -740,7 +742,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
 
 	DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
-		(unsigned long) filp->f_pos);
+		(unsigned long) ctx->pos);
 	PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n",
 		dentry->d_name.name, NCP_FINFO(dir)->volNumber,
 		NCP_FINFO(dir)->dirEntNum);
@@ -782,7 +784,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
 			rpl += onerpl;
 			rpls -= onerpl;
 			entry.volume = entry.i.volNumber;
-			if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0))
+			if (!ncp_fill_cache(file, ctx, ctl, &entry, 0))
 				break;
 		}
 	} while (more);
@@ -1033,15 +1035,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 	DPRINTK("ncp_rmdir: removing %s/%s\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	/*
-	 * fail with EBUSY if there are still references to this
-	 * directory.
-	 */
-	dentry_unhash(dentry);
-	error = -EBUSY;
-	if (!d_unhashed(dentry))
-		goto out;
-
 	len = sizeof(__name);
 	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
 			   dentry->d_name.len, !ncp_preserve_case(dir));
@@ -1144,17 +1137,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 		old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
 		new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
 
-	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) {
-		/*
-		 * fail with EBUSY if there are still references to this
-		 * directory.
-		 */
-		dentry_unhash(new_dentry);
-		error = -EBUSY;
-		if (!d_unhashed(new_dentry))
-			goto out;
-	}
-
 	ncp_age_dentry(server, old_dentry);
 	ncp_age_dentry(server, new_dentry);
 
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 1acdad7fcec..4659da67e7f 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -331,12 +331,15 @@ static int  ncp_show_options(struct seq_file *seq, struct dentry *root)
 	struct ncp_server *server = NCP_SBP(root->d_sb);
 	unsigned int tmp;
 
-	if (server->m.uid != 0)
-		seq_printf(seq, ",uid=%u", server->m.uid);
-	if (server->m.gid != 0)
-		seq_printf(seq, ",gid=%u", server->m.gid);
-	if (server->m.mounted_uid != 0)
-		seq_printf(seq, ",owner=%u", server->m.mounted_uid);
+	if (!uid_eq(server->m.uid, GLOBAL_ROOT_UID))
+		seq_printf(seq, ",uid=%u",
+			   from_kuid_munged(&init_user_ns, server->m.uid));
+	if (!gid_eq(server->m.gid, GLOBAL_ROOT_GID))
+		seq_printf(seq, ",gid=%u",
+			   from_kgid_munged(&init_user_ns, server->m.gid));
+	if (!uid_eq(server->m.mounted_uid, GLOBAL_ROOT_UID))
+		seq_printf(seq, ",owner=%u",
+			   from_kuid_munged(&init_user_ns, server->m.mounted_uid));
 	tmp = server->m.file_mode & S_IALLUGO;
 	if (tmp != NCP_DEFAULT_FILE_MODE)
 		seq_printf(seq, ",mode=0%o", tmp);
@@ -381,13 +384,13 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options)
 
 	data->flags = 0;
 	data->int_flags = 0;
-	data->mounted_uid = 0;
+	data->mounted_uid = GLOBAL_ROOT_UID;
 	data->wdog_pid = NULL;
 	data->ncp_fd = ~0;
 	data->time_out = NCP_DEFAULT_TIME_OUT;
 	data->retry_count = NCP_DEFAULT_RETRY_COUNT;
-	data->uid = 0;
-	data->gid = 0;
+	data->uid = GLOBAL_ROOT_UID;
+	data->gid = GLOBAL_ROOT_GID;
 	data->file_mode = NCP_DEFAULT_FILE_MODE;
 	data->dir_mode = NCP_DEFAULT_DIR_MODE;
 	data->info_fd = -1;
@@ -399,13 +402,25 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options)
 			goto err;
 		switch (optval) {
 			case 'u':
-				data->uid = optint;
+				data->uid = make_kuid(current_user_ns(), optint);
+				if (!uid_valid(data->uid)) {
+					ret = -EINVAL;
+					goto err;
+				}
 				break;
 			case 'g':
-				data->gid = optint;
+				data->gid = make_kgid(current_user_ns(), optint);
+				if (!gid_valid(data->gid)) {
+					ret = -EINVAL;
+					goto err;
+				}
 				break;
 			case 'o':
-				data->mounted_uid = optint;
+				data->mounted_uid = make_kuid(current_user_ns(), optint);
+				if (!uid_valid(data->mounted_uid)) {
+					ret = -EINVAL;
+					goto err;
+				}
 				break;
 			case 'm':
 				data->file_mode = optint;
@@ -480,13 +495,13 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 
 				data.flags = md->flags;
 				data.int_flags = NCP_IMOUNT_LOGGEDIN_POSSIBLE;
-				data.mounted_uid = md->mounted_uid;
+				data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid);
 				data.wdog_pid = find_get_pid(md->wdog_pid);
 				data.ncp_fd = md->ncp_fd;
 				data.time_out = md->time_out;
 				data.retry_count = md->retry_count;
-				data.uid = md->uid;
-				data.gid = md->gid;
+				data.uid = make_kuid(current_user_ns(), md->uid);
+				data.gid = make_kgid(current_user_ns(), md->gid);
 				data.file_mode = md->file_mode;
 				data.dir_mode = md->dir_mode;
 				data.info_fd = -1;
@@ -499,13 +514,13 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 				struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data;
 
 				data.flags = md->flags;
-				data.mounted_uid = md->mounted_uid;
+				data.mounted_uid = make_kuid(current_user_ns(), md->mounted_uid);
 				data.wdog_pid = find_get_pid(md->wdog_pid);
 				data.ncp_fd = md->ncp_fd;
 				data.time_out = md->time_out;
 				data.retry_count = md->retry_count;
-				data.uid = md->uid;
-				data.gid = md->gid;
+				data.uid = make_kuid(current_user_ns(), md->uid);
+				data.gid = make_kgid(current_user_ns(), md->gid);
 				data.file_mode = md->file_mode;
 				data.dir_mode = md->dir_mode;
 				data.info_fd = -1;
@@ -520,12 +535,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 				goto out;
 			break;
 	}
+	error = -EINVAL;
+	if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) ||
+	    !gid_valid(data.gid))
+		goto out;
 	error = -EBADF;
 	ncp_filp = fget(data.ncp_fd);
 	if (!ncp_filp)
 		goto out;
 	error = -ENOTSOCK;
-	sock_inode = ncp_filp->f_path.dentry->d_inode;
+	sock_inode = file_inode(ncp_filp);
 	if (!S_ISSOCK(sock_inode->i_mode))
 		goto out_fput;
 	sock = SOCKET_I(sock_inode);
@@ -564,7 +583,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 		if (!server->info_filp)
 			goto out_bdi;
 		error = -ENOTSOCK;
-		sock_inode = server->info_filp->f_path.dentry->d_inode;
+		sock_inode = file_inode(server->info_filp);
 		if (!S_ISSOCK(sock_inode->i_mode))
 			goto out_fput2;
 		info_sock = SOCKET_I(sock_inode);
@@ -878,6 +897,10 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 	if (!server)	/* How this could happen? */
 		goto out;
 
+	result = -EPERM;
+	if (IS_DEADDIR(dentry->d_inode))
+		goto out;
+
 	/* ageing the dentry to force validation */
 	ncp_age_dentry(server, dentry);
 
@@ -886,12 +909,10 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 		goto out;
 
 	result = -EPERM;
-	if (((attr->ia_valid & ATTR_UID) &&
-	     (attr->ia_uid != server->m.uid)))
+	if ((attr->ia_valid & ATTR_UID) && !uid_eq(attr->ia_uid, server->m.uid))
 		goto out;
 
-	if (((attr->ia_valid & ATTR_GID) &&
-	     (attr->ia_gid != server->m.gid)))
+	if ((attr->ia_valid & ATTR_GID) && !gid_eq(attr->ia_gid, server->m.gid))
 		goto out;
 
 	if (((attr->ia_valid & ATTR_MODE) &&
@@ -1040,6 +1061,7 @@ static struct file_system_type ncp_fs_type = {
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_BINARY_MOUNTDATA,
 };
+MODULE_ALIAS_FS("ncpfs");
 
 static int __init init_ncp_fs(void)
 {
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 6958adfaff0..60426ccb3b6 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -45,7 +45,7 @@ ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
 		return -EINVAL;
 	}
 	/* TODO: info.addr = server->m.serv_addr; */
-	SET_UID(info.mounted_uid, server->m.mounted_uid);
+	SET_UID(info.mounted_uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid));
 	info.connection		= server->connection;
 	info.buffer_size	= server->buffer_size;
 	info.volume_number	= NCP_FINFO(inode)->volNumber;
@@ -69,7 +69,7 @@ ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
 		DPRINTK("info.version invalid: %d\n", info2.version);
 		return -EINVAL;
 	}
-	info2.mounted_uid   = server->m.mounted_uid;
+	info2.mounted_uid   = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
 	info2.connection    = server->connection;
 	info2.buffer_size   = server->buffer_size;
 	info2.volume_number = NCP_FINFO(inode)->volNumber;
@@ -135,7 +135,7 @@ ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
 		DPRINTK("info.version invalid: %d\n", info2.version);
 		return -EINVAL;
 	}
-	info2.mounted_uid   = server->m.mounted_uid;
+	info2.mounted_uid   = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
 	info2.connection    = server->connection;
 	info2.buffer_size   = server->buffer_size;
 	info2.volume_number = NCP_FINFO(inode)->volNumber;
@@ -348,22 +348,25 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
 		{
 			u16 uid;
 
-			SET_UID(uid, server->m.mounted_uid);
+			SET_UID(uid, from_kuid_munged(current_user_ns(), server->m.mounted_uid));
 			if (put_user(uid, (u16 __user *)argp))
 				return -EFAULT;
 			return 0;
 		}
 	case NCP_IOC_GETMOUNTUID32:
-		if (put_user(server->m.mounted_uid,
-			     (u32 __user *)argp))
+	{
+		uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
+		if (put_user(uid, (u32 __user *)argp))
 			return -EFAULT;
 		return 0;
+	}
 	case NCP_IOC_GETMOUNTUID64:
-		if (put_user(server->m.mounted_uid,
-			     (u64 __user *)argp))
+	{
+		uid_t uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
+		if (put_user(uid, (u64 __user *)argp))
 			return -EFAULT;
 		return 0;
-
+	}
 	case NCP_IOC_GETROOT:
 		{
 			struct ncp_setroot_ioctl sr;
@@ -808,9 +811,9 @@ outrel:
 
 long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct ncp_server *server = NCP_SERVER(inode);
-	uid_t uid = current_uid();
+	kuid_t uid = current_uid();
 	int need_drop_write = 0;
 	long ret;
 
@@ -819,12 +822,12 @@ long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	case NCP_IOC_CONN_LOGGED_IN:
 	case NCP_IOC_SETROOT:
 		if (!capable(CAP_SYS_ADMIN)) {
-			ret = -EACCES;
+			ret = -EPERM;
 			goto out;
 		}
 		break;
 	}
-	if (server->m.mounted_uid != uid) {
+	if (!uid_eq(server->m.mounted_uid, uid)) {
 		switch (cmd) {
 		/*
 		 * Only mount owner can issue these ioctls.  Information
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 63d14a99483..3c5dd55d284 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -105,7 +105,7 @@ static const struct vm_operations_struct ncp_file_mmap =
 /* This is used for a general mmap of a ncp file */
 int ncp_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	
 	DPRINTK("ncp_mmap: called\n");
 
@@ -117,7 +117,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 	/* we do not support files bigger than 4GB... We eventually 
 	   supports just 4GB... */
-	if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff 
+	if (vma_pages(vma) + vma->vm_pgoff
 	   > (1U << (32 - PAGE_SHIFT)))
 		return -EFBIG;
 
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index 54cc0cdb3dc..c51b2c54353 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -23,15 +23,15 @@ struct ncp_mount_data_kernel {
 	unsigned long    flags;		/* NCP_MOUNT_* flags */
 	unsigned int	 int_flags;	/* internal flags */
 #define NCP_IMOUNT_LOGGEDIN_POSSIBLE	0x0001
-	uid_t		 mounted_uid;	/* Who may umount() this filesystem? */
+	kuid_t		 mounted_uid;	/* Who may umount() this filesystem? */
 	struct pid      *wdog_pid;	/* Who cares for our watchdog packets? */
 	unsigned int     ncp_fd;	/* The socket to the ncp port */
 	unsigned int     time_out;	/* How long should I wait after
 					   sending a NCP request? */
 	unsigned int     retry_count;	/* And how often should I retry? */
 	unsigned char	 mounted_vol[NCP_VOLNAME_LEN + 1];
-	uid_t		 uid;
-	gid_t		 gid;
+	kuid_t		 uid;
+	kgid_t		 gid;
 	umode_t		 file_mode;
 	umode_t		 dir_mode;
 	int		 info_fd;
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 13ca196385f..b5e80b0af31 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -104,6 +104,15 @@ config NFS_V4_1
 
 	  If unsure, say N.
 
+config NFS_V4_2
+	bool "NFS client support for NFSv4.2"
+	depends on NFS_V4_1
+	help
+	  This option enables support for minor version 2 of the NFSv4 protocol
+	  in the kernel's NFS client.
+
+	  If unsure, say N.
+
 config PNFS_FILE_LAYOUT
 	tristate
 	depends on NFS_V4_1
@@ -131,6 +140,11 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
 	  If the NFS client is unchanged from the upstream kernel, this
 	  option should be set to the default "kernel.org".
 
+config NFS_V4_SECURITY_LABEL
+	bool
+	depends on NFS_V4_2 && SECURITY
+	default y
+
 config ROOT_NFS
 	bool "Root file system on NFS"
 	depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index cce2c057bd2..e0bb048e957 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,8 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
 
 nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o \
 			   direct.o pagelist.o read.o symlink.o unlink.o \
-			   write.o namespace.o mount_clnt.o \
-			   dns_resolve.o cache_lib.o
+			   write.o namespace.o mount_clnt.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o
 nfs-$(CONFIG_SYSCTL)	+= sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
@@ -22,7 +21,8 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
 obj-$(CONFIG_NFS_V4) += nfsv4.o
 nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
 	  delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
-	  nfs4namespace.o nfs4getroot.o nfs4client.o
+	  nfs4namespace.o nfs4getroot.o nfs4client.o dns_resolve.o
+nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
 nfsv4-$(CONFIG_SYSCTL)	+= nfs4sysctl.o
 nfsv4-$(CONFIG_NFS_V4_1)	+= nfs4session.o pnfs.o pnfs_dev.o
 
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 4fa788c93f4..e242bbf7297 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1089,9 +1089,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
 	dev->pgbase = 0;
 	dev->pglen = PAGE_SIZE * max_pages;
 	dev->mincount = 0;
+	dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
 
 	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
-	rc = nfs4_proc_getdeviceinfo(server, dev);
+	rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
 	dprintk("%s getdevice info returns %d\n", __func__, rc);
 	if (rc) {
 		rv = ERR_PTR(rc);
@@ -1273,6 +1274,7 @@ static const struct nfs_pageio_ops bl_pg_write_ops = {
 static struct pnfs_layoutdriver_type blocklayout_type = {
 	.id				= LAYOUT_BLOCK_VOLUME,
 	.name				= "LAYOUT_BLOCK_VOLUME",
+	.owner				= THIS_MODULE,
 	.read_pagelist			= bl_read_pagelist,
 	.write_pagelist			= bl_write_pagelist,
 	.alloc_layout_hdr		= bl_alloc_layout_hdr,
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index f4891bde885..8485978993e 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -173,7 +173,7 @@ struct bl_msg_hdr {
 /* blocklayoutdev.c */
 ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t);
 void bl_pipe_destroy_msg(struct rpc_pipe_msg *);
-int nfs4_blkdev_put(struct block_device *bdev);
+void nfs4_blkdev_put(struct block_device *bdev);
 struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
 						struct pnfs_device *dev);
 int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
index a86c5bdad9e..04303b5c936 100644
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ b/fs/nfs/blocklayout/blocklayoutdev.c
@@ -56,11 +56,11 @@ static int decode_sector_number(__be32 **rp, sector_t *sp)
 /*
  * Release the block device
  */
-int nfs4_blkdev_put(struct block_device *bdev)
+void nfs4_blkdev_put(struct block_device *bdev)
 {
 	dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
 			MINOR(bdev->bd_dev));
-	return blkdev_put(bdev, FMODE_READ);
+	blkdev_put(bdev, FMODE_READ);
 }
 
 ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
index 737d839bc17..8999cfddd86 100644
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ b/fs/nfs/blocklayout/blocklayoutdm.c
@@ -55,7 +55,8 @@ static void dev_remove(struct net *net, dev_t dev)
 
 	bl_pipe_msg.bl_wq = &nn->bl_wq;
 	memset(msg, 0, sizeof(*msg));
-	msg->data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS);
+	msg->len = sizeof(bl_msg) + bl_msg.totallen;
+	msg->data = kzalloc(msg->len, GFP_NOFS);
 	if (!msg->data)
 		goto out;
 
@@ -66,7 +67,6 @@ static void dev_remove(struct net *net, dev_t dev)
 	memcpy(msg->data, &bl_msg, sizeof(bl_msg));
 	dataptr = (uint8_t *) msg->data;
 	memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
-	msg->len = sizeof(bl_msg) + bl_msg.totallen;
 
 	add_wait_queue(&nn->bl_wq, &wq);
 	if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
@@ -88,14 +88,8 @@ out:
  */
 static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
 {
-	int rv;
-
 	dprintk("%s Releasing\n", __func__);
-	rv = nfs4_blkdev_put(bdev->bm_mdev);
-	if (rv)
-		printk(KERN_ERR "NFS: %s nfs4_blkdev_put returns %d\n",
-				__func__, rv);
-
+	nfs4_blkdev_put(bdev->bm_mdev);
 	dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
 }
 
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index 862a2f16db6..5f7b053720e 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -128,10 +128,13 @@ int nfs_cache_register_net(struct net *net, struct cache_detail *cd)
 	struct super_block *pipefs_sb;
 	int ret = 0;
 
+	sunrpc_init_cache_detail(cd);
 	pipefs_sb = rpc_get_sb_net(net);
 	if (pipefs_sb) {
 		ret = nfs_cache_register_sb(pipefs_sb, cd);
 		rpc_put_sb_net(net);
+		if (ret)
+			sunrpc_destroy_cache_detail(cd);
 	}
 	return ret;
 }
@@ -151,14 +154,5 @@ void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd)
 		nfs_cache_unregister_sb(pipefs_sb, cd);
 		rpc_put_sb_net(net);
 	}
-}
-
-void nfs_cache_init(struct cache_detail *cd)
-{
-	sunrpc_init_cache_detail(cd);
-}
-
-void nfs_cache_destroy(struct cache_detail *cd)
-{
 	sunrpc_destroy_cache_detail(cd);
 }
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
index 317db95e37f..4116d2c3f52 100644
--- a/fs/nfs/cache_lib.h
+++ b/fs/nfs/cache_lib.h
@@ -23,8 +23,6 @@ extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
 extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
 extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
 
-extern void nfs_cache_init(struct cache_detail *cd);
-extern void nfs_cache_destroy(struct cache_detail *cd);
 extern int nfs_cache_register_net(struct net *net, struct cache_detail *cd);
 extern void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd);
 extern int nfs_cache_register_sb(struct super_block *sb,
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 5088b57b078..67cd7321316 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -125,6 +125,9 @@ nfs41_callback_svc(void *vrqstp)
 	set_freezable();
 
 	while (!kthread_should_stop()) {
+		if (try_to_freeze())
+			continue;
+
 		prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
 		spin_lock_bh(&serv->sv_cb_lock);
 		if (!list_empty(&serv->sv_cb_list)) {
@@ -208,7 +211,6 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
 	struct svc_rqst *rqstp;
 	int (*callback_svc)(void *vrqstp);
 	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
-	char svc_name[12];
 	int ret;
 
 	nfs_callback_bc_serv(minorversion, xprt, serv);
@@ -232,10 +234,10 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
 
 	svc_sock_update_bufs(serv);
 
-	sprintf(svc_name, "nfsv4.%u-svc", minorversion);
 	cb_info->serv = serv;
 	cb_info->rqst = rqstp;
-	cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name);
+	cb_info->task = kthread_run(callback_svc, cb_info->rqst,
+				    "nfsv4.%u-svc", minorversion);
 	if (IS_ERR(cb_info->task)) {
 		ret = PTR_ERR(cb_info->task);
 		svc_exit_thread(cb_info->rqst);
@@ -279,6 +281,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n
 			ret = nfs4_callback_up_net(serv, net);
 			break;
 		case 1:
+		case 2:
 			ret = nfs41_callback_up_net(serv, net);
 			break;
 		default:
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index efd54f0a4c4..84326e9fb47 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -32,6 +32,8 @@ enum nfs4_callback_opnum {
 	OP_CB_WANTS_CANCELLED = 12,
 	OP_CB_NOTIFY_LOCK   = 13,
 	OP_CB_NOTIFY_DEVICEID = 14,
+/* Callback operations new to NFSv4.2 */
+	OP_CB_OFFLOAD = 15,
 	OP_CB_ILLEGAL = 10044,
 };
 
@@ -39,6 +41,7 @@ struct cb_process_state {
 	__be32			drc_status;
 	struct nfs_client	*clp;
 	u32			slotid;
+	u32			minorversion;
 	struct net		*net;
 };
 
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 264d1aa935f..e6ebc4c38c8 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -183,60 +183,15 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 static u32 initiate_bulk_draining(struct nfs_client *clp,
 				  struct cb_layoutrecallargs *args)
 {
-	struct nfs_server *server;
-	struct pnfs_layout_hdr *lo;
-	struct inode *ino;
-	u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
-	struct pnfs_layout_hdr *tmp;
-	LIST_HEAD(recall_list);
-	LIST_HEAD(free_me_list);
-	struct pnfs_layout_range range = {
-		.iomode = IOMODE_ANY,
-		.offset = 0,
-		.length = NFS4_MAX_UINT64,
-	};
-
-	spin_lock(&clp->cl_lock);
-	rcu_read_lock();
-	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-		if ((args->cbl_recall_type == RETURN_FSID) &&
-		    memcmp(&server->fsid, &args->cbl_fsid,
-			   sizeof(struct nfs_fsid)))
-			continue;
+	int stat;
 
-		list_for_each_entry(lo, &server->layouts, plh_layouts) {
-			ino = igrab(lo->plh_inode);
-			if (!ino)
-				continue;
-			spin_lock(&ino->i_lock);
-			/* Is this layout in the process of being freed? */
-			if (NFS_I(ino)->layout != lo) {
-				spin_unlock(&ino->i_lock);
-				iput(ino);
-				continue;
-			}
-			pnfs_get_layout_hdr(lo);
-			spin_unlock(&ino->i_lock);
-			list_add(&lo->plh_bulk_recall, &recall_list);
-		}
-	}
-	rcu_read_unlock();
-	spin_unlock(&clp->cl_lock);
-
-	list_for_each_entry_safe(lo, tmp,
-				 &recall_list, plh_bulk_recall) {
-		ino = lo->plh_inode;
-		spin_lock(&ino->i_lock);
-		set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
-		if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &range))
-			rv = NFS4ERR_DELAY;
-		list_del_init(&lo->plh_bulk_recall);
-		spin_unlock(&ino->i_lock);
-		pnfs_free_lseg_list(&free_me_list);
-		pnfs_put_layout_hdr(lo);
-		iput(ino);
-	}
-	return rv;
+	if (args->cbl_recall_type == RETURN_FSID)
+		stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true);
+	else
+		stat = pnfs_destroy_layouts_byclid(clp, true);
+	if (stat != 0)
+		return NFS4ERR_DELAY;
+	return NFS4ERR_NOMATCHING_LAYOUT;
 }
 
 static u32 do_callback_layoutrecall(struct nfs_client *clp,
@@ -451,7 +406,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 	int i;
 	__be32 status = htonl(NFS4ERR_BADSESSION);
 
-	clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid);
+	clp = nfs4_find_client_sessionid(cps->net, args->csa_addr,
+					 &args->csa_sessionid, cps->minorversion);
 	if (clp == NULL)
 		goto out;
 
@@ -459,7 +415,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 
 	spin_lock(&tbl->slot_tbl_lock);
 	/* state manager is resetting the session */
-	if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
+	if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
 		spin_unlock(&tbl->slot_tbl_lock);
 		status = htonl(NFS4ERR_DELAY);
 		/* Return NFS4ERR_BADSESSION if we're draining the session
@@ -545,7 +501,7 @@ __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy,
 		     &args->craa_type_mask))
 		pnfs_recall_all_layouts(cps->clp);
 	if (flags)
-		nfs_expire_all_delegation_types(cps->clp, flags);
+		nfs_expire_unused_delegation_types(cps->clp, flags);
 out:
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
 	return status;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 59461c957d9..f4ccfe6521e 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -166,9 +166,9 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
 	hdr->minorversion = ntohl(*p++);
-	/* Check minor version is zero or one. */
-	if (hdr->minorversion <= 1) {
-		hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */
+	/* Check for minor version support */
+	if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) {
+		hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */
 	} else {
 		pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
 			"illegal minor version %u!\n",
@@ -763,7 +763,7 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
 	 * A single slot, so highest used slotid is either 0 or -1
 	 */
 	tbl->highest_used_slotid = NFS4_NO_SLOT;
-	nfs4_session_drain_complete(session, tbl);
+	nfs4_slot_tbl_drain_complete(tbl);
 	spin_unlock(&tbl->slot_tbl_lock);
 }
 
@@ -786,6 +786,26 @@ static void nfs4_cb_free_slot(struct cb_process_state *cps)
 }
 #endif /* CONFIG_NFS_V4_1 */
 
+#ifdef CONFIG_NFS_V4_2
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+	__be32 status = preprocess_nfs41_op(nop, op_nr, op);
+	if (status != htonl(NFS4ERR_OP_ILLEGAL))
+		return status;
+
+	if (op_nr == OP_CB_OFFLOAD)
+		return htonl(NFS4ERR_NOTSUPP);
+	return htonl(NFS4ERR_OP_ILLEGAL);
+}
+#else /* CONFIG_NFS_V4_2 */
+static __be32
+preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
+{
+	return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+}
+#endif /* CONFIG_NFS_V4_2 */
+
 static __be32
 preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
 {
@@ -801,8 +821,7 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
 	return htonl(NFS_OK);
 }
 
-static __be32 process_op(uint32_t minorversion, int nop,
-		struct svc_rqst *rqstp,
+static __be32 process_op(int nop, struct svc_rqst *rqstp,
 		struct xdr_stream *xdr_in, void *argp,
 		struct xdr_stream *xdr_out, void *resp,
 		struct cb_process_state *cps)
@@ -819,10 +838,22 @@ static __be32 process_op(uint32_t minorversion, int nop,
 		return status;
 
 	dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
-		__func__, minorversion, nop, op_nr);
+		__func__, cps->minorversion, nop, op_nr);
+
+	switch (cps->minorversion) {
+	case 0:
+		status = preprocess_nfs4_op(op_nr, &op);
+		break;
+	case 1:
+		status = preprocess_nfs41_op(nop, op_nr, &op);
+		break;
+	case 2:
+		status = preprocess_nfs42_op(nop, op_nr, &op);
+		break;
+	default:
+		status = htonl(NFS4ERR_MINOR_VERS_MISMATCH);
+	}
 
-	status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) :
-				preprocess_nfs4_op(op_nr, &op);
 	if (status == htonl(NFS4ERR_OP_ILLEGAL))
 		op_nr = OP_CB_ILLEGAL;
 	if (status)
@@ -885,14 +916,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 			return rpc_drop_reply;
 	}
 
+	cps.minorversion = hdr_arg.minorversion;
 	hdr_res.taglen = hdr_arg.taglen;
 	hdr_res.tag = hdr_arg.tag;
 	if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
 		return rpc_system_err;
 
 	while (status == 0 && nops != hdr_arg.nops) {
-		status = process_op(hdr_arg.minorversion, nops, rqstp,
-				    &xdr_in, argp, &xdr_out, resp, &cps);
+		status = process_op(nops, rqstp, &xdr_in,
+				    argp, &xdr_out, resp, &cps);
 		nops++;
 	}
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 9f3c66438d0..340b1eff026 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -197,7 +197,6 @@ error_0:
 EXPORT_SYMBOL_GPL(nfs_alloc_client);
 
 #if IS_ENABLED(CONFIG_NFS_V4)
-/* idr_remove_all is not needed as all id's are removed by nfs_put_client */
 void nfs_cleanup_cb_ident_idr(struct net *net)
 {
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
@@ -594,6 +593,8 @@ int nfs_create_rpc_client(struct nfs_client *clp,
 		args.flags |= RPC_CLNT_CREATE_DISCRTRY;
 	if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags))
 		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+	if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags))
+		args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS;
 
 	if (!IS_ERR(clp->cl_rpcclient))
 		return 0;
@@ -752,8 +753,6 @@ static int nfs_init_server(struct nfs_server *server,
 			data->timeo, data->retrans);
 	if (data->flags & NFS_MOUNT_NORESVPORT)
 		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
-	if (server->options & NFS_OPTION_MIGRATION)
-		set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
 
 	/* Allocate or find a client reference we can use */
 	clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
@@ -1075,7 +1074,7 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info,
 	}
 
 	if (!(fattr->valid & NFS_ATTR_FATTR)) {
-		error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr);
+		error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL);
 		if (error < 0) {
 			dprintk("nfs_create_server: getattr error = %d\n", -error);
 			goto error;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 81c5eec3cf3..7ec4814e298 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -55,7 +55,8 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags)
 	flags &= FMODE_READ|FMODE_WRITE;
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
-	if (delegation != NULL && (delegation->type & flags) == flags) {
+	if (delegation != NULL && (delegation->type & flags) == flags &&
+	    !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
 		nfs_mark_delegation_referenced(delegation);
 		ret = 1;
 	}
@@ -63,7 +64,7 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags)
 	return ret;
 }
 
-static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state)
+static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
 {
 	struct inode *inode = state->inode;
 	struct file_lock *fl;
@@ -72,20 +73,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
 	if (inode->i_flock == NULL)
 		goto out;
 
-	/* Protect inode->i_flock using the file locks lock */
-	lock_flocks();
+	/* Protect inode->i_flock using the i_lock */
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
 		if (nfs_file_open_context(fl->fl_file) != ctx)
 			continue;
-		unlock_flocks();
-		status = nfs4_lock_delegation_recall(state, fl);
+		spin_unlock(&inode->i_lock);
+		status = nfs4_lock_delegation_recall(fl, state, stateid);
 		if (status < 0)
 			goto out;
-		lock_flocks();
+		spin_lock(&inode->i_lock);
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 out:
 	return status;
 }
@@ -94,7 +95,9 @@ static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *s
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_open_context *ctx;
+	struct nfs4_state_owner *sp;
 	struct nfs4_state *state;
+	unsigned int seq;
 	int err;
 
 again:
@@ -109,9 +112,16 @@ again:
 			continue;
 		get_nfs_open_context(ctx);
 		spin_unlock(&inode->i_lock);
+		sp = state->owner;
+		/* Block nfs4_proc_unlck */
+		mutex_lock(&sp->so_delegreturn_mutex);
+		seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
 		err = nfs4_open_delegation_recall(ctx, state, stateid);
-		if (err >= 0)
-			err = nfs_delegation_claim_locks(ctx, state);
+		if (!err)
+			err = nfs_delegation_claim_locks(ctx, state, stateid);
+		if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+			err = -EAGAIN;
+		mutex_unlock(&sp->so_delegreturn_mutex);
 		put_nfs_open_context(ctx);
 		if (err != 0)
 			return err;
@@ -182,39 +192,91 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
 }
 
 static struct nfs_delegation *
+nfs_start_delegation_return_locked(struct nfs_inode *nfsi)
+{
+	struct nfs_delegation *ret = NULL;
+	struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
+
+	if (delegation == NULL)
+		goto out;
+	spin_lock(&delegation->lock);
+	if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+		ret = delegation;
+	spin_unlock(&delegation->lock);
+out:
+	return ret;
+}
+
+static struct nfs_delegation *
+nfs_start_delegation_return(struct nfs_inode *nfsi)
+{
+	struct nfs_delegation *delegation;
+
+	rcu_read_lock();
+	delegation = nfs_start_delegation_return_locked(nfsi);
+	rcu_read_unlock();
+	return delegation;
+}
+
+static void
+nfs_abort_delegation_return(struct nfs_delegation *delegation,
+		struct nfs_client *clp)
+{
+
+	spin_lock(&delegation->lock);
+	clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
+	set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+	spin_unlock(&delegation->lock);
+	set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
+}
+
+static struct nfs_delegation *
 nfs_detach_delegation_locked(struct nfs_inode *nfsi,
-			     struct nfs_server *server)
+		struct nfs_delegation *delegation,
+		struct nfs_client *clp)
 {
-	struct nfs_delegation *delegation =
+	struct nfs_delegation *deleg_cur =
 		rcu_dereference_protected(nfsi->delegation,
-				lockdep_is_held(&server->nfs_client->cl_lock));
+				lockdep_is_held(&clp->cl_lock));
 
-	if (delegation == NULL)
-		goto nomatch;
+	if (deleg_cur == NULL || delegation != deleg_cur)
+		return NULL;
 
 	spin_lock(&delegation->lock);
+	set_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
 	list_del_rcu(&delegation->super_list);
 	delegation->inode = NULL;
 	nfsi->delegation_state = 0;
 	rcu_assign_pointer(nfsi->delegation, NULL);
 	spin_unlock(&delegation->lock);
 	return delegation;
-nomatch:
-	return NULL;
 }
 
 static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi,
-						    struct nfs_server *server)
+		struct nfs_delegation *delegation,
+		struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
-	struct nfs_delegation *delegation;
 
 	spin_lock(&clp->cl_lock);
-	delegation = nfs_detach_delegation_locked(nfsi, server);
+	delegation = nfs_detach_delegation_locked(nfsi, delegation, clp);
 	spin_unlock(&clp->cl_lock);
 	return delegation;
 }
 
+static struct nfs_delegation *
+nfs_inode_detach_delegation(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_delegation *delegation;
+
+	delegation = nfs_start_delegation_return(nfsi);
+	if (delegation == NULL)
+		return NULL;
+	return nfs_detach_delegation(nfsi, delegation, server);
+}
+
 /**
  * nfs_inode_set_delegation - set up a delegation on an inode
  * @inode: inode to which delegation applies
@@ -268,7 +330,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 			delegation = NULL;
 			goto out;
 		}
-		freeme = nfs_detach_delegation_locked(nfsi, server);
+		freeme = nfs_detach_delegation_locked(nfsi, 
+				old_delegation, clp);
+		if (freeme == NULL)
+			goto out;
 	}
 	list_add_rcu(&delegation->super_list, &server->delegations);
 	nfsi->delegation_state = delegation->type;
@@ -292,19 +357,29 @@ out:
 /*
  * Basic procedure for returning a delegation to the server
  */
-static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
+static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync)
 {
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int err;
 
-	/*
-	 * Guard against new delegated open/lock/unlock calls and against
-	 * state recovery
-	 */
-	down_write(&nfsi->rwsem);
-	err = nfs_delegation_claim_opens(inode, &delegation->stateid);
-	up_write(&nfsi->rwsem);
-	if (err)
+	if (delegation == NULL)
+		return 0;
+	do {
+		err = nfs_delegation_claim_opens(inode, &delegation->stateid);
+		if (!issync || err != -EAGAIN)
+			break;
+		/*
+		 * Guard against state recovery
+		 */
+		err = nfs4_wait_clnt_recover(clp);
+	} while (err == 0);
+
+	if (err) {
+		nfs_abort_delegation_return(delegation, clp);
+		goto out;
+	}
+	if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode)))
 		goto out;
 
 	err = nfs_do_return_delegation(inode, delegation, issync);
@@ -312,6 +387,24 @@ out:
 	return err;
 }
 
+static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
+{
+	bool ret = false;
+
+	if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
+		ret = true;
+	if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) {
+		struct inode *inode;
+
+		spin_lock(&delegation->lock);
+		inode = delegation->inode;
+		if (inode && list_empty(&NFS_I(inode)->open_files))
+			ret = true;
+		spin_unlock(&delegation->lock);
+	}
+	return ret;
+}
+
 /**
  * nfs_client_return_marked_delegations - return previously marked delegations
  * @clp: nfs_client to process
@@ -334,19 +427,15 @@ restart:
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
 		list_for_each_entry_rcu(delegation, &server->delegations,
 								super_list) {
-			if (!test_and_clear_bit(NFS_DELEGATION_RETURN,
-							&delegation->flags))
+			if (!nfs_delegation_need_return(delegation))
 				continue;
 			inode = nfs_delegation_grab_inode(delegation);
 			if (inode == NULL)
 				continue;
-			delegation = nfs_detach_delegation(NFS_I(inode),
-								server);
+			delegation = nfs_start_delegation_return_locked(NFS_I(inode));
 			rcu_read_unlock();
 
-			if (delegation != NULL)
-				err = __nfs_inode_return_delegation(inode,
-								delegation, 0);
+			err = nfs_end_delegation_return(inode, delegation, 0);
 			iput(inode);
 			if (!err)
 				goto restart;
@@ -367,15 +456,11 @@ restart:
  */
 void nfs_inode_return_delegation_noreclaim(struct inode *inode)
 {
-	struct nfs_server *server = NFS_SERVER(inode);
-	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_delegation *delegation;
 
-	if (rcu_access_pointer(nfsi->delegation) != NULL) {
-		delegation = nfs_detach_delegation(nfsi, server);
-		if (delegation != NULL)
-			nfs_do_return_delegation(inode, delegation, 0);
-	}
+	delegation = nfs_inode_detach_delegation(inode);
+	if (delegation != NULL)
+		nfs_do_return_delegation(inode, delegation, 0);
 }
 
 /**
@@ -390,21 +475,24 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode)
  */
 int nfs4_inode_return_delegation(struct inode *inode)
 {
-	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_delegation *delegation;
 	int err = 0;
 
 	nfs_wb_all(inode);
-	if (rcu_access_pointer(nfsi->delegation) != NULL) {
-		delegation = nfs_detach_delegation(nfsi, server);
-		if (delegation != NULL) {
-			err = __nfs_inode_return_delegation(inode, delegation, 1);
-		}
-	}
+	delegation = nfs_start_delegation_return(nfsi);
+	if (delegation != NULL)
+		err = nfs_end_delegation_return(inode, delegation, 1);
 	return err;
 }
 
+static void nfs_mark_return_if_closed_delegation(struct nfs_server *server,
+		struct nfs_delegation *delegation)
+{
+	set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
+	set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+}
+
 static void nfs_mark_return_delegation(struct nfs_server *server,
 		struct nfs_delegation *delegation)
 {
@@ -412,6 +500,45 @@ static void nfs_mark_return_delegation(struct nfs_server *server,
 	set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
 }
 
+static bool nfs_server_mark_return_all_delegations(struct nfs_server *server)
+{
+	struct nfs_delegation *delegation;
+	bool ret = false;
+
+	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+		nfs_mark_return_delegation(server, delegation);
+		ret = true;
+	}
+	return ret;
+}
+
+static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
+{
+	struct nfs_server *server;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		nfs_server_mark_return_all_delegations(server);
+	rcu_read_unlock();
+}
+
+static void nfs_delegation_run_state_manager(struct nfs_client *clp)
+{
+	if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
+		nfs4_schedule_state_manager(clp);
+}
+
+/**
+ * nfs_expire_all_delegations
+ * @clp: client to process
+ *
+ */
+void nfs_expire_all_delegations(struct nfs_client *clp)
+{
+	nfs_client_mark_return_all_delegations(clp);
+	nfs_delegation_run_state_manager(clp);
+}
+
 /**
  * nfs_super_return_all_delegations - return delegations for one superblock
  * @sb: sb to process
@@ -420,24 +547,22 @@ static void nfs_mark_return_delegation(struct nfs_server *server,
 void nfs_server_return_all_delegations(struct nfs_server *server)
 {
 	struct nfs_client *clp = server->nfs_client;
-	struct nfs_delegation *delegation;
+	bool need_wait;
 
 	if (clp == NULL)
 		return;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
-		spin_lock(&delegation->lock);
-		set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
-		spin_unlock(&delegation->lock);
-	}
+	need_wait = nfs_server_mark_return_all_delegations(server);
 	rcu_read_unlock();
 
-	if (nfs_client_return_marked_delegations(clp) != 0)
+	if (need_wait) {
 		nfs4_schedule_state_manager(clp);
+		nfs4_wait_clnt_recover(clp);
+	}
 }
 
-static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
+static void nfs_mark_return_unused_delegation_types(struct nfs_server *server,
 						 fmode_t flags)
 {
 	struct nfs_delegation *delegation;
@@ -446,32 +571,26 @@ static void nfs_mark_return_all_delegation_types(struct nfs_server *server,
 		if ((delegation->type == (FMODE_READ|FMODE_WRITE)) && !(flags & FMODE_WRITE))
 			continue;
 		if (delegation->type & flags)
-			nfs_mark_return_delegation(server, delegation);
+			nfs_mark_return_if_closed_delegation(server, delegation);
 	}
 }
 
-static void nfs_client_mark_return_all_delegation_types(struct nfs_client *clp,
+static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *clp,
 							fmode_t flags)
 {
 	struct nfs_server *server;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
-		nfs_mark_return_all_delegation_types(server, flags);
+		nfs_mark_return_unused_delegation_types(server, flags);
 	rcu_read_unlock();
 }
 
-static void nfs_delegation_run_state_manager(struct nfs_client *clp)
-{
-	if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
-		nfs4_schedule_state_manager(clp);
-}
-
 void nfs_remove_bad_delegation(struct inode *inode)
 {
 	struct nfs_delegation *delegation;
 
-	delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode));
+	delegation = nfs_inode_detach_delegation(inode);
 	if (delegation) {
 		nfs_inode_find_state_and_recover(inode, &delegation->stateid);
 		nfs_free_delegation(delegation);
@@ -480,27 +599,17 @@ void nfs_remove_bad_delegation(struct inode *inode)
 EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
 
 /**
- * nfs_expire_all_delegation_types
+ * nfs_expire_unused_delegation_types
  * @clp: client to process
  * @flags: delegation types to expire
  *
  */
-void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags)
+void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags)
 {
-	nfs_client_mark_return_all_delegation_types(clp, flags);
+	nfs_client_mark_return_unused_delegation_types(clp, flags);
 	nfs_delegation_run_state_manager(clp);
 }
 
-/**
- * nfs_expire_all_delegations
- * @clp: client to process
- *
- */
-void nfs_expire_all_delegations(struct nfs_client *clp)
-{
-	nfs_expire_all_delegation_types(clp, FMODE_READ|FMODE_WRITE);
-}
-
 static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
 {
 	struct nfs_delegation *delegation;
@@ -508,7 +617,7 @@ static void nfs_mark_return_unreferenced_delegations(struct nfs_server *server)
 	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
 		if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
 			continue;
-		nfs_mark_return_delegation(server, delegation);
+		nfs_mark_return_if_closed_delegation(server, delegation);
 	}
 }
 
@@ -649,7 +758,7 @@ restart:
 			if (inode == NULL)
 				continue;
 			delegation = nfs_detach_delegation(NFS_I(inode),
-								server);
+					delegation, server);
 			rcu_read_unlock();
 
 			if (delegation != NULL)
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index bbc6a4dba0d..9a79c7a99d6 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -28,7 +28,9 @@ struct nfs_delegation {
 enum {
 	NFS_DELEGATION_NEED_RECLAIM = 0,
 	NFS_DELEGATION_RETURN,
+	NFS_DELEGATION_RETURN_IF_CLOSED,
 	NFS_DELEGATION_REFERENCED,
+	NFS_DELEGATION_RETURNING,
 };
 
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
@@ -40,7 +42,7 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode);
 struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
 void nfs_server_return_all_delegations(struct nfs_server *);
 void nfs_expire_all_delegations(struct nfs_client *clp);
-void nfs_expire_all_delegation_types(struct nfs_client *clp, fmode_t flags);
+void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags);
 void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
 int nfs_client_return_marked_delegations(struct nfs_client *clp);
 int nfs_delegations_present(struct nfs_client *clp);
@@ -52,7 +54,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
 /* NFSv4 delegation-related procedures */
 int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
 int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
-int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
+int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
 bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);
 
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 1b2d7eb9379..0fac2cb1ea1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,6 +33,7 @@
 #include <linux/pagevec.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/swap.h>
 #include <linux/sched.h>
 #include <linux/kmemleak.h>
 #include <linux/xattr.h>
@@ -46,7 +47,7 @@
 
 static int nfs_opendir(struct inode *, struct file *);
 static int nfs_closedir(struct inode *, struct file *);
-static int nfs_readdir(struct file *, void *, filldir_t);
+static int nfs_readdir(struct file *, struct dir_context *);
 static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
 static void nfs_readdir_clear_array(struct page*);
@@ -54,7 +55,7 @@ static void nfs_readdir_clear_array(struct page*);
 const struct file_operations nfs_dir_operations = {
 	.llseek		= nfs_llseek_dir,
 	.read		= generic_read_dir,
-	.readdir	= nfs_readdir,
+	.iterate	= nfs_readdir,
 	.open		= nfs_opendir,
 	.release	= nfs_closedir,
 	.fsync		= nfs_fsync_dir,
@@ -147,6 +148,7 @@ typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
 typedef struct {
 	struct file	*file;
 	struct page	*page;
+	struct dir_context *ctx;
 	unsigned long	page_index;
 	u64		*dir_cookie;
 	u64		last_cookie;
@@ -252,7 +254,7 @@ out:
 static
 int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
 {
-	loff_t diff = desc->file->f_pos - desc->current_index;
+	loff_t diff = desc->ctx->pos - desc->current_index;
 	unsigned int index;
 
 	if (diff < 0)
@@ -281,7 +283,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 
 	for (i = 0; i < array->size; i++) {
 		if (array->array[i].cookie == *desc->dir_cookie) {
-			struct nfs_inode *nfsi = NFS_I(desc->file->f_path.dentry->d_inode);
+			struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
 			struct nfs_open_dir_context *ctx = desc->file->private_data;
 
 			new_pos = desc->current_index + i;
@@ -289,7 +291,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 			    || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
 				ctx->duped = 0;
 				ctx->attr_gencount = nfsi->attr_gencount;
-			} else if (new_pos < desc->file->f_pos) {
+			} else if (new_pos < desc->ctx->pos) {
 				if (ctx->duped > 0
 				    && ctx->dup_cookie == *desc->dir_cookie) {
 					if (printk_ratelimit()) {
@@ -307,7 +309,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 				ctx->dup_cookie = *desc->dir_cookie;
 				ctx->duped = -1;
 			}
-			desc->file->f_pos = new_pos;
+			desc->ctx->pos = new_pos;
 			desc->cache_entry_index = i;
 			return 0;
 		}
@@ -405,13 +407,13 @@ different:
 }
 
 static
-bool nfs_use_readdirplus(struct inode *dir, struct file *filp)
+bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
 {
 	if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
 		return false;
 	if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
 		return true;
-	if (filp->f_pos == 0)
+	if (ctx->pos == 0)
 		return true;
 	return false;
 }
@@ -435,6 +437,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 	struct dentry *alias;
 	struct inode *dir = parent->d_inode;
 	struct inode *inode;
+	int status;
 
 	if (filename.name[0] == '.') {
 		if (filename.len == 1)
@@ -447,7 +450,9 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 	dentry = d_lookup(parent, &filename);
 	if (dentry != NULL) {
 		if (nfs_same_file(dentry, entry)) {
-			nfs_refresh_inode(dentry->d_inode, entry->fattr);
+			status = nfs_refresh_inode(dentry->d_inode, entry->fattr);
+			if (!status)
+				nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label);
 			goto out;
 		} else {
 			if (d_invalidate(dentry) != 0)
@@ -460,7 +465,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 	if (dentry == NULL)
 		return;
 
-	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr);
+	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
 	if (IS_ERR(inode))
 		goto out;
 
@@ -585,10 +590,16 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 	if (entry.fh == NULL || entry.fattr == NULL)
 		goto out;
 
+	entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
+	if (IS_ERR(entry.label)) {
+		status = PTR_ERR(entry.label);
+		goto out;
+	}
+
 	array = nfs_readdir_get_array(page);
 	if (IS_ERR(array)) {
 		status = PTR_ERR(array);
-		goto out;
+		goto out_label_free;
 	}
 	memset(array, 0, sizeof(struct nfs_cache_array));
 	array->eof_index = -1;
@@ -614,6 +625,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 	nfs_readdir_free_large_page(pages_ptr, pages, array_size);
 out_release_array:
 	nfs_readdir_release_array(page);
+out_label_free:
+	nfs4_label_free(entry.label);
 out:
 	nfs_free_fattr(entry.fattr);
 	nfs_free_fhandle(entry.fh);
@@ -629,7 +642,7 @@ out:
 static
 int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
 {
-	struct inode	*inode = desc->file->f_path.dentry->d_inode;
+	struct inode	*inode = file_inode(desc->file);
 	int ret;
 
 	ret = nfs_readdir_xdr_to_array(desc, page, inode);
@@ -660,7 +673,7 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
 static
 struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
 {
-	return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping,
+	return read_cache_page(file_inode(desc->file)->i_mapping,
 			desc->page_index, (filler_t *)nfs_readdir_filler, desc);
 }
 
@@ -702,8 +715,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
  * Once we've found the start of the dirent within a page: fill 'er up...
  */
 static 
-int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
-		   filldir_t filldir)
+int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
 {
 	struct file	*file = desc->file;
 	int i = 0;
@@ -721,13 +733,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 		struct nfs_cache_array_entry *ent;
 
 		ent = &array->array[i];
-		if (filldir(dirent, ent->string.name, ent->string.len,
-		    file->f_pos, nfs_compat_user_ino64(ent->ino),
-		    ent->d_type) < 0) {
+		if (!dir_emit(desc->ctx, ent->string.name, ent->string.len,
+		    nfs_compat_user_ino64(ent->ino), ent->d_type)) {
 			desc->eof = 1;
 			break;
 		}
-		file->f_pos++;
+		desc->ctx->pos++;
 		if (i < (array->size-1))
 			*desc->dir_cookie = array->array[i+1].cookie;
 		else
@@ -759,12 +770,11 @@ out:
  *	 directory in the page cache by the time we get here.
  */
 static inline
-int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
-		     filldir_t filldir)
+int uncached_readdir(nfs_readdir_descriptor_t *desc)
 {
 	struct page	*page = NULL;
 	int		status;
-	struct inode *inode = desc->file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(desc->file);
 	struct nfs_open_dir_context *ctx = desc->file->private_data;
 
 	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
@@ -785,7 +795,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	if (status < 0)
 		goto out_release;
 
-	status = nfs_do_filldir(desc, dirent, filldir);
+	status = nfs_do_filldir(desc);
 
  out:
 	dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
@@ -800,35 +810,36 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
    last cookie cache takes care of the common case of reading the
    whole directory.
  */
-static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int nfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry	*dentry = filp->f_path.dentry;
+	struct dentry	*dentry = file->f_path.dentry;
 	struct inode	*inode = dentry->d_inode;
 	nfs_readdir_descriptor_t my_desc,
 			*desc = &my_desc;
-	struct nfs_open_dir_context *dir_ctx = filp->private_data;
+	struct nfs_open_dir_context *dir_ctx = file->private_data;
 	int res;
 
 	dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
 			dentry->d_parent->d_name.name, dentry->d_name.name,
-			(long long)filp->f_pos);
+			(long long)ctx->pos);
 	nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
 
 	/*
-	 * filp->f_pos points to the dirent entry number.
+	 * ctx->pos points to the dirent entry number.
 	 * *desc->dir_cookie has the cookie for the next entry. We have
 	 * to either find the entry with the appropriate number or
 	 * revalidate the cookie.
 	 */
 	memset(desc, 0, sizeof(*desc));
 
-	desc->file = filp;
+	desc->file = file;
+	desc->ctx = ctx;
 	desc->dir_cookie = &dir_ctx->dir_cookie;
 	desc->decode = NFS_PROTO(inode)->decode_dirent;
-	desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0;
+	desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
 
 	nfs_block_sillyrename(dentry);
-	res = nfs_revalidate_mapping(inode, filp->f_mapping);
+	res = nfs_revalidate_mapping(inode, file->f_mapping);
 	if (res < 0)
 		goto out;
 
@@ -840,7 +851,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			/* This means either end of directory */
 			if (*desc->dir_cookie && desc->eof == 0) {
 				/* Or that the server has 'lost' a cookie */
-				res = uncached_readdir(desc, dirent, filldir);
+				res = uncached_readdir(desc);
 				if (res == 0)
 					continue;
 			}
@@ -857,7 +868,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (res < 0)
 			break;
 
-		res = nfs_do_filldir(desc, dirent, filldir);
+		res = nfs_do_filldir(desc);
 		if (res < 0)
 			break;
 	} while (!desc->eof);
@@ -1040,6 +1051,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 	struct dentry *parent;
 	struct nfs_fh *fhandle = NULL;
 	struct nfs_fattr *fattr = NULL;
+	struct nfs4_label *label = NULL;
 	int error;
 
 	if (flags & LOOKUP_RCU)
@@ -1082,7 +1094,11 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 	if (fhandle == NULL || fattr == NULL)
 		goto out_error;
 
-	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+	label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
+	if (IS_ERR(label))
+		goto out_error;
+
+	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
 	if (error)
 		goto out_bad;
 	if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1090,8 +1106,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 	if ((error = nfs_refresh_inode(inode, fattr)) != 0)
 		goto out_bad;
 
+	nfs_setsecurity(inode, fattr, label);
+
 	nfs_free_fattr(fattr);
 	nfs_free_fhandle(fhandle);
+	nfs4_label_free(label);
+
 out_set_verifier:
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
  out_valid:
@@ -1108,6 +1128,7 @@ out_zap_parent:
  out_bad:
 	nfs_free_fattr(fattr);
 	nfs_free_fhandle(fhandle);
+	nfs4_label_free(label);
 	nfs_mark_for_revalidate(dir);
 	if (inode && S_ISDIR(inode->i_mode)) {
 		/* Purge readdir caches. */
@@ -1128,6 +1149,7 @@ out_zap_parent:
 out_error:
 	nfs_free_fattr(fattr);
 	nfs_free_fhandle(fhandle);
+	nfs4_label_free(label);
 	dput(parent);
 	dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
 			__func__, dentry->d_parent->d_name.name,
@@ -1136,6 +1158,45 @@ out_error:
 }
 
 /*
+ * A weaker form of d_revalidate for revalidating just the dentry->d_inode
+ * when we don't really care about the dentry name. This is called when a
+ * pathwalk ends on a dentry that was not found via a normal lookup in the
+ * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals).
+ *
+ * In this situation, we just want to verify that the inode itself is OK
+ * since the dentry might have changed on the server.
+ */
+static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	int error;
+	struct inode *inode = dentry->d_inode;
+
+	/*
+	 * I believe we can only get a negative dentry here in the case of a
+	 * procfs-style symlink. Just assume it's correct for now, but we may
+	 * eventually need to do something more here.
+	 */
+	if (!inode) {
+		dfprintk(LOOKUPCACHE, "%s: %s/%s has negative inode\n",
+				__func__, dentry->d_parent->d_name.name,
+				dentry->d_name.name);
+		return 1;
+	}
+
+	if (is_bad_inode(inode)) {
+		dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n",
+				__func__, dentry->d_parent->d_name.name,
+				dentry->d_name.name);
+		return 0;
+	}
+
+	error = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n",
+			__func__, inode->i_ino, error ? "invalid" : "valid");
+	return !error;
+}
+
+/*
  * This is called from dput() when d_count is going to 0.
  */
 static int nfs_dentry_delete(const struct dentry *dentry)
@@ -1202,6 +1263,7 @@ static void nfs_d_release(struct dentry *dentry)
 
 const struct dentry_operations nfs_dentry_operations = {
 	.d_revalidate	= nfs_lookup_revalidate,
+	.d_weak_revalidate	= nfs_weak_revalidate,
 	.d_delete	= nfs_dentry_delete,
 	.d_iput		= nfs_dentry_iput,
 	.d_automount	= nfs_d_automount,
@@ -1216,6 +1278,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
 	struct inode *inode = NULL;
 	struct nfs_fh *fhandle = NULL;
 	struct nfs_fattr *fattr = NULL;
+	struct nfs4_label *label = NULL;
 	int error;
 
 	dfprintk(VFS, "NFS: lookup(%s/%s)\n",
@@ -1242,17 +1305,21 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
 	if (fhandle == NULL || fattr == NULL)
 		goto out;
 
+	label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
+	if (IS_ERR(label))
+		goto out;
+
 	parent = dentry->d_parent;
 	/* Protect against concurrent sillydeletes */
 	nfs_block_sillyrename(parent);
-	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+	error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
 	if (error == -ENOENT)
 		goto no_entry;
 	if (error < 0) {
 		res = ERR_PTR(error);
 		goto out_unblock_sillyrename;
 	}
-	inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+	inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
 	res = ERR_CAST(inode);
 	if (IS_ERR(res))
 		goto out_unblock_sillyrename;
@@ -1270,6 +1337,7 @@ no_entry:
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 out_unblock_sillyrename:
 	nfs_unblock_sillyrename(parent);
+	nfs4_label_free(label);
 out:
 	nfs_free_fattr(fattr);
 	nfs_free_fhandle(fhandle);
@@ -1317,18 +1385,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
 {
 	int err;
 
-	if (ctx->dentry != dentry) {
-		dput(ctx->dentry);
-		ctx->dentry = dget(dentry);
-	}
-
-	/* If the open_intent is for execute, we have an extra check to make */
-	if (ctx->mode & FMODE_EXEC) {
-		err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags);
-		if (err < 0)
-			goto out;
-	}
-
 	err = finish_open(file, dentry, do_open, opened);
 	if (err)
 		goto out;
@@ -1387,13 +1443,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 
 	nfs_block_sillyrename(dentry->d_parent);
 	inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
-	d_drop(dentry);
+	nfs_unblock_sillyrename(dentry->d_parent);
 	if (IS_ERR(inode)) {
-		nfs_unblock_sillyrename(dentry->d_parent);
 		put_nfs_open_context(ctx);
 		err = PTR_ERR(inode);
 		switch (err) {
 		case -ENOENT:
+			d_drop(dentry);
 			d_add(dentry, NULL);
 			break;
 		case -EISDIR:
@@ -1409,16 +1465,8 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 		}
 		goto out;
 	}
-	res = d_add_unique(dentry, inode);
-	if (res != NULL)
-		dentry = res;
-
-	nfs_unblock_sillyrename(dentry->d_parent);
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-
-	err = nfs_finish_open(ctx, dentry, file, open_flags, opened);
 
-	dput(res);
+	err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened);
 out:
 	return err;
 
@@ -1446,6 +1494,8 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 		goto no_open;
 	if (d_mountpoint(dentry))
 		goto no_open;
+	if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1)
+		goto no_open;
 
 	inode = dentry->d_inode;
 	parent = dget_parent(dentry);
@@ -1486,7 +1536,8 @@ no_open:
  * Code common to create, mkdir, and mknod.
  */
 int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
-				struct nfs_fattr *fattr)
+				struct nfs_fattr *fattr,
+				struct nfs4_label *label)
 {
 	struct dentry *parent = dget_parent(dentry);
 	struct inode *dir = parent->d_inode;
@@ -1499,18 +1550,18 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
 	if (dentry->d_inode)
 		goto out;
 	if (fhandle->size == 0) {
-		error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
+		error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
 		if (error)
 			goto out_error;
 	}
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	if (!(fattr->valid & NFS_ATTR_FATTR)) {
 		struct nfs_server *server = NFS_SB(dentry->d_sb);
-		error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
+		error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL);
 		if (error < 0)
 			goto out_error;
 	}
-	inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
+	inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
 	error = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_error;
@@ -1679,7 +1730,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
 		dir->i_ino, dentry->d_name.name);
 
 	spin_lock(&dentry->d_lock);
-	if (dentry->d_count > 1) {
+	if (d_count(dentry) > 1) {
 		spin_unlock(&dentry->d_lock);
 		/* Start asynchronous writeout of the inode */
 		write_inode_now(dentry->d_inode, 0);
@@ -1717,7 +1768,6 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
  */
 int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 {
-	struct pagevec lru_pvec;
 	struct page *page;
 	char *kaddr;
 	struct iattr attr;
@@ -1757,11 +1807,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	 * No big deal if we can't add this page to the page cache here.
 	 * READLINK will get the missing page from the server if needed.
 	 */
-	pagevec_init(&lru_pvec, 0);
-	if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
+	if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0,
 							GFP_KERNEL)) {
-		pagevec_add(&lru_pvec, page);
-		pagevec_lru_add_file(&lru_pvec);
 		SetPageUptodate(page);
 		unlock_page(page);
 	} else
@@ -1828,7 +1875,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
 		 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
 		 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
-		 new_dentry->d_count);
+		 d_count(new_dentry));
 
 	/*
 	 * For non-directories, check whether the target is busy and if so,
@@ -1846,7 +1893,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			rehash = new_dentry;
 		}
 
-		if (new_dentry->d_count > 2) {
+		if (d_count(new_dentry) > 2) {
 			int err;
 
 			/* copy the target dentry's name */
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index ca4b11ec87a..fc0f95ec735 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -10,6 +10,7 @@
 
 #include <linux/module.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/dns_resolver.h>
 #include "dns_resolve.h"
 
@@ -28,7 +29,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
 	kfree(ip_addr);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
 
 #else
 
@@ -42,6 +42,7 @@ EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
 #include <linux/seq_file.h>
 #include <linux/inet.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/cache.h>
 #include <linux/sunrpc/svcauth.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
@@ -142,7 +143,7 @@ static int nfs_dns_upcall(struct cache_detail *cd,
 
 	ret = nfs_cache_upcall(cd, key->hostname);
 	if (ret)
-		ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request);
+		ret = sunrpc_cache_pipe_upcall(cd, ch);
 	return ret;
 }
 
@@ -349,64 +350,65 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name,
 		ret = -ESRCH;
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
+
+static struct cache_detail nfs_dns_resolve_template = {
+	.owner		= THIS_MODULE,
+	.hash_size	= NFS_DNS_HASHTBL_SIZE,
+	.name		= "dns_resolve",
+	.cache_put	= nfs_dns_ent_put,
+	.cache_upcall	= nfs_dns_upcall,
+	.cache_request	= nfs_dns_request,
+	.cache_parse	= nfs_dns_parse,
+	.cache_show	= nfs_dns_show,
+	.match		= nfs_dns_match,
+	.init		= nfs_dns_ent_init,
+	.update		= nfs_dns_ent_update,
+	.alloc		= nfs_dns_ent_alloc,
+};
+
 
 int nfs_dns_resolver_cache_init(struct net *net)
 {
-	int err = -ENOMEM;
+	int err;
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
-	struct cache_detail *cd;
-	struct cache_head **tbl;
 
-	cd = kzalloc(sizeof(struct cache_detail), GFP_KERNEL);
-	if (cd == NULL)
-		goto err_cd;
-
-	tbl = kzalloc(NFS_DNS_HASHTBL_SIZE * sizeof(struct cache_head *),
-			GFP_KERNEL);
-	if (tbl == NULL)
-		goto err_tbl;
-
-	cd->owner = THIS_MODULE,
-	cd->hash_size = NFS_DNS_HASHTBL_SIZE,
-	cd->hash_table = tbl,
-	cd->name = "dns_resolve",
-	cd->cache_put = nfs_dns_ent_put,
-	cd->cache_upcall = nfs_dns_upcall,
-	cd->cache_parse = nfs_dns_parse,
-	cd->cache_show = nfs_dns_show,
-	cd->match = nfs_dns_match,
-	cd->init = nfs_dns_ent_init,
-	cd->update = nfs_dns_ent_update,
-	cd->alloc = nfs_dns_ent_alloc,
-
-	nfs_cache_init(cd);
-	err = nfs_cache_register_net(net, cd);
+	nn->nfs_dns_resolve = cache_create_net(&nfs_dns_resolve_template, net);
+	if (IS_ERR(nn->nfs_dns_resolve))
+		return PTR_ERR(nn->nfs_dns_resolve);
+
+	err = nfs_cache_register_net(net, nn->nfs_dns_resolve);
 	if (err)
 		goto err_reg;
-	nn->nfs_dns_resolve = cd;
 	return 0;
 
 err_reg:
-	nfs_cache_destroy(cd);
-	kfree(cd->hash_table);
-err_tbl:
-	kfree(cd);
-err_cd:
+	cache_destroy_net(nn->nfs_dns_resolve, net);
 	return err;
 }
 
 void nfs_dns_resolver_cache_destroy(struct net *net)
 {
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
-	struct cache_detail *cd = nn->nfs_dns_resolve;
 
-	nfs_cache_unregister_net(net, cd);
-	nfs_cache_destroy(cd);
-	kfree(cd->hash_table);
-	kfree(cd);
+	nfs_cache_unregister_net(net, nn->nfs_dns_resolve);
+	cache_destroy_net(nn->nfs_dns_resolve, net);
 }
 
+static int nfs4_dns_net_init(struct net *net)
+{
+	return nfs_dns_resolver_cache_init(net);
+}
+
+static void nfs4_dns_net_exit(struct net *net)
+{
+	nfs_dns_resolver_cache_destroy(net);
+}
+
+static struct pernet_operations nfs4_dns_resolver_ops = {
+	.init = nfs4_dns_net_init,
+	.exit = nfs4_dns_net_exit,
+};
+
 static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
 			   void *ptr)
 {
@@ -443,11 +445,24 @@ static struct notifier_block nfs_dns_resolver_block = {
 
 int nfs_dns_resolver_init(void)
 {
-	return rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
+	int err;
+
+	err = register_pernet_subsys(&nfs4_dns_resolver_ops);
+	if (err < 0)
+		goto out;
+	err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
+	if (err < 0)
+		goto out1;
+	return 0;
+out1:
+	unregister_pernet_subsys(&nfs4_dns_resolver_ops);
+out:
+	return err;
 }
 
 void nfs_dns_resolver_destroy(void)
 {
 	rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
+	unregister_pernet_subsys(&nfs4_dns_resolver_ops);
 }
 #endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 3c2b893665b..94e94bd11aa 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -292,7 +292,7 @@ static int
 nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	int ret;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 
 	do {
 		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
@@ -451,11 +451,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
  * - Called if either PG_private or PG_fscache is set on the page
  * - Caller holds page lock
  */
-static void nfs_invalidate_page(struct page *page, unsigned long offset)
+static void nfs_invalidate_page(struct page *page, unsigned int offset,
+				unsigned int length)
 {
-	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
+	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
+		 page, offset, length);
 
-	if (offset != 0)
+	if (offset != 0 || length < PAGE_CACHE_SIZE)
 		return;
 	/* Cancel any unstarted writes on this page */
 	nfs_wb_page_cancel(page_file_mapping(page)->host, page);
@@ -493,6 +495,35 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 	return nfs_fscache_release_page(page, gfp);
 }
 
+static void nfs_check_dirty_writeback(struct page *page,
+				bool *dirty, bool *writeback)
+{
+	struct nfs_inode *nfsi;
+	struct address_space *mapping = page_file_mapping(page);
+
+	if (!mapping || PageSwapCache(page))
+		return;
+
+	/*
+	 * Check if an unstable page is currently being committed and
+	 * if so, have the VM treat it as if the page is under writeback
+	 * so it will not block due to pages that will shortly be freeable.
+	 */
+	nfsi = NFS_I(mapping->host);
+	if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
+		*writeback = true;
+		return;
+	}
+
+	/*
+	 * If PagePrivate() is set, then the page is not freeable and as the
+	 * inode is not being committed, it's not going to be cleaned in the
+	 * near future so treat it as dirty
+	 */
+	if (PagePrivate(page))
+		*dirty = true;
+}
+
 /*
  * Attempt to clear the private state associated with a page when an error
  * occurs that requires the cached contents of an inode to be written back or
@@ -540,6 +571,7 @@ const struct address_space_operations nfs_file_aops = {
 	.direct_IO = nfs_direct_IO,
 	.migratepage = nfs_migrate_page,
 	.launder_page = nfs_launder_page,
+	.is_dirty_writeback = nfs_check_dirty_writeback,
 	.error_remove_page = generic_error_remove_page,
 #ifdef CONFIG_NFS_SWAP
 	.swap_activate = nfs_swap_activate,
@@ -744,6 +776,7 @@ static int
 do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
 	struct inode *inode = filp->f_mapping->host;
+	struct nfs_lock_context *l_ctx;
 	int status;
 
 	/*
@@ -752,6 +785,14 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 	 */
 	nfs_sync_mapping(filp->f_mapping);
 
+	l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
+	if (!IS_ERR(l_ctx)) {
+		status = nfs_iocounter_wait(&l_ctx->io_count);
+		nfs_put_lock_context(l_ctx);
+		if (status < 0)
+			return status;
+	}
+
 	/* NOTE: special case
 	 * 	If we're signalled while cleaning up locks on process exit, we
 	 * 	still need to complete the unlock.
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 033803c3664..66984a9aafa 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -95,7 +95,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
 		goto out;
 	}
 
-	inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
+	inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL);
 	if (IS_ERR(inode)) {
 		dprintk("nfs_get_root: get root inode failed\n");
 		ret = ERR_CAST(inode);
@@ -126,8 +126,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
 	}
 	spin_unlock(&ret->d_lock);
 out:
-	if (name)
-		kfree(name);
+	kfree(name);
 	nfs_free_fattr(fsinfo.fattr);
 	return ret;
 }
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index bc3968fa81e..c2c4163d568 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -97,7 +97,7 @@ static void nfs_fattr_free_group_name(struct nfs_fattr *fattr)
 static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr)
 {
 	struct nfs4_string *owner = fattr->owner_name;
-	__u32 uid;
+	kuid_t uid;
 
 	if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME))
 		return false;
@@ -111,7 +111,7 @@ static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr
 static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr)
 {
 	struct nfs4_string *group = fattr->group_name;
-	__u32 gid;
+	kgid_t gid;
 
 	if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME))
 		return false;
@@ -193,7 +193,8 @@ static int nfs_idmap_init_keyring(void)
 	if (!cred)
 		return -ENOMEM;
 
-	keyring = keyring_alloc(".id_resolver", 0, 0, cred,
+	keyring = keyring_alloc(".id_resolver",
+				GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
 				(KEY_POS_ALL & ~KEY_POS_SETATTR) |
 				KEY_USR_VIEW | KEY_USR_READ,
 				KEY_ALLOC_NOT_IN_QUOTA, NULL);
@@ -261,29 +262,42 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
 	return desclen;
 }
 
-static ssize_t nfs_idmap_request_key(struct key_type *key_type,
-				     const char *name, size_t namelen,
-				     const char *type, void *data,
-				     size_t data_size, struct idmap *idmap)
+static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
+					 const char *type, struct idmap *idmap)
 {
-	const struct cred *saved_cred;
-	struct key *rkey;
 	char *desc;
-	struct user_key_payload *payload;
+	struct key *rkey;
 	ssize_t ret;
 
 	ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
 	if (ret <= 0)
-		goto out;
+		return ERR_PTR(ret);
+
+	rkey = request_key(&key_type_id_resolver, desc, "");
+	if (IS_ERR(rkey)) {
+		mutex_lock(&idmap->idmap_mutex);
+		rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
+						desc, "", 0, idmap);
+		mutex_unlock(&idmap->idmap_mutex);
+	}
+
+	kfree(desc);
+	return rkey;
+}
+
+static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
+				 const char *type, void *data,
+				 size_t data_size, struct idmap *idmap)
+{
+	const struct cred *saved_cred;
+	struct key *rkey;
+	struct user_key_payload *payload;
+	ssize_t ret;
 
 	saved_cred = override_creds(id_resolver_cache);
-	if (idmap)
-		rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap);
-	else
-		rkey = request_key(&key_type_id_resolver, desc, "");
+	rkey = nfs_idmap_request_key(name, namelen, type, idmap);
 	revert_creds(saved_cred);
 
-	kfree(desc);
 	if (IS_ERR(rkey)) {
 		ret = PTR_ERR(rkey);
 		goto out;
@@ -315,23 +329,6 @@ out:
 	return ret;
 }
 
-static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
-				 const char *type, void *data,
-				 size_t data_size, struct idmap *idmap)
-{
-	ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver,
-					    name, namelen, type, data,
-					    data_size, NULL);
-	if (ret < 0) {
-		mutex_lock(&idmap->idmap_mutex);
-		ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
-					    name, namelen, type, data,
-					    data_size, idmap);
-		mutex_unlock(&idmap->idmap_mutex);
-	}
-	return ret;
-}
-
 /* ID -> Name */
 static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
 				     size_t buflen, struct idmap *idmap)
@@ -725,9 +722,9 @@ out1:
 	return ret;
 }
 
-static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data)
+static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen)
 {
-	return key_instantiate_and_link(key, data, strlen(data) + 1,
+	return key_instantiate_and_link(key, data, datalen,
 					id_resolver_cache->thread_keyring,
 					authkey);
 }
@@ -737,6 +734,7 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im,
 		struct key *key, struct key *authkey)
 {
 	char id_str[NFS_UINT_MAXLEN];
+	size_t len;
 	int ret = -ENOKEY;
 
 	/* ret = -ENOKEY */
@@ -746,13 +744,15 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im,
 	case IDMAP_CONV_NAMETOID:
 		if (strcmp(upcall->im_name, im->im_name) != 0)
 			break;
-		sprintf(id_str, "%d", im->im_id);
-		ret = nfs_idmap_instantiate(key, authkey, id_str);
+		/* Note: here we store the NUL terminator too */
+		len = sprintf(id_str, "%d", im->im_id) + 1;
+		ret = nfs_idmap_instantiate(key, authkey, id_str, len);
 		break;
 	case IDMAP_CONV_IDTONAME:
 		if (upcall->im_id != im->im_id)
 			break;
-		ret = nfs_idmap_instantiate(key, authkey, im->im_name);
+		len = strlen(im->im_name);
+		ret = nfs_idmap_instantiate(key, authkey, im->im_name, len);
 		break;
 	default:
 		ret = -EINVAL;
@@ -764,7 +764,7 @@ out:
 static ssize_t
 idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
-	struct rpc_inode *rpci = RPC_I(filp->f_path.dentry->d_inode);
+	struct rpc_inode *rpci = RPC_I(file_inode(filp));
 	struct idmap *idmap = (struct idmap *)rpci->private;
 	struct key_construction *cons;
 	struct idmap_msg im;
@@ -836,43 +836,61 @@ idmap_release_pipe(struct inode *inode)
 	nfs_idmap_abort_pipe_upcall(idmap, -EPIPE);
 }
 
-int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
+int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid)
 {
 	struct idmap *idmap = server->nfs_client->cl_idmap;
+	__u32 id = -1;
+	int ret = 0;
 
-	if (nfs_map_string_to_numeric(name, namelen, uid))
-		return 0;
-	return nfs_idmap_lookup_id(name, namelen, "uid", uid, idmap);
+	if (!nfs_map_string_to_numeric(name, namelen, &id))
+		ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap);
+	if (ret == 0) {
+		*uid = make_kuid(&init_user_ns, id);
+		if (!uid_valid(*uid))
+			ret = -ERANGE;
+	}
+	return ret;
 }
 
-int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
+int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid)
 {
 	struct idmap *idmap = server->nfs_client->cl_idmap;
+	__u32 id = -1;
+	int ret = 0;
 
-	if (nfs_map_string_to_numeric(name, namelen, gid))
-		return 0;
-	return nfs_idmap_lookup_id(name, namelen, "gid", gid, idmap);
+	if (!nfs_map_string_to_numeric(name, namelen, &id))
+		ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap);
+	if (ret == 0) {
+		*gid = make_kgid(&init_user_ns, id);
+		if (!gid_valid(*gid))
+			ret = -ERANGE;
+	}
+	return ret;
 }
 
-int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
+int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen)
 {
 	struct idmap *idmap = server->nfs_client->cl_idmap;
 	int ret = -EINVAL;
+	__u32 id;
 
+	id = from_kuid(&init_user_ns, uid);
 	if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
-		ret = nfs_idmap_lookup_name(uid, "user", buf, buflen, idmap);
+		ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap);
 	if (ret < 0)
-		ret = nfs_map_numeric_to_string(uid, buf, buflen);
+		ret = nfs_map_numeric_to_string(id, buf, buflen);
 	return ret;
 }
-int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
+int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen)
 {
 	struct idmap *idmap = server->nfs_client->cl_idmap;
 	int ret = -EINVAL;
+	__u32 id;
 
+	id = from_kgid(&init_user_ns, gid);
 	if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
-		ret = nfs_idmap_lookup_name(gid, "group", buf, buflen, idmap);
+		ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap);
 	if (ret < 0)
-		ret = nfs_map_numeric_to_string(gid, buf, buflen);
+		ret = nfs_map_numeric_to_string(id, buf, buflen);
 	return ret;
 }
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ebeb94ce1b0..c93639e6cf6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,7 +48,6 @@
 #include "iostat.h"
 #include "internal.h"
 #include "fscache.h"
-#include "dns_resolve.h"
 #include "pnfs.h"
 #include "nfs.h"
 #include "netns.h"
@@ -79,7 +78,7 @@ int nfs_wait_bit_killable(void *word)
 {
 	if (fatal_signal_pending(current))
 		return -ERESTARTSYS;
-	freezable_schedule();
+	freezable_schedule_unsafe();
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
@@ -162,11 +161,19 @@ static void nfs_zap_caches_locked(struct inode *inode)
 
 	memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
 	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
 		nfs_fscache_invalidate(inode);
-	} else {
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
-	}
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+					| NFS_INO_INVALID_LABEL
+					| NFS_INO_INVALID_DATA
+					| NFS_INO_INVALID_ACCESS
+					| NFS_INO_INVALID_ACL
+					| NFS_INO_REVAL_PAGECACHE;
+	} else
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+					| NFS_INO_INVALID_LABEL
+					| NFS_INO_INVALID_ACCESS
+					| NFS_INO_INVALID_ACL
+					| NFS_INO_REVAL_PAGECACHE;
 }
 
 void nfs_zap_caches(struct inode *inode)
@@ -237,6 +244,8 @@ nfs_find_actor(struct inode *inode, void *opaque)
 
 	if (NFS_FILEID(inode) != fattr->fileid)
 		return 0;
+	if ((S_IFMT & inode->i_mode) != (S_IFMT & fattr->mode))
+		return 0;
 	if (nfs_compare_fh(NFS_FH(inode), fh))
 		return 0;
 	if (is_bad_inode(inode) || NFS_STALE(inode))
@@ -255,12 +264,72 @@ nfs_init_locked(struct inode *inode, void *opaque)
 	return 0;
 }
 
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+					struct nfs4_label *label)
+{
+	int error;
+
+	if (label == NULL)
+		return;
+
+	if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0)
+		return;
+
+	if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2)
+		return;
+
+	if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
+		error = security_inode_notifysecctx(inode, label->label,
+				label->len);
+		if (error)
+			printk(KERN_ERR "%s() %s %d "
+					"security_inode_notifysecctx() %d\n",
+					__func__,
+					(char *)label->label,
+					label->len, error);
+	}
+}
+
+struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
+{
+	struct nfs4_label *label = NULL;
+	int minor_version = server->nfs_client->cl_minorversion;
+
+	if (minor_version < 2)
+		return label;
+
+	if (!(server->caps & NFS_CAP_SECURITY_LABEL))
+		return label;
+
+	label = kzalloc(sizeof(struct nfs4_label), flags);
+	if (label == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	label->label = kzalloc(NFS4_MAXLABELLEN, flags);
+	if (label->label == NULL) {
+		kfree(label);
+		return ERR_PTR(-ENOMEM);
+	}
+	label->len = NFS4_MAXLABELLEN;
+
+	return label;
+}
+EXPORT_SYMBOL_GPL(nfs4_label_alloc);
+#else
+void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+					struct nfs4_label *label)
+{
+}
+#endif
+EXPORT_SYMBOL_GPL(nfs_setsecurity);
+
 /*
  * This is our front-end to iget that looks up inodes by file handle
  * instead of inode number.
  */
 struct inode *
-nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
+nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct nfs_find_desc desc = {
 		.fh	= fh,
@@ -332,8 +401,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		inode->i_version = 0;
 		inode->i_size = 0;
 		clear_nlink(inode);
-		inode->i_uid = -2;
-		inode->i_gid = -2;
+		inode->i_uid = make_kuid(&init_user_ns, -2);
+		inode->i_gid = make_kgid(&init_user_ns, -2);
 		inode->i_blocks = 0;
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
 		nfsi->write_io = 0;
@@ -382,6 +451,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			 */
 			inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
 		}
+
+		nfs_setsecurity(inode, fattr, label);
+
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
 		nfsi->access_cache = RB_ROOT;
@@ -391,6 +463,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		unlock_new_inode(inode);
 	} else
 		nfs_refresh_inode(inode, fattr);
+		nfs_setsecurity(inode, fattr, label);
 	dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n",
 		inode->i_sb->s_id,
 		(long long)NFS_FILEID(inode),
@@ -447,7 +520,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		NFS_PROTO(inode)->return_delegation(inode);
 	error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
 	if (error == 0)
-		nfs_refresh_inode(inode, fattr);
+		error = nfs_refresh_inode(inode, fattr);
 	nfs_free_fattr(fattr);
 out:
 	return error;
@@ -559,20 +632,22 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
 	l_ctx->lockowner.l_owner = current->files;
 	l_ctx->lockowner.l_pid = current->tgid;
 	INIT_LIST_HEAD(&l_ctx->list);
+	nfs_iocounter_init(&l_ctx->io_count);
 }
 
 static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
 {
-	struct nfs_lock_context *pos;
+	struct nfs_lock_context *head = &ctx->lock_context;
+	struct nfs_lock_context *pos = head;
 
-	list_for_each_entry(pos, &ctx->lock_context.list, list) {
+	do {
 		if (pos->lockowner.l_owner != current->files)
 			continue;
 		if (pos->lockowner.l_pid != current->tgid)
 			continue;
 		atomic_inc(&pos->count);
 		return pos;
-	}
+	} while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head);
 	return NULL;
 }
 
@@ -694,10 +769,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 	if (ctx->cred != NULL)
 		put_rpccred(ctx->cred);
 	dput(ctx->dentry);
-	if (is_sync)
-		nfs_sb_deactive(sb);
-	else
-		nfs_sb_deactive_async(sb);
+	nfs_sb_deactive(sb);
 	kfree(ctx->mdsthreshold);
 	kfree(ctx);
 }
@@ -712,16 +784,23 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context);
  * Ensure that mmap has a recent RPC credential for use when writing out
  * shared pages
  */
-void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = ctx->dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	filp->private_data = get_nfs_open_context(ctx);
 	spin_lock(&inode->i_lock);
 	list_add(&ctx->list, &nfsi->open_files);
 	spin_unlock(&inode->i_lock);
 }
+EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
+
+void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
+{
+	filp->private_data = get_nfs_open_context(ctx);
+	if (list_empty(&ctx->list))
+		nfs_inode_attach_open_context(ctx);
+}
 EXPORT_SYMBOL_GPL(nfs_file_set_open_context);
 
 /*
@@ -747,10 +826,11 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
 
 static void nfs_file_clear_open_context(struct file *filp)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct nfs_open_context *ctx = nfs_file_open_context(filp);
 
 	if (ctx) {
+		struct inode *inode = ctx->dentry->d_inode;
+
 		filp->private_data = NULL;
 		spin_lock(&inode->i_lock);
 		list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
@@ -789,6 +869,7 @@ int
 __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
 	int		 status = -ESTALE;
+	struct nfs4_label *label = NULL;
 	struct nfs_fattr *fattr = NULL;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
@@ -806,7 +887,14 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		goto out;
 
 	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
-	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr);
+
+	label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+	if (IS_ERR(label)) {
+		status = PTR_ERR(label);
+		goto out;
+	}
+
+	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);
 	if (status != 0) {
 		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
 			 inode->i_sb->s_id,
@@ -816,7 +904,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 			if (!S_ISDIR(inode->i_mode))
 				set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
 		}
-		goto out;
+		goto err_out;
 	}
 
 	status = nfs_refresh_inode(inode, fattr);
@@ -824,7 +912,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
 			 inode->i_sb->s_id,
 			 (long long)NFS_FILEID(inode), status);
-		goto out;
+		goto err_out;
 	}
 
 	if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
@@ -834,7 +922,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		inode->i_sb->s_id,
 		(long long)NFS_FILEID(inode));
 
- out:
+err_out:
+	nfs4_label_free(label);
+out:
 	nfs_free_fattr(fattr);
 	return status;
 }
@@ -862,7 +952,8 @@ static int nfs_attribute_cache_expired(struct inode *inode)
  */
 int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
-	if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
+	if (!(NFS_I(inode)->cache_validity &
+			(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
 			&& !nfs_attribute_cache_expired(inode))
 		return NFS_STALE(inode) ? -ESTALE : 0;
 	return __nfs_revalidate_inode(server, inode);
@@ -1009,9 +1100,9 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	/* Have any file permissions changed? */
 	if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
 		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
-	if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid)
+	if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid))
 		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
-	if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid)
+	if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid))
 		invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
 
 	/* Has the link count changed? */
@@ -1242,6 +1333,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	spin_lock(&inode->i_lock);
 	status = nfs_post_op_update_inode_locked(inode, fattr);
 	spin_unlock(&inode->i_lock);
+
 	return status;
 }
 EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
@@ -1440,7 +1532,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				| NFS_INO_REVAL_FORCED);
 
 	if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
-		if (inode->i_uid != fattr->uid) {
+		if (!uid_eq(inode->i_uid, fattr->uid)) {
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 			inode->i_uid = fattr->uid;
 		}
@@ -1451,7 +1543,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 				| NFS_INO_REVAL_FORCED);
 
 	if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
-		if (inode->i_gid != fattr->gid) {
+		if (!gid_eq(inode->i_gid, fattr->gid)) {
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 			inode->i_gid = fattr->gid;
 		}
@@ -1482,7 +1574,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		inode->i_blocks = fattr->du.nfs2.blocks;
 
 	/* Update attrtimeo value if we're out of the unstable period */
-	if (invalid & NFS_INO_INVALID_ATTR) {
+	if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) {
 		nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
@@ -1495,6 +1587,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		}
 	}
 	invalid &= ~NFS_INO_INVALID_ATTR;
+	invalid &= ~NFS_INO_INVALID_LABEL;
 	/* Don't invalidate the data if we were to blame */
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
 				|| S_ISLNK(inode->i_mode)))
@@ -1637,12 +1730,11 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
 static int nfs_net_init(struct net *net)
 {
 	nfs_clients_init(net);
-	return nfs_dns_resolver_cache_init(net);
+	return 0;
 }
 
 static void nfs_net_exit(struct net *net)
 {
-	nfs_dns_resolver_cache_destroy(net);
 	nfs_cleanup_cb_ident_idr(net);
 }
 
@@ -1660,10 +1752,6 @@ static int __init init_nfs_fs(void)
 {
 	int err;
 
-	err = nfs_dns_resolver_init();
-	if (err < 0)
-		goto out10;;
-
 	err = register_pernet_subsys(&nfs_net_ops);
 	if (err < 0)
 		goto out9;
@@ -1729,8 +1817,6 @@ out7:
 out8:
 	unregister_pernet_subsys(&nfs_net_ops);
 out9:
-	nfs_dns_resolver_destroy();
-out10:
 	return err;
 }
 
@@ -1743,7 +1829,6 @@ static void __exit exit_nfs_fs(void)
 	nfs_destroy_nfspagecache();
 	nfs_fscache_unregister();
 	unregister_pernet_subsys(&nfs_net_ops);
-	nfs_dns_resolver_destroy();
 #ifdef CONFIG_PROC_FS
 	rpc_proc_unregister(&init_net, "nfs");
 #endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f0e6c7df1a0..3c8373f90ab 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -165,7 +165,7 @@ extern void nfs_free_client(struct nfs_client *);
 extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
 extern struct nfs_client *
 nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
-				struct nfs4_sessionid *);
+				struct nfs4_sessionid *, u32);
 extern struct nfs_server *nfs_create_server(struct nfs_mount_info *,
 					struct nfs_subversion *);
 extern struct nfs_server *nfs4_create_server(
@@ -229,6 +229,13 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
 			      struct nfs_pgio_header *hdr,
 			      void (*release)(struct nfs_pgio_header *hdr));
 void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
+int nfs_iocounter_wait(struct nfs_io_counter *c);
+
+static inline void nfs_iocounter_init(struct nfs_io_counter *c)
+{
+	c->flags = 0;
+	atomic_set(&c->io_count, 0);
+}
 
 /* nfs2xdr.c */
 extern struct rpc_procinfo nfs_procedures[];
@@ -248,6 +255,7 @@ extern int nfs4_decode_dirent(struct xdr_stream *,
 #ifdef CONFIG_NFS_V4_1
 extern const u32 nfs41_maxread_overhead;
 extern const u32 nfs41_maxwrite_overhead;
+extern const u32 nfs41_maxgetdevinfo_overhead;
 #endif
 
 /* nfs4proc.c */
@@ -329,7 +337,6 @@ extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
 extern void nfs_sb_active(struct super_block *sb);
 extern void nfs_sb_deactive(struct super_block *sb);
-extern void nfs_sb_deactive_async(struct super_block *sb);
 
 /* namespace.c */
 #define NFS_PATH_CANONICAL 1
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 91a6faf811a..99a45283b9e 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -139,7 +139,10 @@ struct mnt_fhstatus {
  * nfs_mount - Obtain an NFS file handle for the given host and path
  * @info: pointer to mount request arguments
  *
- * Uses default timeout parameters specified by underlying transport.
+ * Uses default timeout parameters specified by underlying transport. On
+ * successful return, the auth_flavs list and auth_flav_len will be populated
+ * with the list from the server or a faked-up list if the server didn't
+ * provide one.
  */
 int nfs_mount(struct nfs_mount_request *info)
 {
@@ -195,6 +198,15 @@ int nfs_mount(struct nfs_mount_request *info)
 	dprintk("NFS: MNT request succeeded\n");
 	status = 0;
 
+	/*
+	 * If the server didn't provide a flavor list, allow the
+	 * client to try any flavor.
+	 */
+	if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) {
+		dprintk("NFS: Faking up auth_flavs list\n");
+		info->auth_flavs[0] = RPC_AUTH_NULL;
+		*info->auth_flav_len = 1;
+	}
 out:
 	return status;
 
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index fc8dc20fdeb..348b535cd78 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -280,7 +280,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry,
 	struct dentry *parent = dget_parent(dentry);
 
 	/* Look it up again to get its attributes */
-	err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr);
+	err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL);
 	dput(parent);
 	if (err != 0)
 		return ERR_PTR(err);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 06b9df49f7f..62db136339e 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -290,8 +290,13 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 
 	fattr->mode = be32_to_cpup(p++);
 	fattr->nlink = be32_to_cpup(p++);
-	fattr->uid = be32_to_cpup(p++);
-	fattr->gid = be32_to_cpup(p++);
+	fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++));
+	if (!uid_valid(fattr->uid))
+		goto out_uid;
+	fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++));
+	if (!gid_valid(fattr->gid))
+		goto out_gid;
+		
 	fattr->size = be32_to_cpup(p++);
 	fattr->du.nfs2.blocksize = be32_to_cpup(p++);
 
@@ -313,6 +318,12 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 	fattr->change_attr = nfs_timespec_to_change_attr(&fattr->ctime);
 
 	return 0;
+out_uid:
+	dprintk("NFS: returned invalid uid\n");
+	return -EINVAL;
+out_gid:
+	dprintk("NFS: returned invalid gid\n");
+	return -EINVAL;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
 	return -EIO;
@@ -351,11 +362,11 @@ static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr)
 	else
 		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
 	if (attr->ia_valid & ATTR_UID)
-		*p++ = cpu_to_be32(attr->ia_uid);
+		*p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid));
 	else
 		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
 	if (attr->ia_valid & ATTR_GID)
-		*p++ = cpu_to_be32(attr->ia_gid);
+		*p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid));
 	else
 		*p++ = cpu_to_be32(NFS2_SATTR_NOT_SET);
 	if (attr->ia_valid & ATTR_SIZE)
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 70efb63b1e4..f5c84c3efbc 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -33,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 		res = rpc_call_sync(clnt, msg, flags);
 		if (res != -EJUKEBOX)
 			break;
-		freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
+		freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
 		res = -ERESTARTSYS;
 	} while (!fatal_signal_pending(current));
 	return res;
@@ -98,7 +98,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
  */
 static int
 nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
-		struct nfs_fattr *fattr)
+		struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_GETATTR],
@@ -143,7 +143,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 
 static int
 nfs3_proc_lookup(struct inode *dir, struct qstr *name,
-		 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+		 struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+		 struct nfs4_label *label)
 {
 	struct nfs3_diropargs	arg = {
 		.fh		= NFS_FH(dir),
@@ -300,7 +301,7 @@ static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_
 	status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
 	nfs_post_op_update_inode(dir, data->res.dir_attr);
 	if (status == 0)
-		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
 	return status;
 }
 
@@ -872,7 +873,7 @@ static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess
 static int
 nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 
 	return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
 }
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index bffc32406fb..fa6d72131c1 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -592,13 +592,13 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr)
 
 	if (attr->ia_valid & ATTR_UID) {
 		*p++ = xdr_one;
-		*p++ = cpu_to_be32(attr->ia_uid);
+		*p++ = cpu_to_be32(from_kuid(&init_user_ns, attr->ia_uid));
 	} else
 		*p++ = xdr_zero;
 
 	if (attr->ia_valid & ATTR_GID) {
 		*p++ = xdr_one;
-		*p++ = cpu_to_be32(attr->ia_gid);
+		*p++ = cpu_to_be32(from_kgid(&init_user_ns, attr->ia_gid));
 	} else
 		*p++ = xdr_zero;
 
@@ -657,8 +657,12 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 
 	fattr->mode = (be32_to_cpup(p++) & ~S_IFMT) | fmode;
 	fattr->nlink = be32_to_cpup(p++);
-	fattr->uid = be32_to_cpup(p++);
-	fattr->gid = be32_to_cpup(p++);
+	fattr->uid = make_kuid(&init_user_ns, be32_to_cpup(p++));
+	if (!uid_valid(fattr->uid))
+		goto out_uid;
+	fattr->gid = make_kgid(&init_user_ns, be32_to_cpup(p++));
+	if (!gid_valid(fattr->gid))
+		goto out_gid;
 
 	p = xdr_decode_size3(p, &fattr->size);
 	p = xdr_decode_size3(p, &fattr->du.nfs3.used);
@@ -675,6 +679,12 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr)
 
 	fattr->valid |= NFS_ATTR_FATTR_V3;
 	return 0;
+out_uid:
+	dprintk("NFS: returned invalid uid\n");
+	return -EINVAL;
+out_gid:
+	dprintk("NFS: returned invalid gid\n");
+	return -EINVAL;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
 	return -EIO;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a3f488b074a..ee81e354bce 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -13,6 +13,8 @@
 
 #define NFS4_MAX_LOOP_ON_RECOVER (10)
 
+#include <linux/seqlock.h>
+
 struct idmap;
 
 enum nfs4_client_state {
@@ -34,6 +36,7 @@ enum nfs4_client_state {
 
 struct nfs4_minor_version_ops {
 	u32	minor_version;
+	unsigned init_caps;
 
 	int	(*call_sync)(struct rpc_clnt *clnt,
 			struct nfs_server *server,
@@ -44,6 +47,8 @@ struct nfs4_minor_version_ops {
 			const nfs4_stateid *);
 	int	(*find_root_sec)(struct nfs_server *, struct nfs_fh *,
 			struct nfs_fsinfo *);
+	int	(*free_lock_state)(struct nfs_server *,
+			struct nfs4_lock_state *);
 	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
 	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
 	const struct nfs4_state_maintenance_ops *state_renewal_ops;
@@ -90,6 +95,8 @@ struct nfs4_state_owner {
 	unsigned long	     so_flags;
 	struct list_head     so_states;
 	struct nfs_seqid_counter so_seqid;
+	seqcount_t	     so_reclaim_seqcount;
+	struct mutex	     so_delegreturn_mutex;
 };
 
 enum {
@@ -139,12 +146,14 @@ struct nfs4_lock_state {
 enum {
 	LK_STATE_IN_USE,
 	NFS_DELEGATED_STATE,		/* Current stateid is delegation */
+	NFS_OPEN_STATE,			/* OPEN stateid is set */
 	NFS_O_RDONLY_STATE,		/* OPEN stateid has read-only state */
 	NFS_O_WRONLY_STATE,		/* OPEN stateid has write-only state */
 	NFS_O_RDWR_STATE,		/* OPEN stateid has read/write state */
 	NFS_STATE_RECLAIM_REBOOT,	/* OPEN stateid server rebooted */
 	NFS_STATE_RECLAIM_NOGRACE,	/* OPEN stateid needs to recover state */
 	NFS_STATE_POSIX_LOCKS,		/* Posix locks are supported */
+	NFS_STATE_RECOVERY_FAILED,	/* OPEN stateid state recovery failed */
 };
 
 struct nfs4_state {
@@ -185,7 +194,7 @@ struct nfs4_state_recovery_ops {
 	int (*recover_lock)(struct nfs4_state *, struct file_lock *);
 	int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
 	struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
-	int (*reclaim_complete)(struct nfs_client *);
+	int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *);
 	int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
 		struct rpc_cred *);
 };
@@ -227,8 +236,11 @@ extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struc
 extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *,
 			    struct nfs_fh *, struct nfs_fattr *);
 extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
-extern int nfs4_release_lockowner(struct nfs4_lock_state *);
 extern const struct xattr_handler *nfs4_xattr_handlers[];
+extern int nfs4_set_rw_stateid(nfs4_stateid *stateid,
+		const struct nfs_open_context *ctx,
+		const struct nfs_lock_context *l_ctx,
+		fmode_t fmode);
 
 #if defined(CONFIG_NFS_V4_1)
 static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -291,10 +303,10 @@ is_ds_client(struct nfs_client *clp)
 extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
 
 extern const u32 nfs4_fattr_bitmap[3];
-extern const u32 nfs4_statfs_bitmap[2];
-extern const u32 nfs4_pathconf_bitmap[2];
+extern const u32 nfs4_statfs_bitmap[3];
+extern const u32 nfs4_pathconf_bitmap[3];
 extern const u32 nfs4_fsinfo_bitmap[3];
-extern const u32 nfs4_fs_locations_bitmap[2];
+extern const u32 nfs4_fs_locations_bitmap[3];
 
 void nfs4_free_client(struct nfs_client *);
 
@@ -343,13 +355,13 @@ extern int nfs4_wait_clnt_recover(struct nfs_client *clp);
 extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
-extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
+extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs41_handle_server_scope(struct nfs_client *,
 				      struct nfs41_server_scope **);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
-extern void nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
+extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *,
 		fmode_t, const struct nfs_lockowner *);
 
 extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
@@ -408,6 +420,11 @@ static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_statei
 	return memcmp(dst, src, sizeof(*dst)) == 0;
 }
 
+static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
+{
+	return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0;
+}
+
 #else
 
 #define nfs4_close_state(a, b) do { } while (0)
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 2e9779b58b7..90dce91dd5b 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -6,6 +6,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_idmap.h>
 #include <linux/nfs_mount.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/auth.h>
 #include <linux/sunrpc/xprt.h>
 #include <linux/sunrpc/bc_xprt.h>
@@ -29,15 +30,14 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
 
 	if (clp->rpc_ops->version != 4 || minorversion != 0)
 		return ret;
-retry:
-	if (!idr_pre_get(&nn->cb_ident_idr, GFP_KERNEL))
-		return -ENOMEM;
+	idr_preload(GFP_KERNEL);
 	spin_lock(&nn->nfs_client_lock);
-	ret = idr_get_new(&nn->cb_ident_idr, clp, &clp->cl_cb_ident);
+	ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT);
+	if (ret >= 0)
+		clp->cl_cb_ident = ret;
 	spin_unlock(&nn->nfs_client_lock);
-	if (ret == -EAGAIN)
-		goto retry;
-	return ret;
+	idr_preload_end();
+	return ret < 0 ? ret : 0;
 }
 
 #ifdef CONFIG_NFS_V4_1
@@ -66,6 +66,11 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
 	if (err)
 		goto error;
 
+	if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) {
+		err = -EINVAL;
+		goto error;
+	}
+
 	spin_lock_init(&clp->cl_lock);
 	INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
 	rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -198,8 +203,12 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
 	/* Check NFS protocol revision and initialize RPC op vector */
 	clp->rpc_ops = &nfs_v4_clientops;
 
+	if (clp->cl_minorversion != 0)
+		__set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
 	__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
-	error = nfs_create_rpc_client(clp, timeparms, authflavour);
+	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
+	if (error == -EINVAL)
+		error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
 	if (error < 0)
 		goto error;
 
@@ -300,7 +309,7 @@ int nfs40_walk_client_list(struct nfs_client *new,
 			   struct rpc_cred *cred)
 {
 	struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
-	struct nfs_client *pos, *n, *prev = NULL;
+	struct nfs_client *pos, *prev = NULL;
 	struct nfs4_setclientid_res clid = {
 		.clientid	= new->cl_clientid,
 		.confirm	= new->cl_confirm,
@@ -308,10 +317,23 @@ int nfs40_walk_client_list(struct nfs_client *new,
 	int status = -NFS4ERR_STALE_CLIENTID;
 
 	spin_lock(&nn->nfs_client_lock);
-	list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
+	list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
 		/* If "pos" isn't marked ready, we can't trust the
 		 * remaining fields in "pos" */
-		if (pos->cl_cons_state < NFS_CS_READY)
+		if (pos->cl_cons_state > NFS_CS_READY) {
+			atomic_inc(&pos->cl_count);
+			spin_unlock(&nn->nfs_client_lock);
+
+			if (prev)
+				nfs_put_client(prev);
+			prev = pos;
+
+			status = nfs_wait_client_init_complete(pos);
+			spin_lock(&nn->nfs_client_lock);
+			if (status < 0)
+				continue;
+		}
+		if (pos->cl_cons_state != NFS_CS_READY)
 			continue;
 
 		if (pos->rpc_ops != new->rpc_ops)
@@ -423,16 +445,16 @@ int nfs41_walk_client_list(struct nfs_client *new,
 			   struct rpc_cred *cred)
 {
 	struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
-	struct nfs_client *pos, *n, *prev = NULL;
+	struct nfs_client *pos, *prev = NULL;
 	int status = -NFS4ERR_STALE_CLIENTID;
 
 	spin_lock(&nn->nfs_client_lock);
-	list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
+	list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
 		/* If "pos" isn't marked ready, we can't trust the
 		 * remaining fields in "pos", especially the client
 		 * ID and serverowner fields.  Wait for CREATE_SESSION
 		 * to finish. */
-		if (pos->cl_cons_state < NFS_CS_READY) {
+		if (pos->cl_cons_state > NFS_CS_READY) {
 			atomic_inc(&pos->cl_count);
 			spin_unlock(&nn->nfs_client_lock);
 
@@ -440,18 +462,17 @@ int nfs41_walk_client_list(struct nfs_client *new,
 				nfs_put_client(prev);
 			prev = pos;
 
-			nfs4_schedule_lease_recovery(pos);
 			status = nfs_wait_client_init_complete(pos);
-			if (status < 0) {
-				nfs_put_client(pos);
-				spin_lock(&nn->nfs_client_lock);
-				continue;
+			if (status == 0) {
+				nfs4_schedule_lease_recovery(pos);
+				status = nfs4_wait_clnt_recover(pos);
 			}
-			status = pos->cl_cons_state;
 			spin_lock(&nn->nfs_client_lock);
 			if (status < 0)
 				continue;
 		}
+		if (pos->cl_cons_state != NFS_CS_READY)
+			continue;
 
 		if (pos->rpc_ops != new->rpc_ops)
 			continue;
@@ -469,17 +490,18 @@ int nfs41_walk_client_list(struct nfs_client *new,
 			continue;
 
 		atomic_inc(&pos->cl_count);
-		spin_unlock(&nn->nfs_client_lock);
+		*result = pos;
+		status = 0;
 		dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
 			__func__, pos, atomic_read(&pos->cl_count));
-
-		*result = pos;
-		return 0;
+		break;
 	}
 
 	/* No matching nfs_client found. */
 	spin_unlock(&nn->nfs_client_lock);
 	dprintk("NFS: <-- %s status = %d\n", __func__, status);
+	if (prev)
+		nfs_put_client(prev);
 	return status;
 }
 #endif	/* CONFIG_NFS_V4_1 */
@@ -545,14 +567,14 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
  */
 struct nfs_client *
 nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
-			   struct nfs4_sessionid *sid)
+			   struct nfs4_sessionid *sid, u32 minorversion)
 {
 	struct nfs_client *clp;
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
 
 	spin_lock(&nn->nfs_client_lock);
 	list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
-		if (nfs4_cb_match_client(addr, clp, 1) == false)
+		if (nfs4_cb_match_client(addr, clp, minorversion) == false)
 			continue;
 
 		if (!nfs4_has_session(clp))
@@ -575,7 +597,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
 
 struct nfs_client *
 nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
-			   struct nfs4_sessionid *sid)
+			   struct nfs4_sessionid *sid, u32 minorversion)
 {
 	return NULL;
 }
@@ -609,6 +631,8 @@ static int nfs4_set_client(struct nfs_server *server,
 
 	if (server->flags & NFS_MOUNT_NORESVPORT)
 		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+	if (server->options & NFS_OPTION_MIGRATION)
+		set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
 
 	/* Allocate or find a client reference we can use */
 	clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
@@ -713,10 +737,23 @@ static int nfs4_server_common_setup(struct nfs_server *server,
 		return -ENOMEM;
 
 	/* We must ensure the session is initialised first */
-	error = nfs4_init_session(server);
+	error = nfs4_init_session(server->nfs_client);
 	if (error < 0)
 		goto out;
 
+	/* Set the basic capabilities */
+	server->caps |= server->nfs_client->cl_mvops->init_caps;
+	if (server->flags & NFS_MOUNT_NORDIRPLUS)
+			server->caps &= ~NFS_CAP_READDIRPLUS;
+	/*
+	 * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
+	 * authentication.
+	 */
+	if (nfs4_disable_idmapping &&
+			server->client->cl_auth->au_flavor == RPC_AUTH_UNIX)
+		server->caps |= NFS_CAP_UIDGID_NOMAP;
+
+
 	/* Probe the root fh to retrieve its FSID and filehandle */
 	error = nfs4_get_rootfh(server, mntfh);
 	if (error < 0)
@@ -760,9 +797,6 @@ static int nfs4_init_server(struct nfs_server *server,
 
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
-	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|NFS_CAP_POSIX_LOCK;
-	if (!(data->flags & NFS_MOUNT_NORDIRPLUS))
-			server->caps |= NFS_CAP_READDIRPLUS;
 	server->options = data->options;
 
 	/* Get a client record */
@@ -779,13 +813,6 @@ static int nfs4_init_server(struct nfs_server *server,
 	if (error < 0)
 		goto error;
 
-	/*
-	 * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
-	 * authentication.
-	 */
-	if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX)
-		server->caps |= NFS_CAP_UIDGID_NOMAP;
-
 	if (data->rsize)
 		server->rsize = nfs_block_size(data->rsize, NULL);
 	if (data->wsize)
@@ -863,7 +890,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 
 	/* Initialise the client representation from the parent server */
 	nfs_server_copy_userdata(server, parent_server);
-	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
 
 	/* Get a client representation.
 	 * Note: NFSv4 always uses TCP, */
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 08ddcccb888..e5b804dd944 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -69,7 +69,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 			goto out_drop;
 		}
 	}
-	iput(inode);
 	if (inode != dentry->d_inode)
 		goto out_drop;
 
@@ -94,7 +93,7 @@ static int
 nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	int ret;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 
 	do {
 		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 194c4841033..17ed87ef9de 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -99,7 +99,8 @@ static void filelayout_reset_write(struct nfs_write_data *data)
 
 		task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
 							&hdr->pages,
-							hdr->completion_ops);
+							hdr->completion_ops,
+							hdr->dreq);
 	}
 }
 
@@ -119,7 +120,8 @@ static void filelayout_reset_read(struct nfs_read_data *data)
 
 		task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
 							&hdr->pages,
-							hdr->completion_ops);
+							hdr->completion_ops,
+							hdr->dreq);
 	}
 }
 
@@ -127,7 +129,6 @@ static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo)
 {
 	if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
 		return;
-	clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags);
 	pnfs_return_layout(inode);
 }
 
@@ -157,11 +158,14 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 	case -NFS4ERR_OPENMODE:
 		if (state == NULL)
 			break;
-		nfs4_schedule_stateid_recovery(mds_server, state);
+		if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
+			goto out_bad_stateid;
 		goto wait_on_recovery;
 	case -NFS4ERR_EXPIRED:
-		if (state != NULL)
-			nfs4_schedule_stateid_recovery(mds_server, state);
+		if (state != NULL) {
+			if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
+				goto out_bad_stateid;
+		}
 		nfs4_schedule_lease_recovery(mds_client);
 		goto wait_on_recovery;
 	/* DS session errors */
@@ -225,6 +229,9 @@ reset:
 out:
 	task->tk_status = 0;
 	return -EAGAIN;
+out_bad_stateid:
+	task->tk_status = -EIO;
+	return 0;
 wait_on_recovery:
 	rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
 	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
@@ -298,6 +305,10 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
 {
 	struct nfs_read_data *rdata = data;
 
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) {
+		rpc_exit(task, -EIO);
+		return;
+	}
 	if (filelayout_reset_to_mds(rdata->header->lseg)) {
 		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
 		filelayout_reset_read(rdata);
@@ -306,10 +317,13 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
 	}
 	rdata->read_done_cb = filelayout_read_done_cb;
 
-	nfs41_setup_sequence(rdata->ds_clp->cl_session,
+	if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
 			&rdata->args.seq_args,
 			&rdata->res.seq_res,
-			task);
+			task))
+		return;
+	nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context,
+			rdata->args.lock_context, FMODE_READ);
 }
 
 static void filelayout_read_call_done(struct rpc_task *task, void *data)
@@ -400,16 +414,23 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)
 {
 	struct nfs_write_data *wdata = data;
 
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) {
+		rpc_exit(task, -EIO);
+		return;
+	}
 	if (filelayout_reset_to_mds(wdata->header->lseg)) {
 		dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
 		filelayout_reset_write(wdata);
 		rpc_exit(task, 0);
 		return;
 	}
-	nfs41_setup_sequence(wdata->ds_clp->cl_session,
+	if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
 			&wdata->args.seq_args,
 			&wdata->res.seq_res,
-			task);
+			task))
+		return;
+	nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context,
+			wdata->args.lock_context, FMODE_WRITE);
 }
 
 static void filelayout_write_call_done(struct rpc_task *task, void *data)
@@ -622,7 +643,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 	d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
 				   NFS_SERVER(lo->plh_inode)->nfs_client, id);
 	if (d == NULL) {
-		dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags);
+		dsaddr = filelayout_get_device_info(lo->plh_inode, id,
+				lo->plh_lc_cred, gfp_flags);
 		if (dsaddr == NULL)
 			goto out;
 	} else
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 8c07241fe52..cebd20e7e92 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -36,7 +36,7 @@
  * Default data server connection timeout and retrans vaules.
  * Set by module paramters dataserver_timeo and dataserver_retrans.
  */
-#define NFS4_DEF_DS_TIMEO   60
+#define NFS4_DEF_DS_TIMEO   600 /* in tenths of a second */
 #define NFS4_DEF_DS_RETRANS 5
 
 /*
@@ -70,6 +70,8 @@ struct nfs4_pnfs_ds {
 	struct list_head	ds_addrs;
 	struct nfs_client	*ds_clp;
 	atomic_t		ds_count;
+	unsigned long		ds_state;
+#define NFS4DS_CONNECTING	0	/* ds is establishing connection */
 };
 
 struct nfs4_file_layout_dsaddr {
@@ -148,6 +150,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
 extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
+filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
+		struct rpc_cred *cred, gfp_t gfp_flags);
 
 #endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index b720064bcd7..95604f64cab 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -31,6 +31,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/vmalloc.h>
 #include <linux/module.h>
+#include <linux/sunrpc/addr.h>
 
 #include "internal.h"
 #include "nfs4session.h"
@@ -667,7 +668,10 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
  * of available devices, and return it.
  */
 struct nfs4_file_layout_dsaddr *
-filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags)
+filelayout_get_device_info(struct inode *inode,
+		struct nfs4_deviceid *dev_id,
+		struct rpc_cred *cred,
+		gfp_t gfp_flags)
 {
 	struct pnfs_device *pdev = NULL;
 	u32 max_resp_sz;
@@ -707,8 +711,9 @@ filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gf
 	pdev->pgbase = 0;
 	pdev->pglen = max_resp_sz;
 	pdev->mincount = 0;
+	pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
 
-	rc = nfs4_proc_getdeviceinfo(server, pdev);
+	rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
 	dprintk("%s getdevice info returns %d\n", __func__, rc);
 	if (rc)
 		goto out_free;
@@ -774,6 +779,22 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
 	return flseg->fh_array[i];
 }
 
+static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
+{
+	might_sleep();
+	wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
+			nfs_wait_bit_killable, TASK_KILLABLE);
+}
+
+static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
+{
+	smp_mb__before_clear_bit();
+	clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
+}
+
+
 struct nfs4_pnfs_ds *
 nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 {
@@ -790,16 +811,22 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 		filelayout_mark_devid_invalid(devid);
 		return NULL;
 	}
+	if (ds->ds_clp)
+		return ds;
 
-	if (!ds->ds_clp) {
+	if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
 		struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
 		int err;
 
 		err = nfs4_ds_connect(s, ds);
 		if (err) {
 			nfs4_mark_deviceid_unavailable(devid);
-			return NULL;
+			ds = NULL;
 		}
+		nfs4_clear_ds_conn_bit(ds);
+	} else {
+		/* Either ds is connected, or ds is NULL */
+		nfs4_wait_ds_connect(ds);
 	}
 	return ds;
 }
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 1e09eb78543..cdb0b41a481 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include "internal.h"
@@ -133,33 +134,38 @@ static size_t nfs_parse_server_name(char *string, size_t len,
 	return ret;
 }
 
+/**
+ * nfs_find_best_sec - Find a security mechanism supported locally
+ * @flavors: List of security tuples returned by SECINFO procedure
+ *
+ * Return the pseudoflavor of the first security mechanism in
+ * "flavors" that is locally supported.  Return RPC_AUTH_UNIX if
+ * no matching flavor is found in the array.  The "flavors" array
+ * is searched in the order returned from the server, per RFC 3530
+ * recommendation.
+ */
 rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
 {
-	struct gss_api_mech *mech;
-	struct xdr_netobj oid;
-	int i;
-	rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
+	rpc_authflavor_t pseudoflavor;
+	struct nfs4_secinfo4 *secinfo;
+	unsigned int i;
 
 	for (i = 0; i < flavors->num_flavors; i++) {
-		struct nfs4_secinfo_flavor *flavor;
-		flavor = &flavors->flavors[i];
-
-		if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) {
-			pseudoflavor = flavor->flavor;
-			break;
-		} else if (flavor->flavor == RPC_AUTH_GSS) {
-			oid.len  = flavor->gss.sec_oid4.len;
-			oid.data = flavor->gss.sec_oid4.data;
-			mech = gss_mech_get_by_OID(&oid);
-			if (!mech)
-				continue;
-			pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service);
-			gss_mech_put(mech);
+		secinfo = &flavors->flavors[i];
+
+		switch (secinfo->flavor) {
+		case RPC_AUTH_NULL:
+		case RPC_AUTH_UNIX:
+		case RPC_AUTH_GSS:
+			pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor,
+							&secinfo->flavor_info);
+			if (pseudoflavor != RPC_AUTH_MAXFLAVOR)
+				return pseudoflavor;
 			break;
 		}
 	}
 
-	return pseudoflavor;
+	return RPC_AUTH_UNIX;
 }
 
 static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cf747ef8665..cf11799297c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,15 +77,68 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
 static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
-static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *);
-static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
+static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
 static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			    struct nfs_fattr *fattr, struct iattr *sattr,
-			    struct nfs4_state *state);
+			    struct nfs4_state *state, struct nfs4_label *ilabel,
+			    struct nfs4_label *olabel);
 #ifdef CONFIG_NFS_V4_1
-static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *);
-static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *);
+static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
+		struct rpc_cred *);
+static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
+		struct rpc_cred *);
 #endif
+
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+	struct iattr *sattr, struct nfs4_label *label)
+{
+	int err;
+
+	if (label == NULL)
+		return NULL;
+
+	if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
+		return NULL;
+
+	if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2)
+		return NULL;
+
+	err = security_dentry_init_security(dentry, sattr->ia_mode,
+				&dentry->d_name, (void **)&label->label, &label->len);
+	if (err == 0)
+		return label;
+
+	return NULL;
+}
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{
+	if (label)
+		security_release_secctx(label->label, label->len);
+}
+static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{
+	if (label)
+		return server->attr_bitmask;
+
+	return server->attr_bitmask_nl;
+}
+#else
+static inline struct nfs4_label *
+nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
+	struct iattr *sattr, struct nfs4_label *l)
+{ return NULL; }
+static inline void
+nfs4_label_release_security(struct nfs4_label *label)
+{ return; }
+static inline u32 *
+nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
+{ return server->attr_bitmask; }
+#endif
+
 /* Prevent leaks of NFSv4 errors into userland */
 static int nfs4_map_errors(int err)
 {
@@ -93,6 +146,8 @@ static int nfs4_map_errors(int err)
 		return err;
 	switch (err) {
 	case -NFS4ERR_RESOURCE:
+	case -NFS4ERR_LAYOUTTRYLATER:
+	case -NFS4ERR_RECALLCONFLICT:
 		return -EREMOTEIO;
 	case -NFS4ERR_WRONGSEC:
 		return -EPERM;
@@ -105,6 +160,8 @@ static int nfs4_map_errors(int err)
 		return -EPROTONOSUPPORT;
 	case -NFS4ERR_ACCESS:
 		return -EACCES;
+	case -NFS4ERR_FILE_OPEN:
+		return -EBUSY;
 	default:
 		dprintk("%s could not handle NFSv4 error %d\n",
 				__func__, -err);
@@ -130,7 +187,10 @@ const u32 nfs4_fattr_bitmap[3] = {
 	| FATTR4_WORD1_SPACE_USED
 	| FATTR4_WORD1_TIME_ACCESS
 	| FATTR4_WORD1_TIME_METADATA
-	| FATTR4_WORD1_TIME_MODIFY
+	| FATTR4_WORD1_TIME_MODIFY,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+	FATTR4_WORD2_SECURITY_LABEL
+#endif
 };
 
 static const u32 nfs4_pnfs_open_bitmap[3] = {
@@ -157,7 +217,7 @@ static const u32 nfs4_open_noattr_bitmap[3] = {
 	| FATTR4_WORD0_FILEID,
 };
 
-const u32 nfs4_statfs_bitmap[2] = {
+const u32 nfs4_statfs_bitmap[3] = {
 	FATTR4_WORD0_FILES_AVAIL
 	| FATTR4_WORD0_FILES_FREE
 	| FATTR4_WORD0_FILES_TOTAL,
@@ -166,7 +226,7 @@ const u32 nfs4_statfs_bitmap[2] = {
 	| FATTR4_WORD1_SPACE_TOTAL
 };
 
-const u32 nfs4_pathconf_bitmap[2] = {
+const u32 nfs4_pathconf_bitmap[3] = {
 	FATTR4_WORD0_MAXLINK
 	| FATTR4_WORD0_MAXNAME,
 	0
@@ -181,7 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
 			FATTR4_WORD2_LAYOUT_BLKSIZE
 };
 
-const u32 nfs4_fs_locations_bitmap[2] = {
+const u32 nfs4_fs_locations_bitmap[3] = {
 	FATTR4_WORD0_TYPE
 	| FATTR4_WORD0_CHANGE
 	| FATTR4_WORD0_SIZE
@@ -197,7 +257,7 @@ const u32 nfs4_fs_locations_bitmap[2] = {
 	| FATTR4_WORD1_TIME_ACCESS
 	| FATTR4_WORD1_TIME_METADATA
 	| FATTR4_WORD1_TIME_MODIFY
-	| FATTR4_WORD1_MOUNTED_ON_FILEID
+	| FATTR4_WORD1_MOUNTED_ON_FILEID,
 };
 
 static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
@@ -264,7 +324,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 		*timeout = NFS4_POLL_RETRY_MIN;
 	if (*timeout > NFS4_POLL_RETRY_MAX)
 		*timeout = NFS4_POLL_RETRY_MAX;
-	freezable_schedule_timeout_killable(*timeout);
+	freezable_schedule_timeout_killable_unsafe(*timeout);
 	if (fatal_signal_pending(current))
 		res = -ERESTARTSYS;
 	*timeout <<= 1;
@@ -293,19 +353,30 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 			}
 			if (state == NULL)
 				break;
-			nfs4_schedule_stateid_recovery(server, state);
+			ret = nfs4_schedule_stateid_recovery(server, state);
+			if (ret < 0)
+				break;
 			goto wait_on_recovery;
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
+			if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) {
+				nfs_remove_bad_delegation(inode);
+				exception->retry = 1;
+				break;
+			}
 			if (state == NULL)
 				break;
-			nfs_remove_bad_delegation(state->inode);
-			nfs4_schedule_stateid_recovery(server, state);
+			ret = nfs4_schedule_stateid_recovery(server, state);
+			if (ret < 0)
+				break;
 			goto wait_on_recovery;
 		case -NFS4ERR_EXPIRED:
-			if (state != NULL)
-				nfs4_schedule_stateid_recovery(server, state);
+			if (state != NULL) {
+				ret = nfs4_schedule_stateid_recovery(server, state);
+				if (ret < 0)
+					break;
+			}
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_STALE_CLIENTID:
 			nfs4_schedule_lease_recovery(clp);
@@ -557,7 +628,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
 	task->tk_timeout = 0;
 
 	spin_lock(&tbl->slot_tbl_lock);
-	if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
+	if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) &&
 	    !args->sa_privileged) {
 		/* The state manager will wait until the slot table is empty */
 		dprintk("%s session is draining\n", __func__);
@@ -747,6 +818,7 @@ struct nfs4_opendata {
 	struct nfs4_string owner_name;
 	struct nfs4_string group_name;
 	struct nfs_fattr f_attr;
+	struct nfs4_label *f_label;
 	struct dentry *dir;
 	struct dentry *dentry;
 	struct nfs4_state_owner *owner;
@@ -754,14 +826,45 @@ struct nfs4_opendata {
 	struct iattr attrs;
 	unsigned long timestamp;
 	unsigned int rpc_done : 1;
+	unsigned int is_recover : 1;
 	int rpc_status;
 	int cancelled;
 };
 
+static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server,
+		int err, struct nfs4_exception *exception)
+{
+	if (err != -EINVAL)
+		return false;
+	if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1))
+		return false;
+	server->caps &= ~NFS_CAP_ATOMIC_OPEN_V1;
+	exception->retry = 1;
+	return true;
+}
+
+static enum open_claim_type4
+nfs4_map_atomic_open_claim(struct nfs_server *server,
+		enum open_claim_type4 claim)
+{
+	if (server->caps & NFS_CAP_ATOMIC_OPEN_V1)
+		return claim;
+	switch (claim) {
+	default:
+		return claim;
+	case NFS4_OPEN_CLAIM_FH:
+		return NFS4_OPEN_CLAIM_NULL;
+	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+		return NFS4_OPEN_CLAIM_DELEGATE_CUR;
+	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+		return NFS4_OPEN_CLAIM_DELEGATE_PREV;
+	}
+}
 
 static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 {
 	p->o_res.f_attr = &p->f_attr;
+	p->o_res.f_label = p->f_label;
 	p->o_res.seqid = p->o_arg.seqid;
 	p->c_res.seqid = p->c_arg.seqid;
 	p->o_res.server = p->o_arg.server;
@@ -773,6 +876,8 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 		struct nfs4_state_owner *sp, fmode_t fmode, int flags,
 		const struct iattr *attrs,
+		struct nfs4_label *label,
+		enum open_claim_type4 claim,
 		gfp_t gfp_mask)
 {
 	struct dentry *parent = dget_parent(dentry);
@@ -783,15 +888,19 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p = kzalloc(sizeof(*p), gfp_mask);
 	if (p == NULL)
 		goto err;
+
+	p->f_label = nfs4_label_alloc(server, gfp_mask);
+	if (IS_ERR(p->f_label))
+		goto err_free_p;
+
 	p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
 	if (p->o_arg.seqid == NULL)
-		goto err_free;
+		goto err_free_label;
 	nfs_sb_active(dentry->d_sb);
 	p->dentry = dget(dentry);
 	p->dir = parent;
 	p->owner = sp;
 	atomic_inc(&sp->so_count);
-	p->o_arg.fh = NFS_FH(dir);
 	p->o_arg.open_flags = flags;
 	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
 	/* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
@@ -807,9 +916,22 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
 	p->o_arg.name = &dentry->d_name;
 	p->o_arg.server = server;
-	p->o_arg.bitmask = server->attr_bitmask;
+	p->o_arg.bitmask = nfs4_bitmask(server, label);
 	p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
-	p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
+	p->o_arg.label = label;
+	p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
+	switch (p->o_arg.claim) {
+	case NFS4_OPEN_CLAIM_NULL:
+	case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+	case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+		p->o_arg.fh = NFS_FH(dir);
+		break;
+	case NFS4_OPEN_CLAIM_PREVIOUS:
+	case NFS4_OPEN_CLAIM_FH:
+	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+		p->o_arg.fh = NFS_FH(dentry->d_inode);
+	}
 	if (attrs != NULL && attrs->ia_valid != 0) {
 		__be32 verf[2];
 
@@ -827,7 +949,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	nfs4_init_opendata_res(p);
 	kref_init(&p->kref);
 	return p;
-err_free:
+
+err_free_label:
+	nfs4_label_free(p->f_label);
+err_free_p:
 	kfree(p);
 err:
 	dput(parent);
@@ -844,6 +969,9 @@ static void nfs4_opendata_free(struct kref *kref)
 	if (p->state != NULL)
 		nfs4_put_open_state(p->state);
 	nfs4_put_state_owner(p->owner);
+
+	nfs4_label_free(p->f_label);
+
 	dput(p->dir);
 	dput(p->dentry);
 	nfs_sb_deactive(sb);
@@ -896,6 +1024,8 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
 		return 0;
 	if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
 		return 0;
+	if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+		return 0;
 	nfs_mark_delegation_referenced(delegation);
 	return 1;
 }
@@ -920,6 +1050,7 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
 		nfs4_stateid_copy(&state->stateid, stateid);
 	nfs4_stateid_copy(&state->open_stateid, stateid);
+	set_bit(NFS_OPEN_STATE, &state->flags);
 	switch (fmode) {
 		case FMODE_READ:
 			set_bit(NFS_O_RDONLY_STATE, &state->flags);
@@ -973,6 +1104,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat
 
 	spin_lock(&deleg_cur->lock);
 	if (nfsi->delegation != deleg_cur ||
+	   test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) ||
 	    (deleg_cur->type & fmode) != fmode)
 		goto no_delegation_unlock;
 
@@ -1017,7 +1149,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 	struct nfs4_state *state = opendata->state;
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs_delegation *delegation;
-	int open_mode = opendata->o_arg.open_flags & (O_EXCL|O_TRUNC);
+	int open_mode = opendata->o_arg.open_flags;
 	fmode_t fmode = opendata->o_arg.fmode;
 	nfs4_stateid stateid;
 	int ret = -EAGAIN;
@@ -1041,9 +1173,12 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 		/* Save the delegation */
 		nfs4_stateid_copy(&stateid, &delegation->stateid);
 		rcu_read_unlock();
-		ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
-		if (ret != 0)
-			goto out;
+		nfs_release_seqid(opendata->o_arg.seqid);
+		if (!opendata->is_recover) {
+			ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
+			if (ret != 0)
+				goto out;
+		}
 		ret = -EAGAIN;
 
 		/* Try to update the stateid using the delegation */
@@ -1115,6 +1250,8 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 	if (ret)
 		goto err;
 
+	nfs_setsecurity(inode, &data->f_attr, data->f_label);
+
 	if (data->o_res.delegation_type != 0)
 		nfs4_opendata_check_deleg(data, state);
 	update_open_stateid(state, &data->o_res.stateid, NULL,
@@ -1141,7 +1278,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 	ret = -EAGAIN;
 	if (!(data->f_attr.valid & NFS_ATTR_FATTR))
 		goto err;
-	inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr);
+	inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label);
 	ret = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto err;
@@ -1155,6 +1292,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 			data->o_arg.fmode);
 	iput(inode);
 out:
+	nfs_release_seqid(data->o_arg.seqid);
 	return state;
 err_put_inode:
 	iput(inode);
@@ -1187,11 +1325,13 @@ static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *
 	return ERR_PTR(-ENOENT);
 }
 
-static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, struct nfs4_state *state)
+static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx,
+		struct nfs4_state *state, enum open_claim_type4 claim)
 {
 	struct nfs4_opendata *opendata;
 
-	opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, NULL, GFP_NOFS);
+	opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0,
+			NULL, NULL, claim, GFP_NOFS);
 	if (opendata == NULL)
 		return ERR_PTR(-ENOMEM);
 	opendata->state = state;
@@ -1227,6 +1367,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
 
 	/* memory barrier prior to reading state->n_* */
 	clear_bit(NFS_DELEGATED_STATE, &state->flags);
+	clear_bit(NFS_OPEN_STATE, &state->flags);
 	smp_rmb();
 	if (state->n_rdwr != 0) {
 		clear_bit(NFS_O_RDWR_STATE, &state->flags);
@@ -1277,11 +1418,10 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
 	fmode_t delegation_type = 0;
 	int status;
 
-	opendata = nfs4_open_recoverdata_alloc(ctx, state);
+	opendata = nfs4_open_recoverdata_alloc(ctx, state,
+			NFS4_OPEN_CLAIM_PREVIOUS);
 	if (IS_ERR(opendata))
 		return PTR_ERR(opendata);
-	opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS;
-	opendata->o_arg.fh = NFS_FH(state->inode);
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(state->inode)->delegation);
 	if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0)
@@ -1300,6 +1440,8 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
 	int err;
 	do {
 		err = _nfs4_do_open_reclaim(ctx, state);
+		if (nfs4_clear_cap_atomic_open_v1(server, err, &exception))
+			continue;
 		if (err != -NFS4ERR_DELAY)
 			break;
 		nfs4_handle_exception(server, err, &exception);
@@ -1314,71 +1456,72 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta
 
 	ctx = nfs4_state_find_open_context(state);
 	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
+		return -EAGAIN;
 	ret = nfs4_do_open_reclaim(ctx, state);
 	put_nfs_open_context(ctx);
 	return ret;
 }
 
-static int _nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
+static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs4_state *state, const nfs4_stateid *stateid, int err)
 {
-	struct nfs4_opendata *opendata;
-	int ret;
-
-	opendata = nfs4_open_recoverdata_alloc(ctx, state);
-	if (IS_ERR(opendata))
-		return PTR_ERR(opendata);
-	opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR;
-	nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
-	ret = nfs4_open_recover(opendata, state);
-	nfs4_opendata_put(opendata);
-	return ret;
+	switch (err) {
+		default:
+			printk(KERN_ERR "NFS: %s: unhandled error "
+					"%d.\n", __func__, err);
+		case 0:
+		case -ENOENT:
+		case -ESTALE:
+			break;
+		case -NFS4ERR_BADSESSION:
+		case -NFS4ERR_BADSLOT:
+		case -NFS4ERR_BAD_HIGH_SLOT:
+		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+		case -NFS4ERR_DEADSESSION:
+			set_bit(NFS_DELEGATED_STATE, &state->flags);
+			nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
+			return -EAGAIN;
+		case -NFS4ERR_STALE_CLIENTID:
+		case -NFS4ERR_STALE_STATEID:
+			set_bit(NFS_DELEGATED_STATE, &state->flags);
+		case -NFS4ERR_EXPIRED:
+			/* Don't recall a delegation if it was lost */
+			nfs4_schedule_lease_recovery(server->nfs_client);
+			return -EAGAIN;
+		case -NFS4ERR_DELEG_REVOKED:
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_BAD_STATEID:
+		case -NFS4ERR_OPENMODE:
+			nfs_inode_find_state_and_recover(state->inode,
+					stateid);
+			nfs4_schedule_stateid_recovery(server, state);
+			return 0;
+		case -NFS4ERR_DELAY:
+		case -NFS4ERR_GRACE:
+			set_bit(NFS_DELEGATED_STATE, &state->flags);
+			ssleep(1);
+			return -EAGAIN;
+		case -ENOMEM:
+		case -NFS4ERR_DENIED:
+			/* kill_proc(fl->fl_pid, SIGLOST, 1); */
+			return 0;
+	}
+	return err;
 }
 
 int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
 {
-	struct nfs4_exception exception = { };
 	struct nfs_server *server = NFS_SERVER(state->inode);
+	struct nfs4_opendata *opendata;
 	int err;
-	do {
-		err = _nfs4_open_delegation_recall(ctx, state, stateid);
-		switch (err) {
-			case 0:
-			case -ENOENT:
-			case -ESTALE:
-				goto out;
-			case -NFS4ERR_BADSESSION:
-			case -NFS4ERR_BADSLOT:
-			case -NFS4ERR_BAD_HIGH_SLOT:
-			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
-			case -NFS4ERR_DEADSESSION:
-				nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
-				goto out;
-			case -NFS4ERR_STALE_CLIENTID:
-			case -NFS4ERR_STALE_STATEID:
-			case -NFS4ERR_EXPIRED:
-				/* Don't recall a delegation if it was lost */
-				nfs4_schedule_lease_recovery(server->nfs_client);
-				goto out;
-			case -ERESTARTSYS:
-				/*
-				 * The show must go on: exit, but mark the
-				 * stateid as needing recovery.
-				 */
-			case -NFS4ERR_DELEG_REVOKED:
-			case -NFS4ERR_ADMIN_REVOKED:
-			case -NFS4ERR_BAD_STATEID:
-				nfs_inode_find_state_and_recover(state->inode,
-						stateid);
-				nfs4_schedule_stateid_recovery(server, state);
-			case -ENOMEM:
-				err = 0;
-				goto out;
-		}
-		err = nfs4_handle_exception(server, err, &exception);
-	} while (exception.retry);
-out:
-	return err;
+
+	opendata = nfs4_open_recoverdata_alloc(ctx, state,
+			NFS4_OPEN_CLAIM_DELEG_CUR_FH);
+	if (IS_ERR(opendata))
+		return PTR_ERR(opendata);
+	nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
+	err = nfs4_open_recover(opendata, state);
+	nfs4_opendata_put(opendata);
+	return nfs4_handle_delegation_recall_error(server, state, stateid, err);
 }
 
 static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
@@ -1461,9 +1604,10 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_opendata *data = calldata;
 	struct nfs4_state_owner *sp = data->owner;
+	struct nfs_client *clp = sp->so_server->nfs_client;
 
 	if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
-		return;
+		goto out_wait;
 	/*
 	 * Check if we still need to send an OPEN call, or if we can use
 	 * a delegation instead.
@@ -1476,15 +1620,20 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 		rcu_read_lock();
 		delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
 		if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR &&
+		    data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH &&
 		    can_open_delegated(delegation, data->o_arg.fmode))
 			goto unlock_no_action;
 		rcu_read_unlock();
 	}
 	/* Update client id. */
-	data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
-	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
-		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
+	data->o_arg.clientid = clp->cl_clientid;
+	switch (data->o_arg.claim) {
+	case NFS4_OPEN_CLAIM_PREVIOUS:
+	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
 		data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0];
+	case NFS4_OPEN_CLAIM_FH:
+		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
 		nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
 	}
 	data->timestamp = jiffies;
@@ -1493,11 +1642,22 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 				&data->o_res.seq_res,
 				task) != 0)
 		nfs_release_seqid(data->o_arg.seqid);
+
+	/* Set the create mode (note dependency on the session type) */
+	data->o_arg.createmode = NFS4_CREATE_UNCHECKED;
+	if (data->o_arg.open_flags & O_EXCL) {
+		data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE;
+		if (nfs4_has_persistent_session(clp))
+			data->o_arg.createmode = NFS4_CREATE_GUARDED;
+		else if (clp->cl_mvops->minor_version > 0)
+			data->o_arg.createmode = NFS4_CREATE_EXCLUSIVE4_1;
+	}
 	return;
 unlock_no_action:
 	rcu_read_unlock();
 out_no_action:
 	task->tk_action = NULL;
+out_wait:
 	nfs4_sequence_done(task, &data->o_res.seq_res);
 }
 
@@ -1587,8 +1747,11 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
 	data->rpc_done = 0;
 	data->rpc_status = 0;
 	data->cancelled = 0;
-	if (isrecover)
+	data->is_recover = 0;
+	if (isrecover) {
 		nfs4_set_sequence_privileged(&o_arg->seq_args);
+		data->is_recover = 1;
+	}
 	task = rpc_run_task(&task_setup_data);
         if (IS_ERR(task))
                 return PTR_ERR(task);
@@ -1694,7 +1857,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 			return status;
 	}
 	if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
-		_nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr);
+		_nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
 	return 0;
 }
 
@@ -1713,7 +1876,8 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s
 	struct nfs4_opendata *opendata;
 	int ret;
 
-	opendata = nfs4_open_recoverdata_alloc(ctx, state);
+	opendata = nfs4_open_recoverdata_alloc(ctx, state,
+			NFS4_OPEN_CLAIM_FH);
 	if (IS_ERR(opendata))
 		return PTR_ERR(opendata);
 	ret = nfs4_open_recover(opendata, state);
@@ -1731,6 +1895,8 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state
 
 	do {
 		err = _nfs4_open_expired(ctx, state);
+		if (nfs4_clear_cap_atomic_open_v1(server, err, &exception))
+			continue;
 		switch (err) {
 		default:
 			goto out;
@@ -1751,7 +1917,7 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
 
 	ctx = nfs4_state_find_open_context(state);
 	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
+		return -EAGAIN;
 	ret = nfs4_do_open_expired(ctx, state);
 	put_nfs_open_context(ctx);
 	return ret;
@@ -1762,18 +1928,30 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	nfs4_stateid *stateid = &state->stateid;
-	int status;
+	struct nfs_delegation *delegation;
+	struct rpc_cred *cred = NULL;
+	int status = -NFS4ERR_BAD_STATEID;
 
 	/* If a state reset has been done, test_stateid is unneeded */
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
 		return;
 
-	status = nfs41_test_stateid(server, stateid);
+	/* Get the delegation credential for use by test/free_stateid */
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+	if (delegation != NULL &&
+	    nfs4_stateid_match(&delegation->stateid, stateid)) {
+		cred = get_rpccred(delegation->cred);
+		rcu_read_unlock();
+		status = nfs41_test_stateid(server, stateid, cred);
+	} else
+		rcu_read_unlock();
+
 	if (status != NFS_OK) {
 		/* Free the stateid unless the server explicitly
 		 * informs us the stateid is unrecognized. */
 		if (status != -NFS4ERR_BAD_STATEID)
-			nfs41_free_stateid(server, stateid);
+			nfs41_free_stateid(server, stateid, cred);
 		nfs_remove_bad_delegation(state->inode);
 
 		write_seqlock(&state->seqlock);
@@ -1781,6 +1959,9 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
 		write_sequnlock(&state->seqlock);
 		clear_bit(NFS_DELEGATED_STATE, &state->flags);
 	}
+
+	if (cred != NULL)
+		put_rpccred(cred);
 }
 
 /**
@@ -1795,6 +1976,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	nfs4_stateid *stateid = &state->open_stateid;
+	struct rpc_cred *cred = state->owner->so_cred;
 	int status;
 
 	/* If a state reset has been done, test_stateid is unneeded */
@@ -1803,16 +1985,17 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
 	    (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
 		return -NFS4ERR_BAD_STATEID;
 
-	status = nfs41_test_stateid(server, stateid);
+	status = nfs41_test_stateid(server, stateid, cred);
 	if (status != NFS_OK) {
 		/* Free the stateid unless the server explicitly
 		 * informs us the stateid is unrecognized. */
 		if (status != -NFS4ERR_BAD_STATEID)
-			nfs41_free_stateid(server, stateid);
+			nfs41_free_stateid(server, stateid, cred);
 
 		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
 		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
 		clear_bit(NFS_O_RDWR_STATE, &state->flags);
+		clear_bit(NFS_OPEN_STATE, &state->flags);
 	}
 	return status;
 }
@@ -1845,22 +2028,79 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
 		sattr->ia_valid |= ATTR_MTIME;
 }
 
+static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
+		fmode_t fmode,
+		int flags,
+		struct nfs_open_context *ctx)
+{
+	struct nfs4_state_owner *sp = opendata->owner;
+	struct nfs_server *server = sp->so_server;
+	struct dentry *dentry;
+	struct nfs4_state *state;
+	unsigned int seq;
+	int ret;
+
+	seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+
+	ret = _nfs4_proc_open(opendata);
+	if (ret != 0)
+		goto out;
+
+	state = nfs4_opendata_to_nfs4_state(opendata);
+	ret = PTR_ERR(state);
+	if (IS_ERR(state))
+		goto out;
+	if (server->caps & NFS_CAP_POSIX_LOCK)
+		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+
+	dentry = opendata->dentry;
+	if (dentry->d_inode == NULL) {
+		/* FIXME: Is this d_drop() ever needed? */
+		d_drop(dentry);
+		dentry = d_add_unique(dentry, igrab(state->inode));
+		if (dentry == NULL) {
+			dentry = opendata->dentry;
+		} else if (dentry != ctx->dentry) {
+			dput(ctx->dentry);
+			ctx->dentry = dget(dentry);
+		}
+		nfs_set_verifier(dentry,
+				nfs_save_change_attribute(opendata->dir->d_inode));
+	}
+
+	ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags);
+	if (ret != 0)
+		goto out;
+
+	ctx->state = state;
+	if (dentry->d_inode == state->inode) {
+		nfs_inode_attach_open_context(ctx);
+		if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+			nfs4_schedule_stateid_recovery(server, state);
+	}
+out:
+	return ret;
+}
+
 /*
  * Returns a referenced nfs4_state
  */
 static int _nfs4_do_open(struct inode *dir,
-			struct dentry *dentry,
-			fmode_t fmode,
+			struct nfs_open_context *ctx,
 			int flags,
 			struct iattr *sattr,
-			struct rpc_cred *cred,
-			struct nfs4_state **res,
-			struct nfs4_threshold **ctx_th)
+			struct nfs4_label *label)
 {
 	struct nfs4_state_owner  *sp;
 	struct nfs4_state     *state = NULL;
 	struct nfs_server       *server = NFS_SERVER(dir);
 	struct nfs4_opendata *opendata;
+	struct dentry *dentry = ctx->dentry;
+	struct rpc_cred *cred = ctx->cred;
+	struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
+	fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
+	enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
+	struct nfs4_label *olabel = NULL;
 	int status;
 
 	/* Protect against reboot recovery conflicts */
@@ -1876,44 +2116,48 @@ static int _nfs4_do_open(struct inode *dir,
 	if (dentry->d_inode != NULL)
 		nfs4_return_incompatible_delegation(dentry->d_inode, fmode);
 	status = -ENOMEM;
-	opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, GFP_KERNEL);
+	if (dentry->d_inode)
+		claim = NFS4_OPEN_CLAIM_FH;
+	opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr,
+			label, claim, GFP_KERNEL);
 	if (opendata == NULL)
 		goto err_put_state_owner;
 
+	if (label) {
+		olabel = nfs4_label_alloc(server, GFP_KERNEL);
+		if (IS_ERR(olabel)) {
+			status = PTR_ERR(olabel);
+			goto err_opendata_put;
+		}
+	}
+
 	if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
 		opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
 		if (!opendata->f_attr.mdsthreshold)
-			goto err_opendata_put;
+			goto err_free_label;
 		opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
 	}
 	if (dentry->d_inode != NULL)
 		opendata->state = nfs4_get_open_state(dentry->d_inode, sp);
 
-	status = _nfs4_proc_open(opendata);
+	status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx);
 	if (status != 0)
-		goto err_opendata_put;
-
-	state = nfs4_opendata_to_nfs4_state(opendata);
-	status = PTR_ERR(state);
-	if (IS_ERR(state))
-		goto err_opendata_put;
-	if (server->caps & NFS_CAP_POSIX_LOCK)
-		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
-
-	status = nfs4_opendata_access(cred, opendata, state, fmode, flags);
-	if (status != 0)
-		goto err_opendata_put;
+		goto err_free_label;
+	state = ctx->state;
 
-	if (opendata->o_arg.open_flags & O_EXCL) {
+	if ((opendata->o_arg.open_flags & O_EXCL) &&
+	    (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
 		nfs4_exclusive_attrset(opendata, sattr);
 
 		nfs_fattr_init(opendata->o_res.f_attr);
 		status = nfs4_do_setattr(state->inode, cred,
 				opendata->o_res.f_attr, sattr,
-				state);
-		if (status == 0)
+				state, label, olabel);
+		if (status == 0) {
 			nfs_setattr_update_inode(state->inode, sattr);
-		nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+			nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+			nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+		}
 	}
 
 	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
@@ -1922,37 +2166,37 @@ static int _nfs4_do_open(struct inode *dir,
 		kfree(opendata->f_attr.mdsthreshold);
 	opendata->f_attr.mdsthreshold = NULL;
 
+	nfs4_label_free(olabel);
+
 	nfs4_opendata_put(opendata);
 	nfs4_put_state_owner(sp);
-	*res = state;
 	return 0;
+err_free_label:
+	nfs4_label_free(olabel);
 err_opendata_put:
 	kfree(opendata->f_attr.mdsthreshold);
 	nfs4_opendata_put(opendata);
 err_put_state_owner:
 	nfs4_put_state_owner(sp);
 out_err:
-	*res = NULL;
 	return status;
 }
 
 
 static struct nfs4_state *nfs4_do_open(struct inode *dir,
-					struct dentry *dentry,
-					fmode_t fmode,
+					struct nfs_open_context *ctx,
 					int flags,
 					struct iattr *sattr,
-					struct rpc_cred *cred,
-					struct nfs4_threshold **ctx_th)
+					struct nfs4_label *label)
 {
+	struct nfs_server *server = NFS_SERVER(dir);
 	struct nfs4_exception exception = { };
 	struct nfs4_state *res;
 	int status;
 
-	fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC;
 	do {
-		status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred,
-				       &res, ctx_th);
+		status = _nfs4_do_open(dir, ctx, flags, sattr, label);
+		res = ctx->state;
 		if (status == 0)
 			break;
 		/* NOTE: BAD_SEQID means the server and client disagree about the
@@ -1988,7 +2232,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
 			exception.retry = 1;
 			continue;
 		}
-		res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir),
+		if (nfs4_clear_cap_atomic_open_v1(server, status, &exception))
+			continue;
+		res = ERR_PTR(nfs4_handle_exception(server,
 					status, &exception));
 	} while (exception.retry);
 	return res;
@@ -1996,7 +2242,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
 
 static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			    struct nfs_fattr *fattr, struct iattr *sattr,
-			    struct nfs4_state *state)
+			    struct nfs4_state *state, struct nfs4_label *ilabel,
+			    struct nfs4_label *olabel)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
         struct nfs_setattrargs  arg = {
@@ -2004,9 +2251,11 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
                 .iap            = sattr,
 		.server		= server,
 		.bitmask = server->attr_bitmask,
+		.label		= ilabel,
         };
         struct nfs_setattrres  res = {
 		.fattr		= fattr,
+		.label		= olabel,
 		.server		= server,
         };
         struct rpc_message msg = {
@@ -2016,20 +2265,29 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 		.rpc_cred	= cred,
         };
 	unsigned long timestamp = jiffies;
+	fmode_t fmode;
+	bool truncate;
 	int status;
 
+	arg.bitmask = nfs4_bitmask(server, ilabel);
+	if (ilabel)
+		arg.bitmask = nfs4_bitmask(server, olabel);
+
 	nfs_fattr_init(fattr);
 
-	if (state != NULL) {
+	/* Servers should only apply open mode checks for file size changes */
+	truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false;
+	fmode = truncate ? FMODE_WRITE : FMODE_READ;
+
+	if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) {
+		/* Use that stateid */
+	} else if (truncate && state != NULL && nfs4_valid_open_stateid(state)) {
 		struct nfs_lockowner lockowner = {
 			.l_owner = current->files,
 			.l_pid = current->tgid,
 		};
 		nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
 				&lockowner);
-	} else if (nfs4_copy_delegation_stateid(&arg.stateid, inode,
-				FMODE_WRITE)) {
-		/* Use that stateid */
 	} else
 		nfs4_stateid_copy(&arg.stateid, &zero_stateid);
 
@@ -2041,7 +2299,8 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 
 static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			   struct nfs_fattr *fattr, struct iattr *sattr,
-			   struct nfs4_state *state)
+			   struct nfs4_state *state, struct nfs4_label *ilabel,
+			   struct nfs4_label *olabel)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs4_exception exception = {
@@ -2050,9 +2309,16 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 	};
 	int err;
 	do {
-		err = _nfs4_do_setattr(inode, cred, fattr, sattr, state);
+		err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
 		switch (err) {
 		case -NFS4ERR_OPENMODE:
+			if (!(sattr->ia_valid & ATTR_SIZE)) {
+				pr_warn_once("NFSv4: server %s is incorrectly "
+						"applying open mode checks to "
+						"a SETATTR that is not "
+						"changing file size.\n",
+						server->nfs_client->cl_hostname);
+			}
 			if (state && !(state->state & FMODE_WRITE)) {
 				err = -EBADF;
 				if (sattr->ia_valid & ATTR_OPEN)
@@ -2088,7 +2354,7 @@ static void nfs4_free_closedata(void *data)
 	nfs4_put_open_state(calldata->state);
 	nfs_free_seqid(calldata->arg.seqid);
 	nfs4_put_state_owner(sp);
-	nfs_sb_deactive_async(sb);
+	nfs_sb_deactive(sb);
 	kfree(calldata);
 }
 
@@ -2096,11 +2362,19 @@ static void nfs4_close_clear_stateid_flags(struct nfs4_state *state,
 		fmode_t fmode)
 {
 	spin_lock(&state->owner->so_lock);
-	if (!(fmode & FMODE_READ))
+	clear_bit(NFS_O_RDWR_STATE, &state->flags);
+	switch (fmode & (FMODE_READ|FMODE_WRITE)) {
+	case FMODE_WRITE:
 		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
-	if (!(fmode & FMODE_WRITE))
+		break;
+	case FMODE_READ:
 		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
-	clear_bit(NFS_O_RDWR_STATE, &state->flags);
+		break;
+	case 0:
+		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+		clear_bit(NFS_OPEN_STATE, &state->flags);
+	}
 	spin_unlock(&state->owner->so_lock);
 }
 
@@ -2150,7 +2424,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 
 	dprintk("%s: begin!\n", __func__);
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
-		return;
+		goto out_wait;
 
 	task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
 	calldata->arg.fmode = FMODE_READ|FMODE_WRITE;
@@ -2168,20 +2442,22 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 			calldata->arg.fmode &= ~FMODE_WRITE;
 		}
 	}
+	if (!nfs4_valid_open_stateid(state))
+		call_close = 0;
 	spin_unlock(&state->owner->so_lock);
 
 	if (!call_close) {
 		/* Note: exit _without_ calling nfs4_close_done */
-		task->tk_action = NULL;
-		nfs4_sequence_done(task, &calldata->res.seq_res);
-		goto out;
+		goto out_no_action;
 	}
 
 	if (calldata->arg.fmode == 0) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
 		if (calldata->roc &&
-		    pnfs_roc_drain(inode, &calldata->roc_barrier, task))
-			goto out;
+		    pnfs_roc_drain(inode, &calldata->roc_barrier, task)) {
+			nfs_release_seqid(calldata->arg.seqid);
+			goto out_wait;
+		    }
 	}
 
 	nfs_fattr_init(calldata->res.fattr);
@@ -2191,8 +2467,12 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 				&calldata->res.seq_res,
 				task) != 0)
 		nfs_release_seqid(calldata->arg.seqid);
-out:
 	dprintk("%s: done!\n", __func__);
+	return;
+out_no_action:
+	task->tk_action = NULL;
+out_wait:
+	nfs4_sequence_done(task, &calldata->res.seq_res);
 }
 
 static const struct rpc_call_ops nfs4_close_ops = {
@@ -2274,14 +2554,18 @@ static struct inode *
 nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
 {
 	struct nfs4_state *state;
+	struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
+
+	label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
 
 	/* Protect against concurrent sillydeletes */
-	state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr,
-			     ctx->cred, &ctx->mdsthreshold);
+	state = nfs4_do_open(dir, ctx, open_flags, attr, label);
+
+	nfs4_label_release_security(label);
+
 	if (IS_ERR(state))
 		return ERR_CAST(state);
-	ctx->state = state;
-	return igrab(state->inode);
+	return state->inode;
 }
 
 static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2337,7 +2621,17 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 			server->caps |= NFS_CAP_CTIME;
 		if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
 			server->caps |= NFS_CAP_MTIME;
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+		if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
+			server->caps |= NFS_CAP_SECURITY_LABEL;
+#endif
+		memcpy(server->attr_bitmask_nl, res.attr_bitmask,
+				sizeof(server->attr_bitmask));
 
+		if (server->caps & NFS_CAP_SECURITY_LABEL) {
+			server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+			res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+		}
 		memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
 		server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
 		server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
@@ -2363,8 +2657,9 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
 		struct nfs_fsinfo *info)
 {
+	u32 bitmask[3];
 	struct nfs4_lookup_root_arg args = {
-		.bitmask = nfs4_fattr_bitmap,
+		.bitmask = bitmask,
 	};
 	struct nfs4_lookup_res res = {
 		.server = server,
@@ -2377,6 +2672,13 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
 		.rpc_resp = &res,
 	};
 
+	bitmask[0] = nfs4_fattr_bitmap[0];
+	bitmask[1] = nfs4_fattr_bitmap[1];
+	/*
+	 * Process the label in the upcoming getfattr
+	 */
+	bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL;
+
 	nfs_fattr_init(info->fattr);
 	return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 }
@@ -2408,7 +2710,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl
 
 	auth = rpcauth_create(flavor, server->client);
 	if (IS_ERR(auth)) {
-		ret = -EIO;
+		ret = -EACCES;
 		goto out;
 	}
 	ret = nfs4_lookup_root(server, fhandle, info);
@@ -2416,27 +2718,36 @@ out:
 	return ret;
 }
 
+/*
+ * Retry pseudoroot lookup with various security flavors.  We do this when:
+ *
+ *   NFSv4.0: the PUTROOTFH operation returns NFS4ERR_WRONGSEC
+ *   NFSv4.1: the server does not support the SECINFO_NO_NAME operation
+ *
+ * Returns zero on success, or a negative NFS4ERR value, or a
+ * negative errno value.
+ */
 static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 			      struct nfs_fsinfo *info)
 {
-	int i, len, status = 0;
-	rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS];
-
-	len = rpcauth_list_flavors(flav_array, ARRAY_SIZE(flav_array));
-	if (len < 0)
-		return len;
-
-	for (i = 0; i < len; i++) {
-		/* AUTH_UNIX is the default flavor if none was specified,
-		 * thus has already been tried. */
-		if (flav_array[i] == RPC_AUTH_UNIX)
-			continue;
+	/* Per 3530bis 15.33.5 */
+	static const rpc_authflavor_t flav_array[] = {
+		RPC_AUTH_GSS_KRB5P,
+		RPC_AUTH_GSS_KRB5I,
+		RPC_AUTH_GSS_KRB5,
+		RPC_AUTH_UNIX,			/* courtesy */
+		RPC_AUTH_NULL,
+	};
+	int status = -EPERM;
+	size_t i;
 
+	for (i = 0; i < ARRAY_SIZE(flav_array); i++) {
 		status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]);
 		if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
 			continue;
 		break;
 	}
+
 	/*
 	 * -EACCESS could mean that the user doesn't have correct permissions
 	 * to access the mount.  It could also mean that we tried to mount
@@ -2449,24 +2760,36 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 	return status;
 }
 
-/*
- * get the file handle for the "/" directory on the server
+static int nfs4_do_find_root_sec(struct nfs_server *server,
+		struct nfs_fh *fhandle, struct nfs_fsinfo *info)
+{
+	int mv = server->nfs_client->cl_minorversion;
+	return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info);
+}
+
+/**
+ * nfs4_proc_get_rootfh - get file handle for server's pseudoroot
+ * @server: initialized nfs_server handle
+ * @fhandle: we fill in the pseudo-fs root file handle
+ * @info: we fill in an FSINFO struct
+ *
+ * Returns zero on success, or a negative errno.
  */
 int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
 			 struct nfs_fsinfo *info)
 {
-	int minor_version = server->nfs_client->cl_minorversion;
-	int status = nfs4_lookup_root(server, fhandle, info);
-	if ((status == -NFS4ERR_WRONGSEC) && !(server->flags & NFS_MOUNT_SECFLAVOUR))
-		/*
-		 * A status of -NFS4ERR_WRONGSEC will be mapped to -EPERM
-		 * by nfs4_map_errors() as this function exits.
-		 */
-		status = nfs_v4_minor_ops[minor_version]->find_root_sec(server, fhandle, info);
+	int status;
+
+	status = nfs4_lookup_root(server, fhandle, info);
+	if ((status == -NFS4ERR_WRONGSEC) &&
+	    !(server->flags & NFS_MOUNT_SECFLAVOUR))
+		status = nfs4_do_find_root_sec(server, fhandle, info);
+
 	if (status == 0)
 		status = nfs4_server_capabilities(server, fhandle);
 	if (status == 0)
 		status = nfs4_do_fsinfo(server, fhandle, info);
+
 	return nfs4_map_errors(status);
 }
 
@@ -2475,6 +2798,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
 {
 	int error;
 	struct nfs_fattr *fattr = info->fattr;
+	struct nfs4_label *label = NULL;
 
 	error = nfs4_server_capabilities(server, mntfh);
 	if (error < 0) {
@@ -2482,16 +2806,23 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
 		return error;
 	}
 
-	error = nfs4_proc_getattr(server, mntfh, fattr);
+	label = nfs4_label_alloc(server, GFP_KERNEL);
+	if (IS_ERR(label))
+		return PTR_ERR(label);
+
+	error = nfs4_proc_getattr(server, mntfh, fattr, label);
 	if (error < 0) {
 		dprintk("nfs4_get_root: getattr error = %d\n", -error);
-		return error;
+		goto err_free_label;
 	}
 
 	if (fattr->valid & NFS_ATTR_FATTR_FSID &&
 	    !nfs_fsid_equal(&server->fsid, &fattr->fsid))
 		memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
 
+err_free_label:
+	nfs4_label_free(label);
+
 	return error;
 }
 
@@ -2538,7 +2869,8 @@ out:
 	return status;
 }
 
-static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+				struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct nfs4_getattr_arg args = {
 		.fh = fhandle,
@@ -2546,6 +2878,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 	};
 	struct nfs4_getattr_res res = {
 		.fattr = fattr,
+		.label = label,
 		.server = server,
 	};
 	struct rpc_message msg = {
@@ -2553,18 +2886,21 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
-	
+
+	args.bitmask = nfs4_bitmask(server, label);
+
 	nfs_fattr_init(fattr);
 	return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 }
 
-static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+				struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct nfs4_exception exception = { };
 	int err;
 	do {
 		err = nfs4_handle_exception(server,
-				_nfs4_proc_getattr(server, fhandle, fattr),
+				_nfs4_proc_getattr(server, fhandle, fattr, label),
 				&exception);
 	} while (exception.retry);
 	return err;
@@ -2594,10 +2930,11 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 	struct inode *inode = dentry->d_inode;
 	struct rpc_cred *cred = NULL;
 	struct nfs4_state *state = NULL;
+	struct nfs4_label *label = NULL;
 	int status;
 
 	if (pnfs_ld_layoutret_on_setattr(inode))
-		pnfs_return_layout(inode);
+		pnfs_commit_and_return_layout(inode);
 
 	nfs_fattr_init(fattr);
 	
@@ -2620,15 +2957,22 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 		}
 	}
 
-	status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
-	if (status == 0)
+	label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+	if (IS_ERR(label))
+		return PTR_ERR(label);
+
+	status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
+	if (status == 0) {
 		nfs_setattr_update_inode(inode, sattr);
+		nfs_setsecurity(inode, fattr, label);
+	}
+	nfs4_label_free(label);
 	return status;
 }
 
 static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
 		const struct qstr *name, struct nfs_fh *fhandle,
-		struct nfs_fattr *fattr)
+		struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
 	int		       status;
@@ -2640,6 +2984,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
 	struct nfs4_lookup_res res = {
 		.server = server,
 		.fattr = fattr,
+		.label = label,
 		.fh = fhandle,
 	};
 	struct rpc_message msg = {
@@ -2648,6 +2993,8 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
 		.rpc_resp = &res,
 	};
 
+	args.bitmask = nfs4_bitmask(server, label);
+
 	nfs_fattr_init(fattr);
 
 	dprintk("NFS call  lookup %s\n", name->name);
@@ -2666,13 +3013,13 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
 
 static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
 				   struct qstr *name, struct nfs_fh *fhandle,
-				   struct nfs_fattr *fattr)
+				   struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct nfs4_exception exception = { };
 	struct rpc_clnt *client = *clnt;
 	int err;
 	do {
-		err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr);
+		err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label);
 		switch (err) {
 		case -NFS4ERR_BADNAME:
 			err = -ENOENT;
@@ -2706,12 +3053,13 @@ out:
 }
 
 static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
-			    struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+			    struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+			    struct nfs4_label *label)
 {
 	int status;
 	struct rpc_clnt *client = NFS_CLIENT(dir);
 
-	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr);
+	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label);
 	if (client != NFS_CLIENT(dir)) {
 		rpc_shutdown_client(client);
 		nfs_fixup_secinfo_attributes(fattr);
@@ -2726,7 +3074,7 @@ nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name,
 	int status;
 	struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir));
 
-	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr);
+	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL);
 	if (status < 0) {
 		rpc_shutdown_client(client);
 		return ERR_PTR(status);
@@ -2751,7 +3099,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 		.rpc_cred = entry->cred,
 	};
 	int mode = entry->mask;
-	int status;
+	int status = 0;
 
 	/*
 	 * Determine which access bits we want to ask for...
@@ -2856,6 +3204,7 @@ static int
 nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		 int flags)
 {
+	struct nfs4_label l, *ilabel = NULL;
 	struct nfs_open_context *ctx;
 	struct nfs4_state *state;
 	int status = 0;
@@ -2864,19 +3213,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
+
 	sattr->ia_mode &= ~current_umask();
-	state = nfs4_do_open(dir, dentry, ctx->mode,
-			flags, sattr, ctx->cred,
-			&ctx->mdsthreshold);
-	d_drop(dentry);
+	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
 		goto out;
 	}
-	d_add(dentry, igrab(state->inode));
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-	ctx->state = state;
 out:
+	nfs4_label_release_security(ilabel);
 	put_nfs_open_context(ctx);
 	return status;
 }
@@ -2925,6 +3271,8 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 	res->server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 	nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
+
+	nfs_fattr_init(res->dir_attr);
 }
 
 static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
@@ -3000,7 +3348,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		.rpc_resp = &res,
 	};
 	int status = -ENOMEM;
-	
+
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (!status) {
 		update_changeattr(old_dir, &res.old_cinfo);
@@ -3034,6 +3382,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 	};
 	struct nfs4_link_res res = {
 		.server = server,
+		.label = NULL,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
@@ -3046,11 +3395,24 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 	if (res.fattr == NULL)
 		goto out;
 
+	res.label = nfs4_label_alloc(server, GFP_KERNEL);
+	if (IS_ERR(res.label)) {
+		status = PTR_ERR(res.label);
+		goto out;
+	}
+	arg.bitmask = nfs4_bitmask(server, res.label);
+
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (!status) {
 		update_changeattr(dir, &res.cinfo);
-		nfs_post_op_update_inode(inode, res.fattr);
+		status = nfs_post_op_update_inode(inode, res.fattr);
+		if (!status)
+			nfs_setsecurity(inode, res.fattr, res.label);
 	}
+
+
+	nfs4_label_free(res.label);
+
 out:
 	nfs_free_fattr(res.fattr);
 	return status;
@@ -3074,6 +3436,7 @@ struct nfs4_createdata {
 	struct nfs4_create_res res;
 	struct nfs_fh fh;
 	struct nfs_fattr fattr;
+	struct nfs4_label *label;
 };
 
 static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
@@ -3085,6 +3448,10 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
 	if (data != NULL) {
 		struct nfs_server *server = NFS_SERVER(dir);
 
+		data->label = nfs4_label_alloc(server, GFP_KERNEL);
+		if (IS_ERR(data->label))
+			goto out_free;
+
 		data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
 		data->msg.rpc_argp = &data->arg;
 		data->msg.rpc_resp = &data->res;
@@ -3093,13 +3460,17 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
 		data->arg.name = name;
 		data->arg.attrs = sattr;
 		data->arg.ftype = ftype;
-		data->arg.bitmask = server->attr_bitmask;
+		data->arg.bitmask = nfs4_bitmask(server, data->label);
 		data->res.server = server;
 		data->res.fh = &data->fh;
 		data->res.fattr = &data->fattr;
+		data->res.label = data->label;
 		nfs_fattr_init(data->res.fattr);
 	}
 	return data;
+out_free:
+	kfree(data);
+	return NULL;
 }
 
 static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
@@ -3108,18 +3479,20 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
 				    &data->arg.seq_args, &data->res.seq_res, 1);
 	if (status == 0) {
 		update_changeattr(dir, &data->res.dir_cinfo);
-		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
 	}
 	return status;
 }
 
 static void nfs4_free_createdata(struct nfs4_createdata *data)
 {
+	nfs4_label_free(data->label);
 	kfree(data);
 }
 
 static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
-		struct page *page, unsigned int len, struct iattr *sattr)
+		struct page *page, unsigned int len, struct iattr *sattr,
+		struct nfs4_label *label)
 {
 	struct nfs4_createdata *data;
 	int status = -ENAMETOOLONG;
@@ -3135,6 +3508,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
 	data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
 	data->arg.u.symlink.pages = &page;
 	data->arg.u.symlink.len = len;
+	data->arg.label = label;
 	
 	status = nfs4_do_create(dir, dentry, data);
 
@@ -3147,18 +3521,24 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
 		struct page *page, unsigned int len, struct iattr *sattr)
 {
 	struct nfs4_exception exception = { };
+	struct nfs4_label l, *label = NULL;
 	int err;
+
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
 	do {
 		err = nfs4_handle_exception(NFS_SERVER(dir),
 				_nfs4_proc_symlink(dir, dentry, page,
-							len, sattr),
+							len, sattr, label),
 				&exception);
 	} while (exception.retry);
+
+	nfs4_label_release_security(label);
 	return err;
 }
 
 static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
-		struct iattr *sattr)
+		struct iattr *sattr, struct nfs4_label *label)
 {
 	struct nfs4_createdata *data;
 	int status = -ENOMEM;
@@ -3167,6 +3547,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 	if (data == NULL)
 		goto out;
 
+	data->arg.label = label;
 	status = nfs4_do_create(dir, dentry, data);
 
 	nfs4_free_createdata(data);
@@ -3178,14 +3559,19 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 		struct iattr *sattr)
 {
 	struct nfs4_exception exception = { };
+	struct nfs4_label l, *label = NULL;
 	int err;
 
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
 	sattr->ia_mode &= ~current_umask();
 	do {
 		err = nfs4_handle_exception(NFS_SERVER(dir),
-				_nfs4_proc_mkdir(dir, dentry, sattr),
+				_nfs4_proc_mkdir(dir, dentry, sattr, label),
 				&exception);
 	} while (exception.retry);
+	nfs4_label_release_security(label);
+
 	return err;
 }
 
@@ -3243,7 +3629,7 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 }
 
 static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
-		struct iattr *sattr, dev_t rdev)
+		struct iattr *sattr, struct nfs4_label *label, dev_t rdev)
 {
 	struct nfs4_createdata *data;
 	int mode = sattr->ia_mode;
@@ -3268,7 +3654,8 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
 		status = -EINVAL;
 		goto out_free;
 	}
-	
+
+	data->arg.label = label;
 	status = nfs4_do_create(dir, dentry, data);
 out_free:
 	nfs4_free_createdata(data);
@@ -3280,14 +3667,20 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
 		struct iattr *sattr, dev_t rdev)
 {
 	struct nfs4_exception exception = { };
+	struct nfs4_label l, *label = NULL;
 	int err;
 
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
 	sattr->ia_mode &= ~current_umask();
 	do {
 		err = nfs4_handle_exception(NFS_SERVER(dir),
-				_nfs4_proc_mknod(dir, dentry, sattr, rdev),
+				_nfs4_proc_mknod(dir, dentry, sattr, label, rdev),
 				&exception);
 	} while (exception.retry);
+
+	nfs4_label_release_security(label);
+
 	return err;
 }
 
@@ -3345,12 +3738,21 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
 static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
 {
 	struct nfs4_exception exception = { };
+	unsigned long now = jiffies;
 	int err;
 
 	do {
-		err = nfs4_handle_exception(server,
-				_nfs4_do_fsinfo(server, fhandle, fsinfo),
-				&exception);
+		err = _nfs4_do_fsinfo(server, fhandle, fsinfo);
+		if (err == 0) {
+			struct nfs_client *clp = server->nfs_client;
+
+			spin_lock(&clp->cl_lock);
+			clp->cl_lease_time = fsinfo->lease_time * HZ;
+			clp->cl_last_renewal = now;
+			spin_unlock(&clp->cl_lock);
+			break;
+		}
+		err = nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
 	return err;
 }
@@ -3410,6 +3812,46 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 	return err;
 }
 
+int nfs4_set_rw_stateid(nfs4_stateid *stateid,
+		const struct nfs_open_context *ctx,
+		const struct nfs_lock_context *l_ctx,
+		fmode_t fmode)
+{
+	const struct nfs_lockowner *lockowner = NULL;
+
+	if (l_ctx != NULL)
+		lockowner = &l_ctx->lockowner;
+	return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner);
+}
+EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid);
+
+static bool nfs4_stateid_is_current(nfs4_stateid *stateid,
+		const struct nfs_open_context *ctx,
+		const struct nfs_lock_context *l_ctx,
+		fmode_t fmode)
+{
+	nfs4_stateid current_stateid;
+
+	if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode))
+		return false;
+	return nfs4_stateid_match(stateid, &current_stateid);
+}
+
+static bool nfs4_error_stateid_expired(int err)
+{
+	switch (err) {
+	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_OLD_STATEID:
+	case -NFS4ERR_OPENMODE:
+	case -NFS4ERR_EXPIRED:
+		return true;
+	}
+	return false;
+}
+
 void __nfs4_read_done_cb(struct nfs_read_data *data)
 {
 	nfs_invalidate_atime(data->header->inode);
@@ -3430,6 +3872,20 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
 	return 0;
 }
 
+static bool nfs4_read_stateid_changed(struct rpc_task *task,
+		struct nfs_readargs *args)
+{
+
+	if (!nfs4_error_stateid_expired(task->tk_status) ||
+		nfs4_stateid_is_current(&args->stateid,
+				args->context,
+				args->lock_context,
+				FMODE_READ))
+		return false;
+	rpc_restart_call_prepare(task);
+	return true;
+}
+
 static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
 
@@ -3437,7 +3893,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 
 	if (!nfs4_sequence_done(task, &data->res.seq_res))
 		return -EAGAIN;
-
+	if (nfs4_read_stateid_changed(task, &data->args))
+		return -EAGAIN;
 	return data->read_done_cb ? data->read_done_cb(task, data) :
 				    nfs4_read_done_cb(task, data);
 }
@@ -3452,10 +3909,13 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
 
 static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
 {
-	nfs4_setup_sequence(NFS_SERVER(data->header->inode),
+	if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
 			&data->args.seq_args,
 			&data->res.seq_res,
-			task);
+			task))
+		return;
+	nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
+			data->args.lock_context, FMODE_READ);
 }
 
 static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
@@ -3473,10 +3933,26 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data
 	return 0;
 }
 
+static bool nfs4_write_stateid_changed(struct rpc_task *task,
+		struct nfs_writeargs *args)
+{
+
+	if (!nfs4_error_stateid_expired(task->tk_status) ||
+		nfs4_stateid_is_current(&args->stateid,
+				args->context,
+				args->lock_context,
+				FMODE_WRITE))
+		return false;
+	rpc_restart_call_prepare(task);
+	return true;
+}
+
 static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	if (!nfs4_sequence_done(task, &data->res.seq_res))
 		return -EAGAIN;
+	if (nfs4_write_stateid_changed(task, &data->args))
+		return -EAGAIN;
 	return data->write_done_cb ? data->write_done_cb(task, data) :
 		nfs4_write_done_cb(task, data);
 }
@@ -3516,10 +3992,13 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
 
 static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
 {
-	nfs4_setup_sequence(NFS_SERVER(data->header->inode),
+	if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
 			&data->args.seq_args,
 			&data->res.seq_res,
-			task);
+			task))
+		return;
+	nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
+			data->args.lock_context, FMODE_WRITE);
 }
 
 static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -3621,7 +4100,7 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred,
 		return -ENOMEM;
 	data->client = clp;
 	data->timestamp = jiffies;
-	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
+	return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT,
 			&nfs4_renew_ops, data);
 }
 
@@ -3635,7 +4114,7 @@ static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
 	unsigned long now = jiffies;
 	int status;
 
-	status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
+	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 	if (status < 0)
 		return status;
 	do_renew_lease(clp, now);
@@ -3928,6 +4407,155 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
 	return err;
 }
 
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static int _nfs4_get_security_label(struct inode *inode, void *buf,
+					size_t buflen)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_fattr fattr;
+	struct nfs4_label label = {0, 0, buflen, buf};
+
+	u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+	struct nfs4_getattr_arg args = {
+		.fh		= NFS_FH(inode),
+		.bitmask	= bitmask,
+	};
+	struct nfs4_getattr_res res = {
+		.fattr		= &fattr,
+		.label		= &label,
+		.server		= server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
+		.rpc_argp	= &args,
+		.rpc_resp	= &res,
+	};
+	int ret;
+
+	nfs_fattr_init(&fattr);
+
+	ret = rpc_call_sync(server->client, &msg, 0);
+	if (ret)
+		return ret;
+	if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
+		return -ENOENT;
+	if (buflen < label.len)
+		return -ERANGE;
+	return 0;
+}
+
+static int nfs4_get_security_label(struct inode *inode, void *buf,
+					size_t buflen)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+		return -EOPNOTSUPP;
+
+	do {
+		err = nfs4_handle_exception(NFS_SERVER(inode),
+				_nfs4_get_security_label(inode, buf, buflen),
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int _nfs4_do_set_security_label(struct inode *inode,
+		struct nfs4_label *ilabel,
+		struct nfs_fattr *fattr,
+		struct nfs4_label *olabel)
+{
+
+	struct iattr sattr = {0};
+	struct nfs_server *server = NFS_SERVER(inode);
+	const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+	struct nfs_setattrargs args = {
+		.fh             = NFS_FH(inode),
+		.iap            = &sattr,
+		.server		= server,
+		.bitmask	= bitmask,
+		.label		= ilabel,
+	};
+	struct nfs_setattrres res = {
+		.fattr		= fattr,
+		.label		= olabel,
+		.server		= server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+		.rpc_argp       = &args,
+		.rpc_resp       = &res,
+	};
+	int status;
+
+	nfs4_stateid_copy(&args.stateid, &zero_stateid);
+
+	status = rpc_call_sync(server->client, &msg, 0);
+	if (status)
+		dprintk("%s failed: %d\n", __func__, status);
+
+	return status;
+}
+
+static int nfs4_do_set_security_label(struct inode *inode,
+		struct nfs4_label *ilabel,
+		struct nfs_fattr *fattr,
+		struct nfs4_label *olabel)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	do {
+		err = nfs4_handle_exception(NFS_SERVER(inode),
+				_nfs4_do_set_security_label(inode, ilabel,
+				fattr, olabel),
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int
+nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
+{
+	struct nfs4_label ilabel, *olabel = NULL;
+	struct nfs_fattr fattr;
+	struct rpc_cred *cred;
+	struct inode *inode = dentry->d_inode;
+	int status;
+
+	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+		return -EOPNOTSUPP;
+
+	nfs_fattr_init(&fattr);
+
+	ilabel.pi = 0;
+	ilabel.lfs = 0;
+	ilabel.label = (char *)buf;
+	ilabel.len = buflen;
+
+	cred = rpc_lookup_cred();
+	if (IS_ERR(cred))
+		return PTR_ERR(cred);
+
+	olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+	if (IS_ERR(olabel)) {
+		status = -PTR_ERR(olabel);
+		goto out;
+	}
+
+	status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel);
+	if (status == 0)
+		nfs_setsecurity(inode, &fattr, olabel);
+
+	nfs4_label_free(olabel);
+out:
+	put_rpccred(cred);
+	return status;
+}
+#endif	/* CONFIG_NFS_V4_SECURITY_LABEL */
+
+
 static int
 nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
 {
@@ -3945,11 +4573,14 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		case -NFS4ERR_OPENMODE:
 			if (state == NULL)
 				break;
-			nfs4_schedule_stateid_recovery(server, state);
+			if (nfs4_schedule_stateid_recovery(server, state) < 0)
+				goto stateid_invalid;
 			goto wait_on_recovery;
 		case -NFS4ERR_EXPIRED:
-			if (state != NULL)
-				nfs4_schedule_stateid_recovery(server, state);
+			if (state != NULL) {
+				if (nfs4_schedule_stateid_recovery(server, state) < 0)
+					goto stateid_invalid;
+			}
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_STALE_CLIENTID:
 			nfs4_schedule_lease_recovery(clp);
@@ -3981,6 +4612,9 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	}
 	task->tk_status = nfs4_map_errors(task->tk_status);
 	return 0;
+stateid_invalid:
+	task->tk_status = -EIO;
+	return 0;
 wait_on_recovery:
 	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
 	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
@@ -4080,7 +4714,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 	/* cb_client4 */
 	rcu_read_lock();
 	setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
-				sizeof(setclientid.sc_netid),
+				sizeof(setclientid.sc_netid), "%s",
 				rpc_peeraddr2str(clp->cl_rpcclient,
 							RPC_DISPLAY_NETID));
 	rcu_read_unlock();
@@ -4108,27 +4742,17 @@ int nfs4_proc_setclientid_confirm(struct nfs_client *clp,
 		struct nfs4_setclientid_res *arg,
 		struct rpc_cred *cred)
 {
-	struct nfs_fsinfo fsinfo;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM],
 		.rpc_argp = arg,
-		.rpc_resp = &fsinfo,
 		.rpc_cred = cred,
 	};
-	unsigned long now;
 	int status;
 
 	dprintk("NFS call  setclientid_confirm auth=%s, (client ID %llx)\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
 		clp->cl_clientid);
-	now = jiffies;
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
-	if (status == 0) {
-		spin_lock(&clp->cl_lock);
-		clp->cl_lease_time = fsinfo.lease_time * HZ;
-		clp->cl_last_renewal = now;
-		spin_unlock(&clp->cl_lock);
-	}
 	dprintk("NFS reply setclientid_confirm: %d\n", status);
 	return status;
 }
@@ -4273,7 +4897,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
 static unsigned long
 nfs4_set_lock_task_retry(unsigned long timeout)
 {
-	freezable_schedule_timeout_killable(timeout);
+	freezable_schedule_timeout_killable_unsafe(timeout);
 	timeout <<= 1;
 	if (timeout > NFS4_LOCK_MAXTIMEOUT)
 		return NFS4_LOCK_MAXTIMEOUT;
@@ -4423,12 +5047,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 	struct nfs4_unlockdata *calldata = data;
 
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
-		return;
+		goto out_wait;
 	if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
 		/* Note: exit _without_ running nfs4_locku_done */
-		task->tk_action = NULL;
-		nfs4_sequence_done(task, &calldata->res.seq_res);
-		return;
+		goto out_no_action;
 	}
 	calldata->timestamp = jiffies;
 	if (nfs4_setup_sequence(calldata->server,
@@ -4436,6 +5058,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 				&calldata->res.seq_res,
 				task) != 0)
 		nfs_release_seqid(calldata->arg.seqid);
+	return;
+out_no_action:
+	task->tk_action = NULL;
+out_wait:
+	nfs4_sequence_done(task, &calldata->res.seq_res);
 }
 
 static const struct rpc_call_ops nfs4_locku_ops = {
@@ -4482,7 +5109,9 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
 
 static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
-	struct nfs_inode *nfsi = NFS_I(state->inode);
+	struct inode *inode = state->inode;
+	struct nfs4_state_owner *sp = state->owner;
+	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_seqid *seqid;
 	struct nfs4_lock_state *lsp;
 	struct rpc_task *task;
@@ -4492,18 +5121,23 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	status = nfs4_set_lock_state(state, request);
 	/* Unlock _before_ we do the RPC call */
 	request->fl_flags |= FL_EXISTS;
+	/* Exclude nfs_delegation_claim_locks() */
+	mutex_lock(&sp->so_delegreturn_mutex);
+	/* Exclude nfs4_reclaim_open_stateid() - note nesting! */
 	down_read(&nfsi->rwsem);
 	if (do_vfs_lock(request->fl_file, request) == -ENOENT) {
 		up_read(&nfsi->rwsem);
+		mutex_unlock(&sp->so_delegreturn_mutex);
 		goto out;
 	}
 	up_read(&nfsi->rwsem);
+	mutex_unlock(&sp->so_delegreturn_mutex);
 	if (status != 0)
 		goto out;
 	/* Is this a delegated lock? */
-	if (test_bit(NFS_DELEGATED_STATE, &state->flags))
-		goto out;
 	lsp = request->fl_u.nfs4_fl.owner;
+	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0)
+		goto out;
 	seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
 	status = -ENOMEM;
 	if (seqid == NULL)
@@ -4576,26 +5210,34 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 
 	dprintk("%s: begin!\n", __func__);
 	if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
-		return;
+		goto out_wait;
 	/* Do we need to do an open_to_lock_owner? */
 	if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
 		if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
 			goto out_release_lock_seqid;
 		}
-		data->arg.open_stateid = &state->stateid;
+		data->arg.open_stateid = &state->open_stateid;
 		data->arg.new_lock_owner = 1;
 		data->res.open_seqid = data->arg.open_seqid;
 	} else
 		data->arg.new_lock_owner = 0;
+	if (!nfs4_valid_open_stateid(state)) {
+		data->rpc_status = -EBADF;
+		task->tk_action = NULL;
+		goto out_release_open_seqid;
+	}
 	data->timestamp = jiffies;
 	if (nfs4_setup_sequence(data->server,
 				&data->arg.seq_args,
 				&data->res.seq_res,
 				task) == 0)
 		return;
+out_release_open_seqid:
 	nfs_release_seqid(data->arg.open_seqid);
 out_release_lock_seqid:
 	nfs_release_seqid(data->arg.lock_seqid);
+out_wait:
+	nfs4_sequence_done(task, &data->res.seq_res);
 	dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
 }
 
@@ -4783,13 +5425,18 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
 
 	list_for_each_entry(lsp, &state->lock_states, ls_locks) {
 		if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
-			status = nfs41_test_stateid(server, &lsp->ls_stateid);
+			struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+			status = nfs41_test_stateid(server,
+					&lsp->ls_stateid,
+					cred);
 			if (status != NFS_OK) {
 				/* Free the stateid unless the server
 				 * informs us the stateid is unrecognized. */
 				if (status != -NFS4ERR_BAD_STATEID)
 					nfs41_free_stateid(server,
-							&lsp->ls_stateid);
+							&lsp->ls_stateid,
+							cred);
 				clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
 				ret = status;
 			}
@@ -4813,8 +5460,10 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques
 
 static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
+	struct nfs4_state_owner *sp = state->owner;
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	unsigned char fl_flags = request->fl_flags;
+	unsigned int seq;
 	int status = -ENOLCK;
 
 	if ((fl_flags & FL_POSIX) &&
@@ -4836,9 +5485,16 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 		status = do_vfs_lock(request->fl_file, request);
 		goto out_unlock;
 	}
+	seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+	up_read(&nfsi->rwsem);
 	status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
 	if (status != 0)
+		goto out;
+	down_read(&nfsi->rwsem);
+	if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) {
+		status = -NFS4ERR_DELAY;
 		goto out_unlock;
+	}
 	/* Note: we always want to sleep here! */
 	request->fl_flags = fl_flags | FL_SLEEP;
 	if (do_vfs_lock(request->fl_file, request) < 0)
@@ -4927,61 +5583,16 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
 	return status;
 }
 
-int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
+int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
-	struct nfs4_exception exception = { };
 	int err;
 
 	err = nfs4_set_lock_state(state, fl);
 	if (err != 0)
-		goto out;
-	do {
-		err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
-		switch (err) {
-			default:
-				printk(KERN_ERR "NFS: %s: unhandled error "
-					"%d.\n", __func__, err);
-			case 0:
-			case -ESTALE:
-				goto out;
-			case -NFS4ERR_EXPIRED:
-				nfs4_schedule_stateid_recovery(server, state);
-			case -NFS4ERR_STALE_CLIENTID:
-			case -NFS4ERR_STALE_STATEID:
-				nfs4_schedule_lease_recovery(server->nfs_client);
-				goto out;
-			case -NFS4ERR_BADSESSION:
-			case -NFS4ERR_BADSLOT:
-			case -NFS4ERR_BAD_HIGH_SLOT:
-			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
-			case -NFS4ERR_DEADSESSION:
-				nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
-				goto out;
-			case -ERESTARTSYS:
-				/*
-				 * The show must go on: exit, but mark the
-				 * stateid as needing recovery.
-				 */
-			case -NFS4ERR_DELEG_REVOKED:
-			case -NFS4ERR_ADMIN_REVOKED:
-			case -NFS4ERR_BAD_STATEID:
-			case -NFS4ERR_OPENMODE:
-				nfs4_schedule_stateid_recovery(server, state);
-				err = 0;
-				goto out;
-			case -ENOMEM:
-			case -NFS4ERR_DENIED:
-				/* kill_proc(fl->fl_pid, SIGLOST, 1); */
-				err = 0;
-				goto out;
-			case -NFS4ERR_DELAY:
-				break;
-		}
-		err = nfs4_handle_exception(server, err, &exception);
-	} while (exception.retry);
-out:
-	return err;
+		return err;
+	err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
+	return nfs4_handle_delegation_recall_error(server, state, stateid, err);
 }
 
 struct nfs_release_lockowner_data {
@@ -5001,9 +5612,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = {
 	.rpc_release = nfs4_release_lockowner_release,
 };
 
-int nfs4_release_lockowner(struct nfs4_lock_state *lsp)
+static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
 {
-	struct nfs_server *server = lsp->ls_state->owner->so_server;
 	struct nfs_release_lockowner_data *data;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
@@ -5059,6 +5669,53 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
 	return len;
 }
 
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline int nfs4_server_supports_labels(struct nfs_server *server)
+{
+	return server->caps & NFS_CAP_SECURITY_LABEL;
+}
+
+static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
+				   const void *buf, size_t buflen,
+				   int flags, int type)
+{
+	if (security_ismaclabel(key))
+		return nfs4_set_security_label(dentry, buf, buflen);
+
+	return -EOPNOTSUPP;
+}
+
+static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
+				   void *buf, size_t buflen, int type)
+{
+	if (security_ismaclabel(key))
+		return nfs4_get_security_label(dentry->d_inode, buf, buflen);
+	return -EOPNOTSUPP;
+}
+
+static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
+				       size_t list_len, const char *name,
+				       size_t name_len, int type)
+{
+	size_t len = 0;
+
+	if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) {
+		len = security_inode_listsecurity(dentry->d_inode, NULL, 0);
+		if (list && len <= list_len)
+			security_inode_listsecurity(dentry->d_inode, list, len);
+	}
+	return len;
+}
+
+static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.list	= nfs4_xattr_list_nfs4_label,
+	.get	= nfs4_xattr_get_nfs4_label,
+	.set	= nfs4_xattr_set_nfs4_label,
+};
+#endif
+
+
 /*
  * nfs_fhget will use either the mounted_on_fileid or the fileid
  */
@@ -5082,7 +5739,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
 				   struct page *page)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
-	u32 bitmask[2] = {
+	u32 bitmask[3] = {
 		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
 	};
 	struct nfs4_fs_locations_arg args = {
@@ -5269,7 +5926,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 	struct nfs41_exchange_id_args args = {
 		.verifier = &verifier,
 		.client = clp,
-		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER,
+		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+			EXCHGID4_FLAG_BIND_PRINC_STATEID,
 	};
 	struct nfs41_exchange_id_res res = {
 		0
@@ -5526,17 +6184,14 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
  */
 static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 {
-	struct nfs4_session *session = args->client->cl_session;
-	unsigned int mxrqst_sz = session->fc_target_max_rqst_sz,
-		     mxresp_sz = session->fc_target_max_resp_sz;
+	unsigned int max_rqst_sz, max_resp_sz;
+
+	max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
+	max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
 
-	if (mxrqst_sz == 0)
-		mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
-	if (mxresp_sz == 0)
-		mxresp_sz = NFS_MAX_FILE_IO_SIZE;
 	/* Fore channel attributes */
-	args->fc_attrs.max_rqst_sz = mxrqst_sz;
-	args->fc_attrs.max_resp_sz = mxresp_sz;
+	args->fc_attrs.max_rqst_sz = max_rqst_sz;
+	args->fc_attrs.max_resp_sz = max_resp_sz;
 	args->fc_attrs.max_ops = NFS4_MAX_OPS;
 	args->fc_attrs.max_reqs = max_session_slots;
 
@@ -5795,7 +6450,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
 		.rpc_client = clp->cl_rpcclient,
 		.rpc_message = &msg,
 		.callback_ops = &nfs41_sequence_ops,
-		.flags = RPC_TASK_ASYNC | RPC_TASK_SOFT,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
 	};
 
 	if (!atomic_inc_not_zero(&clp->cl_count))
@@ -5923,12 +6578,14 @@ static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
 /*
  * Issue a global reclaim complete.
  */
-static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
+static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
+		struct rpc_cred *cred)
 {
 	struct nfs4_reclaim_complete_data *calldata;
 	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
+		.rpc_cred = cred,
 	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = clp->cl_rpcclient,
@@ -5995,6 +6652,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct pnfs_layout_hdr *lo;
 	struct nfs4_state *state = NULL;
+	unsigned long timeo, giveup;
 
 	dprintk("--> %s\n", __func__);
 
@@ -6006,7 +6664,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 		goto out;
 	case -NFS4ERR_LAYOUTTRYLATER:
 	case -NFS4ERR_RECALLCONFLICT:
-		task->tk_status = -NFS4ERR_DELAY;
+		timeo = rpc_get_timeout(task->tk_client);
+		giveup = lgp->args.timestamp + timeo;
+		if (time_after(giveup, jiffies))
+			task->tk_status = -NFS4ERR_DELAY;
 		break;
 	case -NFS4ERR_EXPIRED:
 	case -NFS4ERR_BAD_STATEID:
@@ -6079,11 +6740,13 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
 static void nfs4_layoutget_release(void *calldata)
 {
 	struct nfs4_layoutget *lgp = calldata;
-	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+	struct inode *inode = lgp->args.inode;
+	struct nfs_server *server = NFS_SERVER(inode);
 	size_t max_pages = max_response_pages(server);
 
 	dprintk("--> %s\n", __func__);
 	nfs4_free_pages(lgp->args.layout.pages, max_pages);
+	pnfs_put_layout_hdr(NFS_I(inode)->layout);
 	put_nfs_open_context(lgp->args.ctx);
 	kfree(calldata);
 	dprintk("<-- %s\n", __func__);
@@ -6098,13 +6761,15 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
 struct pnfs_layout_segment *
 nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 {
-	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+	struct inode *inode = lgp->args.inode;
+	struct nfs_server *server = NFS_SERVER(inode);
 	size_t max_pages = max_response_pages(server);
 	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
 		.rpc_argp = &lgp->args,
 		.rpc_resp = &lgp->res,
+		.rpc_cred = lgp->cred,
 	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = server->client,
@@ -6124,17 +6789,23 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 		return ERR_PTR(-ENOMEM);
 	}
 	lgp->args.layout.pglen = max_pages * PAGE_SIZE;
+	lgp->args.timestamp = jiffies;
 
 	lgp->res.layoutp = &lgp->args.layout;
 	lgp->res.seq_res.sr_slot = NULL;
 	nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
+
+	/* nfs4_layoutget_release calls pnfs_put_layout_hdr */
+	pnfs_get_layout_hdr(NFS_I(inode)->layout);
+
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return ERR_CAST(task);
 	status = nfs4_wait_for_completion_rpc_task(task);
 	if (status == 0)
 		status = task->tk_status;
-	if (status == 0)
+	/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
+	if (status == 0 && lgp->res.layoutp->len)
 		lseg = pnfs_layout_process(lgp);
 	rpc_put_task(task);
 	dprintk("<-- %s status=%d\n", __func__, status);
@@ -6202,6 +6873,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
 		.rpc_argp = &lrp->args,
 		.rpc_resp = &lrp->res,
+		.rpc_cred = lrp->cred,
 	};
 	struct rpc_task_setup task_setup_data = {
 		.rpc_client = lrp->clp->cl_rpcclient,
@@ -6271,7 +6943,9 @@ int nfs4_proc_getdevicelist(struct nfs_server *server,
 EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
 
 static int
-_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+_nfs4_proc_getdeviceinfo(struct nfs_server *server,
+		struct pnfs_device *pdev,
+		struct rpc_cred *cred)
 {
 	struct nfs4_getdeviceinfo_args args = {
 		.pdev = pdev,
@@ -6283,6 +6957,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
 		.rpc_argp = &args,
 		.rpc_resp = &res,
+		.rpc_cred = cred,
 	};
 	int status;
 
@@ -6293,14 +6968,16 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
 	return status;
 }
 
-int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+int nfs4_proc_getdeviceinfo(struct nfs_server *server,
+		struct pnfs_device *pdev,
+		struct rpc_cred *cred)
 {
 	struct nfs4_exception exception = { };
 	int err;
 
 	do {
 		err = nfs4_handle_exception(server,
-					_nfs4_proc_getdeviceinfo(server, pdev),
+					_nfs4_proc_getdeviceinfo(server, pdev, cred),
 					&exception);
 	} while (exception.retry);
 	return err;
@@ -6350,22 +7027,8 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
 static void nfs4_layoutcommit_release(void *calldata)
 {
 	struct nfs4_layoutcommit_data *data = calldata;
-	struct pnfs_layout_segment *lseg, *tmp;
-	unsigned long *bitlock = &NFS_I(data->args.inode)->flags;
 
 	pnfs_cleanup_layoutcommit(data);
-	/* Matched by references in pnfs_set_layoutcommit */
-	list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) {
-		list_del_init(&lseg->pls_lc_list);
-		if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT,
-				       &lseg->pls_flags))
-			pnfs_put_lseg(lseg);
-	}
-
-	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
-	smp_mb__after_clear_bit();
-	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
-
 	put_rpccred(data->cred);
 	kfree(data);
 }
@@ -6498,7 +7161,9 @@ out:
 	return err;
 }
 
-static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int _nfs41_test_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred)
 {
 	int status;
 	struct nfs41_test_stateid_args args = {
@@ -6509,6 +7174,7 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
 		.rpc_argp = &args,
 		.rpc_resp = &res,
+		.rpc_cred = cred,
 	};
 
 	dprintk("NFS call  test_stateid %p\n", stateid);
@@ -6529,17 +7195,20 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
  *
  * @server: server / transport on which to perform the operation
  * @stateid: state ID to test
+ * @cred: credential
  *
  * Returns NFS_OK if the server recognizes that "stateid" is valid.
  * Otherwise a negative NFS4ERR value is returned if the operation
  * failed or the state ID is not currently valid.
  */
-static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int nfs41_test_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred)
 {
 	struct nfs4_exception exception = { };
 	int err;
 	do {
-		err = _nfs41_test_stateid(server, stateid);
+		err = _nfs41_test_stateid(server, stateid, cred);
 		if (err != -NFS4ERR_DELAY)
 			break;
 		nfs4_handle_exception(server, err, &exception);
@@ -6547,26 +7216,78 @@ static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
 	return err;
 }
 
-static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
-{
-	struct nfs41_free_stateid_args args = {
-		.stateid = stateid,
-	};
+struct nfs_free_stateid_data {
+	struct nfs_server *server;
+	struct nfs41_free_stateid_args args;
 	struct nfs41_free_stateid_res res;
+};
+
+static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs_free_stateid_data *data = calldata;
+	nfs41_setup_sequence(nfs4_get_session(data->server),
+			&data->args.seq_args,
+			&data->res.seq_res,
+			task);
+}
+
+static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs_free_stateid_data *data = calldata;
+
+	nfs41_sequence_done(task, &data->res.seq_res);
+
+	switch (task->tk_status) {
+	case -NFS4ERR_DELAY:
+		if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN)
+			rpc_restart_call_prepare(task);
+	}
+}
+
+static void nfs41_free_stateid_release(void *calldata)
+{
+	kfree(calldata);
+}
+
+const struct rpc_call_ops nfs41_free_stateid_ops = {
+	.rpc_call_prepare = nfs41_free_stateid_prepare,
+	.rpc_call_done = nfs41_free_stateid_done,
+	.rpc_release = nfs41_free_stateid_release,
+};
+
+static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred,
+		bool privileged)
+{
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
-		.rpc_argp = &args,
-		.rpc_resp = &res,
+		.rpc_cred = cred,
 	};
-	int status;
+	struct rpc_task_setup task_setup = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs41_free_stateid_ops,
+		.flags = RPC_TASK_ASYNC,
+	};
+	struct nfs_free_stateid_data *data;
 
 	dprintk("NFS call  free_stateid %p\n", stateid);
-	nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
-	nfs4_set_sequence_privileged(&args.seq_args);
-	status = nfs4_call_sync_sequence(server->client, server, &msg,
-			&args.seq_args, &res.seq_res);
-	dprintk("NFS reply free_stateid: %d\n", status);
-	return status;
+	data = kmalloc(sizeof(*data), GFP_NOFS);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+	data->server = server;
+	nfs4_stateid_copy(&data->args.stateid, stateid);
+
+	task_setup.callback_data = data;
+
+	msg.rpc_argp = &data->args;
+	msg.rpc_resp = &data->res;
+	nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+	if (privileged)
+		nfs4_set_sequence_privileged(&data->args.seq_args);
+
+	return rpc_run_task(&task_setup);
 }
 
 /**
@@ -6574,21 +7295,39 @@ static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
  *
  * @server: server / transport on which to perform the operation
  * @stateid: state ID to release
+ * @cred: credential
  *
  * Returns NFS_OK if the server freed "stateid".  Otherwise a
  * negative NFS4ERR value is returned.
  */
-static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
+static int nfs41_free_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred)
 {
-	struct nfs4_exception exception = { };
-	int err;
-	do {
-		err = _nfs4_free_stateid(server, stateid);
-		if (err != -NFS4ERR_DELAY)
-			break;
-		nfs4_handle_exception(server, err, &exception);
-	} while (exception.retry);
-	return err;
+	struct rpc_task *task;
+	int ret;
+
+	task = _nfs41_free_stateid(server, stateid, cred, true);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	ret = rpc_wait_for_completion_task(task);
+	if (!ret)
+		ret = task->tk_status;
+	rpc_put_task(task);
+	return ret;
+}
+
+static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+{
+	struct rpc_task *task;
+	struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+	task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
+	nfs4_free_lock_state(server, lsp);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	rpc_put_task(task);
+	return 0;
 }
 
 static bool nfs41_match_stateid(const nfs4_stateid *s1,
@@ -6673,9 +7412,14 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
 
 static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.minor_version = 0,
+	.init_caps = NFS_CAP_READDIRPLUS
+		| NFS_CAP_ATOMIC_OPEN
+		| NFS_CAP_CHANGE_ATTR
+		| NFS_CAP_POSIX_LOCK,
 	.call_sync = _nfs4_call_sync,
 	.match_stateid = nfs4_match_stateid,
 	.find_root_sec = nfs4_find_root_sec,
+	.free_lock_state = nfs4_release_lockowner,
 	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
 	.state_renewal_ops = &nfs40_state_renewal_ops,
@@ -6684,9 +7428,35 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 #if defined(CONFIG_NFS_V4_1)
 static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.minor_version = 1,
+	.init_caps = NFS_CAP_READDIRPLUS
+		| NFS_CAP_ATOMIC_OPEN
+		| NFS_CAP_CHANGE_ATTR
+		| NFS_CAP_POSIX_LOCK
+		| NFS_CAP_STATEID_NFSV41
+		| NFS_CAP_ATOMIC_OPEN_V1,
+	.call_sync = nfs4_call_sync_sequence,
+	.match_stateid = nfs41_match_stateid,
+	.find_root_sec = nfs41_find_root_sec,
+	.free_lock_state = nfs41_free_lock_state,
+	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+	.state_renewal_ops = &nfs41_state_renewal_ops,
+};
+#endif
+
+#if defined(CONFIG_NFS_V4_2)
+static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
+	.minor_version = 2,
+	.init_caps = NFS_CAP_READDIRPLUS
+		| NFS_CAP_ATOMIC_OPEN
+		| NFS_CAP_CHANGE_ATTR
+		| NFS_CAP_POSIX_LOCK
+		| NFS_CAP_STATEID_NFSV41
+		| NFS_CAP_ATOMIC_OPEN_V1,
 	.call_sync = nfs4_call_sync_sequence,
 	.match_stateid = nfs41_match_stateid,
 	.find_root_sec = nfs41_find_root_sec,
+	.free_lock_state = nfs41_free_lock_state,
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
 	.state_renewal_ops = &nfs41_state_renewal_ops,
@@ -6698,6 +7468,9 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
 #if defined(CONFIG_NFS_V4_1)
 	[1] = &nfs_v4_1_minor_ops,
 #endif
+#if defined(CONFIG_NFS_V4_2)
+	[2] = &nfs_v4_2_minor_ops,
+#endif
 };
 
 const struct inode_operations nfs4_dir_inode_operations = {
@@ -6797,6 +7570,9 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
 
 const struct xattr_handler *nfs4_xattr_handlers[] = {
 	&nfs4_xattr_nfs4_acl_handler,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+	&nfs4_xattr_nfs4_label_handler,
+#endif
 	NULL
 };
 
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index ebda5f4a031..36e21cb29d6 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -73,7 +73,7 @@ void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
 			tbl->highest_used_slotid = new_max;
 		else {
 			tbl->highest_used_slotid = NFS4_NO_SLOT;
-			nfs4_session_drain_complete(tbl->session, tbl);
+			nfs4_slot_tbl_drain_complete(tbl);
 		}
 	}
 	dprintk("%s: slotid %u highest_used_slotid %d\n", __func__,
@@ -226,7 +226,7 @@ static bool nfs41_assign_slot(struct rpc_task *task, void *pslot)
 	struct nfs4_slot *slot = pslot;
 	struct nfs4_slot_table *tbl = slot->table;
 
-	if (nfs4_session_draining(tbl->session) && !args->sa_privileged)
+	if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged)
 		return false;
 	slot->generation = tbl->generation;
 	args->sa_slot = slot;
@@ -478,48 +478,12 @@ static int nfs41_check_session_ready(struct nfs_client *clp)
 	return 0;
 }
 
-int nfs4_init_session(struct nfs_server *server)
+int nfs4_init_session(struct nfs_client *clp)
 {
-	struct nfs_client *clp = server->nfs_client;
-	struct nfs4_session *session;
-	unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE;
-	unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE;
-
 	if (!nfs4_has_session(clp))
 		return 0;
 
-	if (server->rsize != 0)
-		target_max_resp_sz = server->rsize;
-	target_max_resp_sz += nfs41_maxread_overhead;
-
-	if (server->wsize != 0)
-		target_max_rqst_sz = server->wsize;
-	target_max_rqst_sz += nfs41_maxwrite_overhead;
-
-	session = clp->cl_session;
-	spin_lock(&clp->cl_lock);
-	if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
-		/* Initialise targets and channel attributes */
-		session->fc_target_max_rqst_sz = target_max_rqst_sz;
-		session->fc_attrs.max_rqst_sz = target_max_rqst_sz;
-		session->fc_target_max_resp_sz = target_max_resp_sz;
-		session->fc_attrs.max_resp_sz = target_max_resp_sz;
-	} else {
-		/* Just adjust the targets */
-		if (target_max_rqst_sz > session->fc_target_max_rqst_sz) {
-			session->fc_target_max_rqst_sz = target_max_rqst_sz;
-			set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
-		}
-		if (target_max_resp_sz > session->fc_target_max_resp_sz) {
-			session->fc_target_max_resp_sz = target_max_resp_sz;
-			set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
-		}
-	}
-	spin_unlock(&clp->cl_lock);
-
-	if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
-		nfs4_schedule_lease_recovery(clp);
-
+	clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state);
 	return nfs41_check_session_ready(clp);
 }
 
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 6f3cb39386d..3a153d82b90 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -25,6 +25,10 @@ struct nfs4_slot {
 };
 
 /* Sessions */
+enum nfs4_slot_tbl_state {
+	NFS4_SLOT_TBL_DRAINING,
+};
+
 #define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long))
 struct nfs4_slot_table {
 	struct nfs4_session *session;		/* Parent session */
@@ -43,6 +47,7 @@ struct nfs4_slot_table {
 	unsigned long	generation;		/* Generation counter for
 						   target_highest_slotid */
 	struct completion complete;
+	unsigned long	slot_tbl_state;
 };
 
 /*
@@ -61,14 +66,10 @@ struct nfs4_session {
 	struct nfs4_channel_attrs	bc_attrs;
 	struct nfs4_slot_table		bc_slot_table;
 	struct nfs_client		*clp;
-	/* Create session arguments */
-	unsigned int			fc_target_max_rqst_sz;
-	unsigned int			fc_target_max_resp_sz;
 };
 
 enum nfs4_session_state {
 	NFS4_SESSION_INITING,
-	NFS4_SESSION_DRAINING,
 };
 
 #if defined(CONFIG_NFS_V4_1)
@@ -85,15 +86,14 @@ extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
 
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
 extern void nfs4_destroy_session(struct nfs4_session *session);
-extern int nfs4_init_session(struct nfs_server *server);
+extern int nfs4_init_session(struct nfs_client *clp);
 extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
 
-extern void nfs4_session_drain_complete(struct nfs4_session *session,
-		struct nfs4_slot_table *tbl);
+extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
 
-static inline bool nfs4_session_draining(struct nfs4_session *session)
+static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
 {
-	return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state);
+	return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
 }
 
 bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
@@ -119,7 +119,7 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
 
 #else /* defined(CONFIG_NFS_V4_1) */
 
-static inline int nfs4_init_session(struct nfs_server *server)
+static inline int nfs4_init_session(struct nfs_client *clp)
 {
 	return 0;
 }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e61f68d5ef2..e22862f1356 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -154,18 +154,6 @@ struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
 	return cred;
 }
 
-static void nfs4_clear_machine_cred(struct nfs_client *clp)
-{
-	struct rpc_cred *cred;
-
-	spin_lock(&clp->cl_lock);
-	cred = clp->cl_machine_cred;
-	clp->cl_machine_cred = NULL;
-	spin_unlock(&clp->cl_lock);
-	if (cred != NULL)
-		put_rpccred(cred);
-}
-
 static struct rpc_cred *
 nfs4_get_renew_cred_server_locked(struct nfs_server *server)
 {
@@ -240,38 +228,37 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
 	return status;
 }
 
-/*
- * Back channel returns NFS4ERR_DELAY for new requests when
- * NFS4_SESSION_DRAINING is set so there is no work to be done when draining
- * is ended.
- */
-static void nfs4_end_drain_session(struct nfs_client *clp)
+static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
 {
-	struct nfs4_session *ses = clp->cl_session;
-	struct nfs4_slot_table *tbl;
-
-	if (ses == NULL)
-		return;
-	tbl = &ses->fc_slot_table;
-	if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
+	if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
 		spin_lock(&tbl->slot_tbl_lock);
 		nfs41_wake_slot_table(tbl);
 		spin_unlock(&tbl->slot_tbl_lock);
 	}
 }
 
+static void nfs4_end_drain_session(struct nfs_client *clp)
+{
+	struct nfs4_session *ses = clp->cl_session;
+
+	if (ses != NULL) {
+		nfs4_end_drain_slot_table(&ses->bc_slot_table);
+		nfs4_end_drain_slot_table(&ses->fc_slot_table);
+	}
+}
+
 /*
  * Signal state manager thread if session fore channel is drained
  */
-void nfs4_session_drain_complete(struct nfs4_session *session,
-		struct nfs4_slot_table *tbl)
+void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl)
 {
-	if (nfs4_session_draining(session))
+	if (nfs4_slot_tbl_draining(tbl))
 		complete(&tbl->complete);
 }
 
-static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
+static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
 {
+	set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
 	spin_lock(&tbl->slot_tbl_lock);
 	if (tbl->highest_used_slotid != NFS4_NO_SLOT) {
 		INIT_COMPLETION(tbl->complete);
@@ -287,13 +274,12 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
 	struct nfs4_session *ses = clp->cl_session;
 	int ret = 0;
 
-	set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
 	/* back channel */
-	ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table);
+	ret = nfs4_drain_slot_tbl(&ses->bc_slot_table);
 	if (ret)
 		return ret;
 	/* fore channel */
-	return nfs4_wait_on_slot_tbl(&ses->fc_slot_table);
+	return nfs4_drain_slot_tbl(&ses->fc_slot_table);
 }
 
 static void nfs41_finish_session_reset(struct nfs_client *clp)
@@ -518,6 +504,8 @@ nfs4_alloc_state_owner(struct nfs_server *server,
 	nfs4_init_seqid_counter(&sp->so_seqid);
 	atomic_set(&sp->so_count, 1);
 	INIT_LIST_HEAD(&sp->so_lru);
+	seqcount_init(&sp->so_reclaim_seqcount);
+	mutex_init(&sp->so_delegreturn_mutex);
 	return sp;
 }
 
@@ -697,6 +685,8 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner)
 	list_for_each_entry(state, &nfsi->open_states, inode_states) {
 		if (state->owner != owner)
 			continue;
+		if (!nfs4_valid_open_stateid(state))
+			continue;
 		if (atomic_inc_not_zero(&state->count))
 			return state;
 	}
@@ -929,6 +919,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
  */
 void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
 {
+	struct nfs_server *server;
 	struct nfs4_state *state;
 
 	if (lsp == NULL)
@@ -940,11 +931,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
 	if (list_empty(&state->lock_states))
 		clear_bit(LK_STATE_IN_USE, &state->flags);
 	spin_unlock(&state->state_lock);
+	server = state->owner->so_server;
 	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
-		if (nfs4_release_lockowner(lsp) == 0)
-			return;
-	}
-	nfs4_free_lock_state(lsp->ls_state->owner->so_server, lsp);
+		struct nfs_client *clp = server->nfs_client;
+
+		clp->cl_mvops->free_lock_state(server, lsp);
+	} else
+		nfs4_free_lock_state(server, lsp);
 }
 
 static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
@@ -985,13 +978,14 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
 	return 0;
 }
 
-static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state,
+static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
+		struct nfs4_state *state,
 		const struct nfs_lockowner *lockowner)
 {
 	struct nfs4_lock_state *lsp;
 	fl_owner_t fl_owner;
 	pid_t fl_pid;
-	bool ret = false;
+	int ret = -ENOENT;
 
 
 	if (lockowner == NULL)
@@ -1006,7 +1000,10 @@ static bool nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state,
 	lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
 	if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
 		nfs4_stateid_copy(dst, &lsp->ls_stateid);
-		ret = true;
+		ret = 0;
+		smp_rmb();
+		if (!list_empty(&lsp->ls_seqid.list))
+			ret = -EWOULDBLOCK;
 	}
 	spin_unlock(&state->state_lock);
 	nfs4_put_lock_state(lsp);
@@ -1014,28 +1011,44 @@ out:
 	return ret;
 }
 
-static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
+static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
 {
+	const nfs4_stateid *src;
+	int ret;
 	int seq;
 
 	do {
+		src = &zero_stateid;
 		seq = read_seqbegin(&state->seqlock);
-		nfs4_stateid_copy(dst, &state->stateid);
+		if (test_bit(NFS_OPEN_STATE, &state->flags))
+			src = &state->open_stateid;
+		nfs4_stateid_copy(dst, src);
+		ret = 0;
+		smp_rmb();
+		if (!list_empty(&state->owner->so_seqid.list))
+			ret = -EWOULDBLOCK;
 	} while (read_seqretry(&state->seqlock, seq));
+	return ret;
 }
 
 /*
  * Byte-range lock aware utility to initialize the stateid of read/write
  * requests.
  */
-void nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
+int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
 		fmode_t fmode, const struct nfs_lockowner *lockowner)
 {
+	int ret = 0;
 	if (nfs4_copy_delegation_stateid(dst, state->inode, fmode))
-		return;
-	if (nfs4_copy_lock_stateid(dst, state, lockowner))
-		return;
-	nfs4_copy_open_stateid(dst, state);
+		goto out;
+	ret = nfs4_copy_lock_stateid(dst, state, lockowner);
+	if (ret != -ENOENT)
+		goto out;
+	ret = nfs4_copy_open_stateid(dst, state);
+out:
+	if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41))
+		dst->seqid = 0;
+	return ret;
 }
 
 struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask)
@@ -1180,7 +1193,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
 	snprintf(buf, sizeof(buf), "%s-manager",
 			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
 	rcu_read_unlock();
-	task = kthread_run(nfs4_run_state_manager, clp, buf);
+	task = kthread_run(nfs4_run_state_manager, clp, "%s", buf);
 	if (IS_ERR(task)) {
 		printk(KERN_ERR "%s: kthread_run: %ld\n",
 			__func__, PTR_ERR(task));
@@ -1284,14 +1297,17 @@ static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_s
 	return 1;
 }
 
-void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
+int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
 {
 	struct nfs_client *clp = server->nfs_client;
 
+	if (!nfs4_valid_open_stateid(state))
+		return -EBADF;
 	nfs4_state_mark_reclaim_nograce(clp, state);
 	dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
 			clp->cl_hostname);
 	nfs4_schedule_state_manager(clp);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
 
@@ -1321,6 +1337,27 @@ void nfs_inode_find_state_and_recover(struct inode *inode,
 		nfs4_schedule_state_manager(clp);
 }
 
+static void nfs4_state_mark_open_context_bad(struct nfs4_state *state)
+{
+	struct inode *inode = state->inode;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_open_context *ctx;
+
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(ctx, &nfsi->open_files, list) {
+		if (ctx->state != state)
+			continue;
+		set_bit(NFS_CONTEXT_BAD, &ctx->flags);
+	}
+	spin_unlock(&inode->i_lock);
+}
+
+static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error)
+{
+	set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags);
+	nfs4_state_mark_open_context_bad(state);
+}
+
 
 static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
 {
@@ -1335,13 +1372,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 	/* Guard against delegation returns and new lock/unlock calls */
 	down_write(&nfsi->rwsem);
 	/* Protect inode->i_flock using the BKL */
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
 		if (nfs_file_open_context(fl->fl_file)->state != state)
 			continue;
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		status = ops->recover_lock(state, fl);
 		switch (status) {
 			case 0:
@@ -1368,9 +1405,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 				/* kill_proc(fl->fl_pid, SIGLOST, 1); */
 				status = 0;
 		}
-		lock_flocks();
+		spin_lock(&inode->i_lock);
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 out:
 	up_write(&nfsi->rwsem);
 	return status;
@@ -1390,11 +1427,14 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
 	 * recovering after a network partition or a reboot from a
 	 * server that doesn't support a grace period.
 	 */
-restart:
 	spin_lock(&sp->so_lock);
+	write_seqcount_begin(&sp->so_reclaim_seqcount);
+restart:
 	list_for_each_entry(state, &sp->so_states, open_states) {
 		if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
 			continue;
+		if (!nfs4_valid_open_stateid(state))
+			continue;
 		if (state->state == 0)
 			continue;
 		atomic_inc(&state->count);
@@ -1412,6 +1452,7 @@ restart:
 				}
 				spin_unlock(&state->state_lock);
 				nfs4_put_open_state(state);
+				spin_lock(&sp->so_lock);
 				goto restart;
 			}
 		}
@@ -1426,11 +1467,10 @@ restart:
 				 * Open state on this file cannot be recovered
 				 * All we can do is revert to using the zero stateid.
 				 */
-				memset(&state->stateid, 0,
-					sizeof(state->stateid));
-				/* Mark the file as being 'closed' */
-				state->state = 0;
+				nfs4_state_mark_recovery_failed(state, status);
 				break;
+			case -EAGAIN:
+				ssleep(1);
 			case -NFS4ERR_ADMIN_REVOKED:
 			case -NFS4ERR_STALE_STATEID:
 			case -NFS4ERR_BAD_STATEID:
@@ -1449,12 +1489,17 @@ restart:
 				goto out_err;
 		}
 		nfs4_put_open_state(state);
+		spin_lock(&sp->so_lock);
 		goto restart;
 	}
+	write_seqcount_end(&sp->so_reclaim_seqcount);
 	spin_unlock(&sp->so_lock);
 	return 0;
 out_err:
 	nfs4_put_open_state(state);
+	spin_lock(&sp->so_lock);
+	write_seqcount_end(&sp->so_reclaim_seqcount);
+	spin_unlock(&sp->so_lock);
 	return status;
 }
 
@@ -1517,11 +1562,12 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
 }
 
 static void nfs4_reclaim_complete(struct nfs_client *clp,
-				 const struct nfs4_state_recovery_ops *ops)
+				 const struct nfs4_state_recovery_ops *ops,
+				 struct rpc_cred *cred)
 {
 	/* Notify the server we're done reclaiming our state */
 	if (ops->reclaim_complete)
-		(void)ops->reclaim_complete(clp);
+		(void)ops->reclaim_complete(clp, cred);
 }
 
 static void nfs4_clear_reclaim_server(struct nfs_server *server)
@@ -1566,9 +1612,15 @@ static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
 
 static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 {
+	const struct nfs4_state_recovery_ops *ops;
+	struct rpc_cred *cred;
+
 	if (!nfs4_state_clear_reclaim_reboot(clp))
 		return;
-	nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
+	ops = clp->cl_mvops->reboot_recovery_ops;
+	cred = ops->get_clid_cred(clp);
+	nfs4_reclaim_complete(clp, ops, cred);
+	put_rpccred(cred);
 }
 
 static void nfs_delegation_clear_all(struct nfs_client *clp)
@@ -1687,6 +1739,10 @@ static int nfs4_check_lease(struct nfs_client *clp)
 	}
 	status = ops->renew_lease(clp, cred);
 	put_rpccred(cred);
+	if (status == -ETIMEDOUT) {
+		set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+		return 0;
+	}
 out:
 	return nfs4_recovery_handle_error(clp, status);
 }
@@ -1716,10 +1772,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
 		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
 		return -EPERM;
 	case -EACCES:
-		if (clp->cl_machine_cred == NULL)
-			return -EACCES;
-		/* Handle case where the user hasn't set up machine creds */
-		nfs4_clear_machine_cred(clp);
 	case -NFS4ERR_DELAY:
 	case -ETIMEDOUT:
 	case -EAGAIN:
@@ -1814,31 +1866,18 @@ int nfs4_discover_server_trunking(struct nfs_client *clp,
 {
 	const struct nfs4_state_recovery_ops *ops =
 				clp->cl_mvops->reboot_recovery_ops;
-	rpc_authflavor_t *flavors, flav, save;
 	struct rpc_clnt *clnt;
 	struct rpc_cred *cred;
-	int i, len, status;
+	int i, status;
 
 	dprintk("NFS: %s: testing '%s'\n", __func__, clp->cl_hostname);
 
-	len = NFS_MAX_SECFLAVORS;
-	flavors = kcalloc(len, sizeof(*flavors), GFP_KERNEL);
-	if (flavors == NULL) {
-		status = -ENOMEM;
-		goto out;
-	}
-	len = rpcauth_list_flavors(flavors, len);
-	if (len < 0) {
-		status = len;
-		goto out_free;
-	}
 	clnt = clp->cl_rpcclient;
-	save = clnt->cl_auth->au_flavor;
 	i = 0;
 
 	mutex_lock(&nfs_clid_init_mutex);
-	status  = -ENOENT;
 again:
+	status  = -ENOENT;
 	cred = ops->get_clid_cred(clp);
 	if (cred == NULL)
 		goto out_unlock;
@@ -1848,12 +1887,6 @@ again:
 	switch (status) {
 	case 0:
 		break;
-
-	case -EACCES:
-		if (clp->cl_machine_cred == NULL)
-			break;
-		/* Handle case where the user hasn't set up machine creds */
-		nfs4_clear_machine_cred(clp);
 	case -NFS4ERR_DELAY:
 	case -ETIMEDOUT:
 	case -EAGAIN:
@@ -1862,22 +1895,23 @@ again:
 		dprintk("NFS: %s after status %d, retrying\n",
 			__func__, status);
 		goto again;
-
+	case -EACCES:
+		if (i++)
+			break;
 	case -NFS4ERR_CLID_INUSE:
 	case -NFS4ERR_WRONGSEC:
-		status = -EPERM;
-		if (i >= len)
-			break;
-
-		flav = flavors[i++];
-		if (flav == save)
-			flav = flavors[i++];
-		clnt = rpc_clone_client_set_auth(clnt, flav);
+		clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX);
 		if (IS_ERR(clnt)) {
 			status = PTR_ERR(clnt);
 			break;
 		}
-		clp->cl_rpcclient = clnt;
+		/* Note: this is safe because we haven't yet marked the
+		 * client as ready, so we are the only user of
+		 * clp->cl_rpcclient
+		 */
+		clnt = xchg(&clp->cl_rpcclient, clnt);
+		rpc_shutdown_client(clnt);
+		clnt = clp->cl_rpcclient;
 		goto again;
 
 	case -NFS4ERR_MINOR_VERS_MISMATCH:
@@ -1888,13 +1922,15 @@ again:
 	case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
 				 * in nfs4_exchange_id */
 		status = -EKEYEXPIRED;
+		break;
+	default:
+		pr_warn("NFS: %s unhandled error %d. Exiting with error EIO\n",
+				__func__, status);
+		status = -EIO;
 	}
 
 out_unlock:
 	mutex_unlock(&nfs_clid_init_mutex);
-out_free:
-	kfree(flavors);
-out:
 	dprintk("NFS: %s: status = %d\n", __func__, status);
 	return status;
 }
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 84d2e9e2f31..5dbe2d26921 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -9,6 +9,7 @@
 #include "delegation.h"
 #include "internal.h"
 #include "nfs4_fs.h"
+#include "dns_resolve.h"
 #include "pnfs.h"
 #include "nfs.h"
 
@@ -28,7 +29,7 @@ static struct file_system_type nfs4_remote_fs_type = {
 	.name		= "nfs4",
 	.mount		= nfs4_remote_mount,
 	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
 };
 
 static struct file_system_type nfs4_remote_referral_fs_type = {
@@ -36,7 +37,7 @@ static struct file_system_type nfs4_remote_referral_fs_type = {
 	.name		= "nfs4",
 	.mount		= nfs4_remote_referral_mount,
 	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
 };
 
 struct file_system_type nfs4_referral_fs_type = {
@@ -44,7 +45,7 @@ struct file_system_type nfs4_referral_fs_type = {
 	.name		= "nfs4",
 	.mount		= nfs4_referral_mount,
 	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
 };
 
 static const struct super_operations nfs4_sops = {
@@ -252,6 +253,8 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name,
 
 	dfprintk(MOUNT, "--> nfs4_try_mount()\n");
 
+	if (data->auth_flavors[0] == RPC_AUTH_MAXFLAVOR)
+		data->auth_flavors[0] = RPC_AUTH_UNIX;
 	export_path = data->nfs_server.export_path;
 	data->nfs_server.export_path = "/";
 	root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, mount_info,
@@ -329,18 +332,24 @@ static int __init init_nfs_v4(void)
 {
 	int err;
 
-	err = nfs_idmap_init();
+	err = nfs_dns_resolver_init();
 	if (err)
 		goto out;
 
-	err = nfs4_register_sysctl();
+	err = nfs_idmap_init();
 	if (err)
 		goto out1;
 
+	err = nfs4_register_sysctl();
+	if (err)
+		goto out2;
+
 	register_nfs_version(&nfs_v4);
 	return 0;
-out1:
+out2:
 	nfs_idmap_quit();
+out1:
+	nfs_dns_resolver_destroy();
 out:
 	return err;
 }
@@ -350,6 +359,7 @@ static void __exit exit_nfs_v4(void)
 	unregister_nfs_version(&nfs_v4);
 	nfs4_unregister_sysctl();
 	nfs_idmap_quit();
+	nfs_dns_resolver_destroy();
 }
 
 MODULE_LICENSE("GPL");
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 26b14392043..0abfb8466e7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -102,12 +102,23 @@ static int nfs4_stat_to_errno(int);
 #define nfs4_path_maxsz		(1 + ((3 + NFS4_MAXPATHLEN) >> 2))
 #define nfs4_owner_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))
 #define nfs4_group_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
+#define	nfs4_label_maxsz	(4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
+#define encode_readdir_space 24
+#define encode_readdir_bitmask_sz 3
+#else
+#define	nfs4_label_maxsz	0
+#define encode_readdir_space 20
+#define encode_readdir_bitmask_sz 2
+#endif
 /* We support only one layout type per file system */
 #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
 /* This is based on getfattr, which uses the most attributes: */
 #define nfs4_fattr_value_maxsz	(1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
 				3 + 3 + 3 + nfs4_owner_maxsz + \
-				nfs4_group_maxsz + decode_mdsthreshold_maxsz))
+				nfs4_group_maxsz + nfs4_label_maxsz + \
+				 decode_mdsthreshold_maxsz))
 #define nfs4_fattr_maxsz	(nfs4_fattr_bitmap_maxsz + \
 				nfs4_fattr_value_maxsz)
 #define decode_getattr_maxsz    (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
@@ -115,6 +126,7 @@ static int nfs4_stat_to_errno(int);
 				 1 + 2 + 1 + \
 				nfs4_owner_maxsz + \
 				nfs4_group_maxsz + \
+				nfs4_label_maxsz + \
 				4 + 4)
 #define encode_savefh_maxsz     (op_encode_hdr_maxsz)
 #define decode_savefh_maxsz     (op_decode_hdr_maxsz)
@@ -192,9 +204,11 @@ static int nfs4_stat_to_errno(int);
 				 encode_stateid_maxsz + 3)
 #define decode_read_maxsz	(op_decode_hdr_maxsz + 2)
 #define encode_readdir_maxsz	(op_encode_hdr_maxsz + \
-				 2 + encode_verifier_maxsz + 5)
+				 2 + encode_verifier_maxsz + 5 + \
+				nfs4_label_maxsz)
 #define decode_readdir_maxsz	(op_decode_hdr_maxsz + \
-				 decode_verifier_maxsz)
+				 decode_verifier_maxsz + \
+				nfs4_label_maxsz + nfs4_fattr_maxsz)
 #define encode_readlink_maxsz	(op_encode_hdr_maxsz)
 #define decode_readlink_maxsz	(op_decode_hdr_maxsz + 1)
 #define encode_write_maxsz	(op_encode_hdr_maxsz + \
@@ -530,14 +544,10 @@ static int nfs4_stat_to_errno(int);
 				decode_setclientid_maxsz)
 #define NFS4_enc_setclientid_confirm_sz \
 				(compound_encode_hdr_maxsz + \
-				encode_setclientid_confirm_maxsz + \
-				encode_putrootfh_maxsz + \
-				encode_fsinfo_maxsz)
+				encode_setclientid_confirm_maxsz)
 #define NFS4_dec_setclientid_confirm_sz \
 				(compound_decode_hdr_maxsz + \
-				decode_setclientid_confirm_maxsz + \
-				decode_putrootfh_maxsz + \
-				decode_fsinfo_maxsz)
+				decode_setclientid_confirm_maxsz)
 #define NFS4_enc_lock_sz        (compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz + \
 				encode_putfh_maxsz + \
@@ -857,6 +867,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
 				     decode_sequence_maxsz +
 				     decode_putfh_maxsz) *
 				    XDR_UNIT);
+
+const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH +
+					   compound_decode_hdr_maxsz +
+					   decode_sequence_maxsz) *
+					  XDR_UNIT);
+EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead);
 #endif /* CONFIG_NFS_V4_1 */
 
 static const umode_t nfs_type2fmt[] = {
@@ -972,7 +988,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
 	encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
 }
 
-static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
+static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
+				const struct nfs4_label *label,
+				const struct nfs_server *server)
 {
 	char owner_name[IDMAP_NAMESZ];
 	char owner_group[IDMAP_NAMESZ];
@@ -983,15 +1001,16 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	int len;
 	uint32_t bmval0 = 0;
 	uint32_t bmval1 = 0;
+	uint32_t bmval2 = 0;
 
 	/*
 	 * We reserve enough space to write the entire attribute buffer at once.
 	 * In the worst-case, this would be
-	 *   12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
-	 *          = 36 bytes, plus any contribution from variable-length fields
+	 * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
+	 * = 40 bytes, plus any contribution from variable-length fields
 	 *            such as owner/group.
 	 */
-	len = 16;
+	len = 20;
 
 	/* Sigh */
 	if (iap->ia_valid & ATTR_SIZE)
@@ -1002,7 +1021,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 		owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
 		if (owner_namelen < 0) {
 			dprintk("nfs: couldn't resolve uid %d to string\n",
-					iap->ia_uid);
+					from_kuid(&init_user_ns, iap->ia_uid));
 			/* XXX */
 			strcpy(owner_name, "nobody");
 			owner_namelen = sizeof("nobody") - 1;
@@ -1014,13 +1033,15 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 		owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
 		if (owner_grouplen < 0) {
 			dprintk("nfs: couldn't resolve gid %d to string\n",
-					iap->ia_gid);
+					from_kgid(&init_user_ns, iap->ia_gid));
 			strcpy(owner_group, "nobody");
 			owner_grouplen = sizeof("nobody") - 1;
 			/* goto out; */
 		}
 		len += 4 + (XDR_QUADLEN(owner_grouplen) << 2);
 	}
+	if (label)
+		len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
 	if (iap->ia_valid & ATTR_ATIME_SET)
 		len += 16;
 	else if (iap->ia_valid & ATTR_ATIME)
@@ -1035,9 +1056,9 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	 * We write the bitmap length now, but leave the bitmap and the attribute
 	 * buffer length to be backfilled at the end of this routine.
 	 */
-	*p++ = cpu_to_be32(2);
+	*p++ = cpu_to_be32(3);
 	q = p;
-	p += 3;
+	p += 4;
 
 	if (iap->ia_valid & ATTR_SIZE) {
 		bmval0 |= FATTR4_WORD0_SIZE;
@@ -1058,8 +1079,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	if (iap->ia_valid & ATTR_ATIME_SET) {
 		bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
 		*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
-		*p++ = cpu_to_be32(0);
-		*p++ = cpu_to_be32(iap->ia_atime.tv_sec);
+		p = xdr_encode_hyper(p, (s64)iap->ia_atime.tv_sec);
 		*p++ = cpu_to_be32(iap->ia_atime.tv_nsec);
 	}
 	else if (iap->ia_valid & ATTR_ATIME) {
@@ -1069,14 +1089,20 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	if (iap->ia_valid & ATTR_MTIME_SET) {
 		bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
 		*p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
-		*p++ = cpu_to_be32(0);
-		*p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
+		p = xdr_encode_hyper(p, (s64)iap->ia_mtime.tv_sec);
 		*p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
 	}
 	else if (iap->ia_valid & ATTR_MTIME) {
 		bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
 		*p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
 	}
+	if (label) {
+		bmval2 |= FATTR4_WORD2_SECURITY_LABEL;
+		*p++ = cpu_to_be32(label->lfs);
+		*p++ = cpu_to_be32(label->pi);
+		*p++ = cpu_to_be32(label->len);
+		p = xdr_encode_opaque_fixed(p, label->label, label->len);
+	}
 
 	/*
 	 * Now we backfill the bitmap and the attribute buffer length.
@@ -1086,9 +1112,10 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 				len, ((char *)p - (char *)q) + 4);
 		BUG();
 	}
-	len = (char *)p - (char *)q - 12;
+	len = (char *)p - (char *)q - 16;
 	*q++ = htonl(bmval0);
 	*q++ = htonl(bmval1);
+	*q++ = htonl(bmval2);
 	*q = htonl(len);
 
 /* out: */
@@ -1142,7 +1169,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
 	}
 
 	encode_string(xdr, create->name->len, create->name->name);
-	encode_attrs(xdr, create->attrs, create->server);
+	encode_attrs(xdr, create->attrs, create->label, create->server);
 }
 
 static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1194,8 +1221,10 @@ encode_getattr_three(struct xdr_stream *xdr,
 
 static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
 {
-	encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
-			   bitmask[1] & nfs4_fattr_bitmap[1], hdr);
+	encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
+			   bitmask[1] & nfs4_fattr_bitmap[1],
+			   bitmask[2] & nfs4_fattr_bitmap[2],
+			   hdr);
 }
 
 static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
@@ -1366,33 +1395,28 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
 {
+	struct iattr dummy;
 	__be32 *p;
-	struct nfs_client *clp;
 
 	p = reserve_space(xdr, 4);
-	switch(arg->open_flags & O_EXCL) {
-	case 0:
+	switch(arg->createmode) {
+	case NFS4_CREATE_UNCHECKED:
 		*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
-		encode_attrs(xdr, arg->u.attrs, arg->server);
+		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
 		break;
-	default:
-		clp = arg->server->nfs_client;
-		if (clp->cl_mvops->minor_version > 0) {
-			if (nfs4_has_persistent_session(clp)) {
-				*p = cpu_to_be32(NFS4_CREATE_GUARDED);
-				encode_attrs(xdr, arg->u.attrs, arg->server);
-			} else {
-				struct iattr dummy;
-
-				*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
-				encode_nfs4_verifier(xdr, &arg->u.verifier);
-				dummy.ia_valid = 0;
-				encode_attrs(xdr, &dummy, arg->server);
-			}
-		} else {
-			*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
-			encode_nfs4_verifier(xdr, &arg->u.verifier);
-		}
+	case NFS4_CREATE_GUARDED:
+		*p = cpu_to_be32(NFS4_CREATE_GUARDED);
+		encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+		break;
+	case NFS4_CREATE_EXCLUSIVE:
+		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
+		encode_nfs4_verifier(xdr, &arg->u.verifier);
+		break;
+	case NFS4_CREATE_EXCLUSIVE4_1:
+		*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
+		encode_nfs4_verifier(xdr, &arg->u.verifier);
+		dummy.ia_valid = 0;
+		encode_attrs(xdr, &dummy, arg->label, arg->server);
 	}
 }
 
@@ -1459,6 +1483,23 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
 	encode_string(xdr, name->len, name->name);
 }
 
+static inline void encode_claim_fh(struct xdr_stream *xdr)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(NFS4_OPEN_CLAIM_FH);
+}
+
+static inline void encode_claim_delegate_cur_fh(struct xdr_stream *xdr, const nfs4_stateid *stateid)
+{
+	__be32 *p;
+
+	p = reserve_space(xdr, 4);
+	*p = cpu_to_be32(NFS4_OPEN_CLAIM_DELEG_CUR_FH);
+	encode_nfs4_stateid(xdr, stateid);
+}
+
 static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)
 {
 	encode_op_hdr(xdr, OP_OPEN, decode_open_maxsz, hdr);
@@ -1474,6 +1515,12 @@ static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg,
 	case NFS4_OPEN_CLAIM_DELEGATE_CUR:
 		encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation);
 		break;
+	case NFS4_OPEN_CLAIM_FH:
+		encode_claim_fh(xdr);
+		break;
+	case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+		encode_claim_delegate_cur_fh(xdr, &arg->u.delegation);
+		break;
 	default:
 		BUG();
 	}
@@ -1506,35 +1553,12 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr);
 }
 
-static void encode_open_stateid(struct xdr_stream *xdr,
-		const struct nfs_open_context *ctx,
-		const struct nfs_lock_context *l_ctx,
-		fmode_t fmode,
-		int zero_seqid)
-{
-	nfs4_stateid stateid;
-
-	if (ctx->state != NULL) {
-		const struct nfs_lockowner *lockowner = NULL;
-
-		if (l_ctx != NULL)
-			lockowner = &l_ctx->lockowner;
-		nfs4_select_rw_stateid(&stateid, ctx->state,
-				fmode, lockowner);
-		if (zero_seqid)
-			stateid.seqid = 0;
-		encode_nfs4_stateid(xdr, &stateid);
-	} else
-		encode_nfs4_stateid(xdr, &zero_stateid);
-}
-
 static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
 {
 	__be32 *p;
 
 	encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr);
-	encode_open_stateid(xdr, args->context, args->lock_context,
-			FMODE_READ, hdr->minorversion);
+	encode_nfs4_stateid(xdr, &args->stateid);
 
 	p = reserve_space(xdr, 12);
 	p = xdr_encode_hyper(p, args->offset);
@@ -1543,7 +1567,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 
 static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
 {
-	uint32_t attrs[2] = {
+	uint32_t attrs[3] = {
 		FATTR4_WORD0_RDATTR_ERROR,
 		FATTR4_WORD1_MOUNTED_ON_FILEID,
 	};
@@ -1566,20 +1590,26 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
 	encode_uint64(xdr, readdir->cookie);
 	encode_nfs4_verifier(xdr, &readdir->verifier);
-	p = reserve_space(xdr, 20);
+	p = reserve_space(xdr, encode_readdir_space);
 	*p++ = cpu_to_be32(dircount);
 	*p++ = cpu_to_be32(readdir->count);
-	*p++ = cpu_to_be32(2);
-
+	*p++ = cpu_to_be32(encode_readdir_bitmask_sz);
 	*p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
-	*p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+	*p   = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+	if (encode_readdir_bitmask_sz > 2) {
+		if (hdr->minorversion > 1)
+			attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
+		p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]);
+	}
 	memcpy(verf, readdir->verifier.data, sizeof(verf));
-	dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
+
+	dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
 			__func__,
 			(unsigned long long)readdir->cookie,
 			verf[0], verf[1],
 			attrs[0] & readdir->bitmask[0],
-			attrs[1] & readdir->bitmask[1]);
+			attrs[1] & readdir->bitmask[1],
+			attrs[2] & readdir->bitmask[2]);
 }
 
 static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
@@ -1638,7 +1668,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 {
 	encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
 	encode_nfs4_stateid(xdr, &arg->stateid);
-	encode_attrs(xdr, arg->iap, server);
+	encode_attrs(xdr, arg->iap, arg->label, server);
 }
 
 static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -1670,8 +1700,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	__be32 *p;
 
 	encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr);
-	encode_open_stateid(xdr, args->context, args->lock_context,
-			FMODE_WRITE, hdr->minorversion);
+	encode_nfs4_stateid(xdr, &args->stateid);
 
 	p = reserve_space(xdr, 16);
 	p = xdr_encode_hyper(p, args->offset);
@@ -1901,7 +1930,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
 	p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
 				    NFS4_DEVICEID4_SIZE);
 	*p++ = cpu_to_be32(args->pdev->layout_type);
-	*p++ = cpu_to_be32(args->pdev->pglen);		/* gdia_maxcount */
+	*p++ = cpu_to_be32(args->pdev->maxcount);	/* gdia_maxcount */
 	*p++ = cpu_to_be32(0);				/* bitmap length 0 */
 }
 
@@ -2015,7 +2044,7 @@ static void encode_free_stateid(struct xdr_stream *xdr,
 				struct compound_hdr *hdr)
 {
 	encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr);
-	encode_nfs4_stateid(xdr, args->stateid);
+	encode_nfs4_stateid(xdr, &args->stateid);
 }
 #endif /* CONFIG_NFS_V4_1 */
 
@@ -2609,12 +2638,9 @@ static void nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req,
 	struct compound_hdr hdr = {
 		.nops	= 0,
 	};
-	const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME };
 
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_setclientid_confirm(xdr, arg, &hdr);
-	encode_putrootfh(xdr, &hdr);
-	encode_fsinfo(xdr, lease_bitmap, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -3497,8 +3523,11 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 	if (n == 0)
 		goto root_path;
 	dprintk("pathname4: ");
-	path->ncomponents = 0;
-	while (path->ncomponents < n) {
+	if (n > NFS4_PATHNAME_MAXCOMPONENTS) {
+		dprintk("cannot parse %d components in path\n", n);
+		goto out_eio;
+	}
+	for (path->ncomponents = 0; path->ncomponents < n; path->ncomponents++) {
 		struct nfs4_string *component = &path->components[path->ncomponents];
 		status = decode_opaque_inline(xdr, &component->len, &component->data);
 		if (unlikely(status != 0))
@@ -3507,12 +3536,6 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 			pr_cont("%s%.*s ",
 				(path->ncomponents != n ? "/ " : ""),
 				component->len, component->data);
-		if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS)
-			path->ncomponents++;
-		else {
-			dprintk("cannot parse %d components in path\n", n);
-			goto out_eio;
-		}
 	}
 out:
 	return status;
@@ -3557,27 +3580,23 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 	n = be32_to_cpup(p);
 	if (n <= 0)
 		goto out_eio;
-	res->nlocations = 0;
-	while (res->nlocations < n) {
+	for (res->nlocations = 0; res->nlocations < n; res->nlocations++) {
 		u32 m;
-		struct nfs4_fs_location *loc = &res->locations[res->nlocations];
+		struct nfs4_fs_location *loc;
 
+		if (res->nlocations == NFS4_FS_LOCATIONS_MAXENTRIES)
+			break;
+		loc = &res->locations[res->nlocations];
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
 		m = be32_to_cpup(p);
 
-		loc->nservers = 0;
 		dprintk("%s: servers:\n", __func__);
-		while (loc->nservers < m) {
-			struct nfs4_string *server = &loc->servers[loc->nservers];
-			status = decode_opaque_inline(xdr, &server->len, &server->data);
-			if (unlikely(status != 0))
-				goto out_eio;
-			dprintk("%s ", server->data);
-			if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS)
-				loc->nservers++;
-			else {
+		for (loc->nservers = 0; loc->nservers < m; loc->nservers++) {
+			struct nfs4_string *server;
+
+			if (loc->nservers == NFS4_FS_LOCATION_MAXSERVERS) {
 				unsigned int i;
 				dprintk("%s: using first %u of %u servers "
 					"returned for location %u\n",
@@ -3591,13 +3610,17 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 					if (unlikely(status != 0))
 						goto out_eio;
 				}
+				break;
 			}
+			server = &loc->servers[loc->nservers];
+			status = decode_opaque_inline(xdr, &server->len, &server->data);
+			if (unlikely(status != 0))
+				goto out_eio;
+			dprintk("%s ", server->data);
 		}
 		status = decode_pathname(xdr, &loc->rootpath);
 		if (unlikely(status != 0))
 			goto out_eio;
-		if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
-			res->nlocations++;
 	}
 	if (res->nlocations != 0)
 		status = NFS_ATTR_FATTR_V4_LOCATIONS;
@@ -3778,14 +3801,14 @@ out_overflow:
 }
 
 static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
-		const struct nfs_server *server, uint32_t *uid,
+		const struct nfs_server *server, kuid_t *uid,
 		struct nfs4_string *owner_name)
 {
 	uint32_t len;
 	__be32 *p;
 	int ret = 0;
 
-	*uid = -2;
+	*uid = make_kuid(&init_user_ns, -2);
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) {
@@ -3813,7 +3836,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
 					__func__, len);
 		bitmap[1] &= ~FATTR4_WORD1_OWNER;
 	}
-	dprintk("%s: uid=%d\n", __func__, (int)*uid);
+	dprintk("%s: uid=%d\n", __func__, (int)from_kuid(&init_user_ns, *uid));
 	return ret;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -3821,14 +3844,14 @@ out_overflow:
 }
 
 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
-		const struct nfs_server *server, uint32_t *gid,
+		const struct nfs_server *server, kgid_t *gid,
 		struct nfs4_string *group_name)
 {
 	uint32_t len;
 	__be32 *p;
 	int ret = 0;
 
-	*gid = -2;
+	*gid = make_kgid(&init_user_ns, -2);
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
 		return -EIO;
 	if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) {
@@ -3856,7 +3879,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
 					__func__, len);
 		bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
 	}
-	dprintk("%s: gid=%d\n", __func__, (int)*gid);
+	dprintk("%s: gid=%d\n", __func__, (int)from_kgid(&init_user_ns, *gid));
 	return ret;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -4056,6 +4079,56 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
 	return status;
 }
 
+static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
+					struct nfs4_label *label)
+{
+	uint32_t pi = 0;
+	uint32_t lfs = 0;
+	__u32 len;
+	__be32 *p;
+	int status = 0;
+
+	if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U)))
+		return -EIO;
+	if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		lfs = be32_to_cpup(p++);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		pi = be32_to_cpup(p++);
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		len = be32_to_cpup(p++);
+		p = xdr_inline_decode(xdr, len);
+		if (unlikely(!p))
+			goto out_overflow;
+		if (len < NFS4_MAXLABELLEN) {
+			if (label) {
+				memcpy(label->label, p, len);
+				label->len = len;
+				label->pi = pi;
+				label->lfs = lfs;
+				status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
+			}
+			bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+		} else
+			printk(KERN_WARNING "%s: label too long (%u)!\n",
+					__func__, len);
+	}
+	if (label && label->label)
+		dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
+			(char *)label->label, label->len, label->pi, label->lfs);
+	return status;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
 static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
 {
 	int status = 0;
@@ -4398,7 +4471,7 @@ out_overflow:
 
 static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 		struct nfs_fattr *fattr, struct nfs_fh *fh,
-		struct nfs4_fs_locations *fs_loc,
+		struct nfs4_fs_locations *fs_loc, struct nfs4_label *label,
 		const struct nfs_server *server)
 {
 	int status;
@@ -4506,6 +4579,13 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 	if (status < 0)
 		goto xdr_error;
 
+	if (label) {
+		status = decode_attr_security_label(xdr, bitmap, label);
+		if (status < 0)
+			goto xdr_error;
+		fattr->valid |= status;
+	}
+
 xdr_error:
 	dprintk("%s: xdr returned %d\n", __func__, -status);
 	return status;
@@ -4513,7 +4593,7 @@ xdr_error:
 
 static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
 		struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
-		const struct nfs_server *server)
+		struct nfs4_label *label, const struct nfs_server *server)
 {
 	unsigned int savep;
 	uint32_t attrlen,
@@ -4532,7 +4612,8 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
 	if (status < 0)
 		goto xdr_error;
 
-	status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server);
+	status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc,
+					label, server);
 	if (status < 0)
 		goto xdr_error;
 
@@ -4542,10 +4623,16 @@ xdr_error:
 	return status;
 }
 
+static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+		struct nfs4_label *label, const struct nfs_server *server)
+{
+	return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server);
+}
+
 static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
 		const struct nfs_server *server)
 {
-	return decode_getfattr_generic(xdr, fattr, NULL, NULL, server);
+	return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server);
 }
 
 /*
@@ -5209,27 +5296,30 @@ static int decode_delegreturn(struct xdr_stream *xdr)
 	return decode_op_hdr(xdr, OP_DELEGRETURN);
 }
 
-static int decode_secinfo_gss(struct xdr_stream *xdr, struct nfs4_secinfo_flavor *flavor)
+static int decode_secinfo_gss(struct xdr_stream *xdr,
+			      struct nfs4_secinfo4 *flavor)
 {
+	u32 oid_len;
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	flavor->gss.sec_oid4.len = be32_to_cpup(p);
-	if (flavor->gss.sec_oid4.len > GSS_OID_MAX_LEN)
+	oid_len = be32_to_cpup(p);
+	if (oid_len > GSS_OID_MAX_LEN)
 		goto out_err;
 
-	p = xdr_inline_decode(xdr, flavor->gss.sec_oid4.len);
+	p = xdr_inline_decode(xdr, oid_len);
 	if (unlikely(!p))
 		goto out_overflow;
-	memcpy(flavor->gss.sec_oid4.data, p, flavor->gss.sec_oid4.len);
+	memcpy(flavor->flavor_info.oid.data, p, oid_len);
+	flavor->flavor_info.oid.len = oid_len;
 
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
 		goto out_overflow;
-	flavor->gss.qop4 = be32_to_cpup(p++);
-	flavor->gss.service = be32_to_cpup(p);
+	flavor->flavor_info.qop = be32_to_cpup(p++);
+	flavor->flavor_info.service = be32_to_cpup(p);
 
 	return 0;
 
@@ -5242,10 +5332,10 @@ out_err:
 
 static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
 {
-	struct nfs4_secinfo_flavor *sec_flavor;
+	struct nfs4_secinfo4 *sec_flavor;
+	unsigned int i, num_flavors;
 	int status;
 	__be32 *p;
-	int i, num_flavors;
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
@@ -5934,7 +6024,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_getfh(xdr, res->fh);
 	if (status)
 		goto out;
-	status = decode_getfattr(xdr, res->fattr, res->server);
+	status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
 	return status;
 }
@@ -5960,7 +6050,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
 		goto out;
 	status = decode_getfh(xdr, res->fh);
 	if (status == 0)
-		status = decode_getfattr(xdr, res->fattr, res->server);
+		status = decode_getfattr_label(xdr, res->fattr,
+						res->label, res->server);
 out:
 	return status;
 }
@@ -6051,7 +6142,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_restorefh(xdr);
 	if (status)
 		goto out;
-	decode_getfattr(xdr, res->fattr, res->server);
+	decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
 	return status;
 }
@@ -6080,7 +6171,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_getfh(xdr, res->fh);
 	if (status)
 		goto out;
-	decode_getfattr(xdr, res->fattr, res->server);
+	decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
 	return status;
 }
@@ -6112,7 +6203,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_getfattr(xdr, res->fattr, res->server);
+	status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
 	return status;
 }
@@ -6245,7 +6336,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 		goto out;
 	if (res->access_request)
 		decode_access(xdr, &res->access_supported, &res->access_result);
-	decode_getfattr(xdr, res->f_attr, res->server);
+	decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server);
 out:
 	return status;
 }
@@ -6322,7 +6413,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
 	status = decode_setattr(xdr);
 	if (status)
 		goto out;
-	decode_getfattr(xdr, res->fattr, res->server);
+	decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
 	return status;
 }
@@ -6648,8 +6739,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req,
  * Decode SETCLIENTID_CONFIRM response
  */
 static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
-					    struct xdr_stream *xdr,
-					    struct nfs_fsinfo *fsinfo)
+					    struct xdr_stream *xdr)
 {
 	struct compound_hdr hdr;
 	int status;
@@ -6657,10 +6747,6 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req,
 	status = decode_compound_hdr(xdr, &hdr);
 	if (!status)
 		status = decode_setclientid_confirm(xdr);
-	if (!status)
-		status = decode_putrootfh(xdr);
-	if (!status)
-		status = decode_fsinfo(xdr, fsinfo);
 	return status;
 }
 
@@ -6716,7 +6802,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
 	xdr_enter_page(xdr, PAGE_SIZE);
 	status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
 					 NULL, res->fs_locations,
-					 res->fs_locations->server);
+					 NULL, res->fs_locations->server);
 out:
 	return status;
 }
@@ -7129,7 +7215,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 		goto out_overflow;
 
 	if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
-				  NULL, entry->server) < 0)
+			NULL, entry->label, entry->server) < 0)
 		goto out_overflow;
 	if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
 		entry->ino = entry->fattr->mounted_on_fileid;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index c6f990656f8..5457745dd4f 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -234,7 +234,7 @@ static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags,
 
 	lseg = kzalloc(lseg_size, gfp_flags);
 	if (unlikely(!lseg)) {
-		dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__,
+		dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__,
 			numdevs, lseg_size);
 		return -ENOMEM;
 	}
@@ -647,6 +647,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
 	.flags                   = PNFS_LAYOUTRET_ON_SETATTR |
 				   PNFS_LAYOUTRET_ON_ERROR,
 
+	.owner		       	 = THIS_MODULE,
 	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
 	.free_layout_hdr         = objlayout_free_layout_hdr,
 
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index a9ebd817278..e4f9cbfec67 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -613,8 +613,10 @@ int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
 	pd.pgbase = 0;
 	pd.pglen = PAGE_SIZE;
 	pd.mincount = 0;
+	pd.maxcount = PAGE_SIZE;
 
-	err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
+	err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
+			pnfslay->plh_lc_cred);
 	dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
 	if (err)
 		goto err_out;
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 880ba086be9..87aa1dec612 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -114,7 +114,7 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
 	gfp_t gfp_flags);
 extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
 
-/* objio_free_result will free these @oir structs recieved from
+/* objio_free_result will free these @oir structs received from
  * objlayout_{read,write}_done
  */
 extern void objio_free_result(struct objlayout_io_res *oir);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e56e846e9d2..29cfb7ade12 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -84,6 +84,55 @@ nfs_page_free(struct nfs_page *p)
 	kmem_cache_free(nfs_page_cachep, p);
 }
 
+static void
+nfs_iocounter_inc(struct nfs_io_counter *c)
+{
+	atomic_inc(&c->io_count);
+}
+
+static void
+nfs_iocounter_dec(struct nfs_io_counter *c)
+{
+	if (atomic_dec_and_test(&c->io_count)) {
+		clear_bit(NFS_IO_INPROGRESS, &c->flags);
+		smp_mb__after_clear_bit();
+		wake_up_bit(&c->flags, NFS_IO_INPROGRESS);
+	}
+}
+
+static int
+__nfs_iocounter_wait(struct nfs_io_counter *c)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS);
+	DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS);
+	int ret = 0;
+
+	do {
+		prepare_to_wait(wq, &q.wait, TASK_KILLABLE);
+		set_bit(NFS_IO_INPROGRESS, &c->flags);
+		if (atomic_read(&c->io_count) == 0)
+			break;
+		ret = nfs_wait_bit_killable(&c->flags);
+	} while (atomic_read(&c->io_count) != 0);
+	finish_wait(wq, &q.wait);
+	return ret;
+}
+
+/**
+ * nfs_iocounter_wait - wait for i/o to complete
+ * @c: nfs_io_counter to use
+ *
+ * returns -ERESTARTSYS if interrupted by a fatal signal.
+ * Otherwise returns 0 once the io_count hits 0.
+ */
+int
+nfs_iocounter_wait(struct nfs_io_counter *c)
+{
+	if (atomic_read(&c->io_count) == 0)
+		return 0;
+	return __nfs_iocounter_wait(c);
+}
+
 /**
  * nfs_create_request - Create an NFS read/write request.
  * @ctx: open context to use
@@ -104,6 +153,8 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 	struct nfs_page		*req;
 	struct nfs_lock_context *l_ctx;
 
+	if (test_bit(NFS_CONTEXT_BAD, &ctx->flags))
+		return ERR_PTR(-EBADF);
 	/* try to allocate the request struct */
 	req = nfs_page_alloc();
 	if (req == NULL)
@@ -116,6 +167,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 		return ERR_CAST(l_ctx);
 	}
 	req->wb_lock_context = l_ctx;
+	nfs_iocounter_inc(&l_ctx->io_count);
 
 	/* Initialize the request struct. Initially, we assume a
 	 * long write-back delay. This will be adjusted in
@@ -175,6 +227,7 @@ static void nfs_clear_request(struct nfs_page *req)
 		req->wb_page = NULL;
 	}
 	if (l_ctx != NULL) {
+		nfs_iocounter_dec(&l_ctx->io_count);
 		nfs_put_lock_context(l_ctx);
 		req->wb_lock_context = NULL;
 	}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index d00260b0810..3a3a79d6bf1 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -360,7 +360,7 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 }
 EXPORT_SYMBOL_GPL(pnfs_put_lseg);
 
-static inline u64
+static u64
 end_offset(u64 start, u64 len)
 {
 	u64 end;
@@ -376,9 +376,9 @@ end_offset(u64 start, u64 len)
  *           start2           end2
  *           [----------------)
  */
-static inline int
-lo_seg_contained(struct pnfs_layout_range *l1,
-		 struct pnfs_layout_range *l2)
+static bool
+pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
+		 const struct pnfs_layout_range *l2)
 {
 	u64 start1 = l1->offset;
 	u64 end1 = end_offset(start1, l1->length);
@@ -395,9 +395,9 @@ lo_seg_contained(struct pnfs_layout_range *l1,
  *                              start2           end2
  *                              [----------------)
  */
-static inline int
-lo_seg_intersecting(struct pnfs_layout_range *l1,
-		    struct pnfs_layout_range *l2)
+static bool
+pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
+		    const struct pnfs_layout_range *l2)
 {
 	u64 start1 = l1->offset;
 	u64 end1 = end_offset(start1, l1->length);
@@ -409,12 +409,22 @@ lo_seg_intersecting(struct pnfs_layout_range *l1,
 }
 
 static bool
-should_free_lseg(struct pnfs_layout_range *lseg_range,
-		 struct pnfs_layout_range *recall_range)
+should_free_lseg(const struct pnfs_layout_range *lseg_range,
+		 const struct pnfs_layout_range *recall_range)
 {
 	return (recall_range->iomode == IOMODE_ANY ||
 		lseg_range->iomode == recall_range->iomode) &&
-	       lo_seg_intersecting(lseg_range, recall_range);
+	       pnfs_lseg_range_intersecting(lseg_range, recall_range);
+}
+
+static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
+		struct list_head *tmp_list)
+{
+	if (!atomic_dec_and_test(&lseg->pls_refcount))
+		return false;
+	pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
+	list_add(&lseg->pls_list, tmp_list);
+	return true;
 }
 
 /* Returns 1 if lseg is removed from list, 0 otherwise */
@@ -430,11 +440,8 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
 		 */
 		dprintk("%s: lseg %p ref %d\n", __func__, lseg,
 			atomic_read(&lseg->pls_refcount));
-		if (atomic_dec_and_test(&lseg->pls_refcount)) {
-			pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
-			list_add(&lseg->pls_list, tmp_list);
+		if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
 			rv = 1;
-		}
 	}
 	return rv;
 }
@@ -505,37 +512,147 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
 }
 EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
 
-/*
- * Called by the state manger to remove all layouts established under an
- * expired lease.
- */
-void
-pnfs_destroy_all_layouts(struct nfs_client *clp)
+static bool
+pnfs_layout_add_bulk_destroy_list(struct inode *inode,
+		struct list_head *layout_list)
 {
-	struct nfs_server *server;
 	struct pnfs_layout_hdr *lo;
-	LIST_HEAD(tmp_list);
+	bool ret = false;
 
-	nfs4_deviceid_mark_client_invalid(clp);
-	nfs4_deviceid_purge_client(clp);
+	spin_lock(&inode->i_lock);
+	lo = NFS_I(inode)->layout;
+	if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
+		pnfs_get_layout_hdr(lo);
+		list_add(&lo->plh_bulk_destroy, layout_list);
+		ret = true;
+	}
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+/* Caller must hold rcu_read_lock and clp->cl_lock */
+static int
+pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
+		struct nfs_server *server,
+		struct list_head *layout_list)
+{
+	struct pnfs_layout_hdr *lo, *next;
+	struct inode *inode;
+
+	list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
+		inode = igrab(lo->plh_inode);
+		if (inode == NULL)
+			continue;
+		list_del_init(&lo->plh_layouts);
+		if (pnfs_layout_add_bulk_destroy_list(inode, layout_list))
+			continue;
+		rcu_read_unlock();
+		spin_unlock(&clp->cl_lock);
+		iput(inode);
+		spin_lock(&clp->cl_lock);
+		rcu_read_lock();
+		return -EAGAIN;
+	}
+	return 0;
+}
+
+static int
+pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
+		bool is_bulk_recall)
+{
+	struct pnfs_layout_hdr *lo;
+	struct inode *inode;
+	struct pnfs_layout_range range = {
+		.iomode = IOMODE_ANY,
+		.offset = 0,
+		.length = NFS4_MAX_UINT64,
+	};
+	LIST_HEAD(lseg_list);
+	int ret = 0;
+
+	while (!list_empty(layout_list)) {
+		lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
+				plh_bulk_destroy);
+		dprintk("%s freeing layout for inode %lu\n", __func__,
+			lo->plh_inode->i_ino);
+		inode = lo->plh_inode;
+		spin_lock(&inode->i_lock);
+		list_del_init(&lo->plh_bulk_destroy);
+		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
+		if (is_bulk_recall)
+			set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
+		if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range))
+			ret = -EAGAIN;
+		spin_unlock(&inode->i_lock);
+		pnfs_free_lseg_list(&lseg_list);
+		pnfs_put_layout_hdr(lo);
+		iput(inode);
+	}
+	return ret;
+}
+
+int
+pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
+		struct nfs_fsid *fsid,
+		bool is_recall)
+{
+	struct nfs_server *server;
+	LIST_HEAD(layout_list);
 
 	spin_lock(&clp->cl_lock);
 	rcu_read_lock();
+restart:
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
-		if (!list_empty(&server->layouts))
-			list_splice_init(&server->layouts, &tmp_list);
+		if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
+			continue;
+		if (pnfs_layout_bulk_destroy_byserver_locked(clp,
+				server,
+				&layout_list) != 0)
+			goto restart;
 	}
 	rcu_read_unlock();
 	spin_unlock(&clp->cl_lock);
 
-	while (!list_empty(&tmp_list)) {
-		lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
-				plh_layouts);
-		dprintk("%s freeing layout for inode %lu\n", __func__,
-			lo->plh_inode->i_ino);
-		list_del_init(&lo->plh_layouts);
-		pnfs_destroy_layout(NFS_I(lo->plh_inode));
+	if (list_empty(&layout_list))
+		return 0;
+	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
+}
+
+int
+pnfs_destroy_layouts_byclid(struct nfs_client *clp,
+		bool is_recall)
+{
+	struct nfs_server *server;
+	LIST_HEAD(layout_list);
+
+	spin_lock(&clp->cl_lock);
+	rcu_read_lock();
+restart:
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		if (pnfs_layout_bulk_destroy_byserver_locked(clp,
+					server,
+					&layout_list) != 0)
+			goto restart;
 	}
+	rcu_read_unlock();
+	spin_unlock(&clp->cl_lock);
+
+	if (list_empty(&layout_list))
+		return 0;
+	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
+}
+
+/*
+ * Called by the state manger to remove all layouts established under an
+ * expired lease.
+ */
+void
+pnfs_destroy_all_layouts(struct nfs_client *clp)
+{
+	nfs4_deviceid_mark_client_invalid(clp);
+	nfs4_deviceid_purge_client(clp);
+
+	pnfs_destroy_layouts_byclid(clp, false);
 }
 
 /*
@@ -601,6 +718,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 	spin_lock(&lo->plh_inode->i_lock);
 	if (pnfs_layoutgets_blocked(lo, 1)) {
 		status = -EAGAIN;
+	} else if (!nfs4_valid_open_stateid(open_state)) {
+		status = -EBADF;
 	} else if (list_empty(&lo->plh_segs)) {
 		int seq;
 
@@ -647,6 +766,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	lgp->args.inode = ino;
 	lgp->args.ctx = get_nfs_open_context(ctx);
 	lgp->gfp_flags = gfp_flags;
+	lgp->cred = lo->plh_lc_cred;
 
 	/* Synchronously retrieve layout information from server and
 	 * store in lseg.
@@ -667,6 +787,21 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 	return lseg;
 }
 
+static void pnfs_clear_layoutcommit(struct inode *inode,
+		struct list_head *head)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct pnfs_layout_segment *lseg, *tmp;
+
+	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+		return;
+	list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) {
+		if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
+			continue;
+		pnfs_lseg_dec_and_remove_zero(lseg, head);
+	}
+}
+
 /*
  * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
  * when the layout segment list is empty.
@@ -698,6 +833,7 @@ _pnfs_return_layout(struct inode *ino)
 	/* Reference matched in nfs4_layoutreturn_release */
 	pnfs_get_layout_hdr(lo);
 	empty = list_empty(&lo->plh_segs);
+	pnfs_clear_layoutcommit(ino, &tmp_list);
 	pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
 	/* Don't send a LAYOUTRETURN if list was initially empty */
 	if (empty) {
@@ -710,8 +846,6 @@ _pnfs_return_layout(struct inode *ino)
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&tmp_list);
 
-	WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags));
-
 	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
 	if (unlikely(lrp == NULL)) {
 		status = -ENOMEM;
@@ -727,6 +861,7 @@ _pnfs_return_layout(struct inode *ino)
 	lrp->args.inode = ino;
 	lrp->args.layout = lo;
 	lrp->clp = NFS_SERVER(ino)->nfs_client;
+	lrp->cred = lo->plh_lc_cred;
 
 	status = nfs4_proc_layoutreturn(lrp);
 out:
@@ -735,6 +870,33 @@ out:
 }
 EXPORT_SYMBOL_GPL(_pnfs_return_layout);
 
+int
+pnfs_commit_and_return_layout(struct inode *inode)
+{
+	struct pnfs_layout_hdr *lo;
+	int ret;
+
+	spin_lock(&inode->i_lock);
+	lo = NFS_I(inode)->layout;
+	if (lo == NULL) {
+		spin_unlock(&inode->i_lock);
+		return 0;
+	}
+	pnfs_get_layout_hdr(lo);
+	/* Block new layoutgets and read/write to ds */
+	lo->plh_block_lgets++;
+	spin_unlock(&inode->i_lock);
+	filemap_fdatawait(inode->i_mapping);
+	ret = pnfs_layoutcommit_inode(inode, true);
+	if (ret == 0)
+		ret = _pnfs_return_layout(inode);
+	spin_lock(&inode->i_lock);
+	lo->plh_block_lgets--;
+	spin_unlock(&inode->i_lock);
+	pnfs_put_layout_hdr(lo);
+	return ret;
+}
+
 bool pnfs_roc(struct inode *ino)
 {
 	struct pnfs_layout_hdr *lo;
@@ -824,8 +986,8 @@ out:
  * are seen first.
  */
 static s64
-cmp_layout(struct pnfs_layout_range *l1,
-	   struct pnfs_layout_range *l2)
+pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
+	   const struct pnfs_layout_range *l2)
 {
 	s64 d;
 
@@ -852,7 +1014,7 @@ pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
 	dprintk("%s:Begin\n", __func__);
 
 	list_for_each_entry(lp, &lo->plh_segs, pls_list) {
-		if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
+		if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
 			continue;
 		list_add_tail(&lseg->pls_list, &lp->pls_list);
 		dprintk("%s: inserted lseg %p "
@@ -888,9 +1050,9 @@ alloc_init_layout_hdr(struct inode *ino,
 	atomic_set(&lo->plh_refcount, 1);
 	INIT_LIST_HEAD(&lo->plh_layouts);
 	INIT_LIST_HEAD(&lo->plh_segs);
-	INIT_LIST_HEAD(&lo->plh_bulk_recall);
+	INIT_LIST_HEAD(&lo->plh_bulk_destroy);
 	lo->plh_inode = ino;
-	lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
+	lo->plh_lc_cred = get_rpccred(ctx->cred);
 	return lo;
 }
 
@@ -931,21 +1093,21 @@ out_existing:
  * READ		READ	true
  * READ		RW	true
  */
-static int
-is_matching_lseg(struct pnfs_layout_range *ls_range,
-		 struct pnfs_layout_range *range)
+static bool
+pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
+		 const struct pnfs_layout_range *range)
 {
 	struct pnfs_layout_range range1;
 
 	if ((range->iomode == IOMODE_RW &&
 	     ls_range->iomode != IOMODE_RW) ||
-	    !lo_seg_intersecting(ls_range, range))
+	    !pnfs_lseg_range_intersecting(ls_range, range))
 		return 0;
 
 	/* range1 covers only the first byte in the range */
 	range1 = *range;
 	range1.length = 1;
-	return lo_seg_contained(ls_range, &range1);
+	return pnfs_lseg_range_contained(ls_range, &range1);
 }
 
 /*
@@ -961,7 +1123,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
 
 	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
 		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
-		    is_matching_lseg(&lseg->pls_range, range)) {
+		    pnfs_lseg_range_match(&lseg->pls_range, range)) {
 			ret = pnfs_get_lseg(lseg);
 			break;
 		}
@@ -1071,7 +1233,7 @@ pnfs_update_layout(struct inode *ino,
 	struct nfs_client *clp = server->nfs_client;
 	struct pnfs_layout_hdr *lo;
 	struct pnfs_layout_segment *lseg = NULL;
-	bool first = false;
+	bool first;
 
 	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
 		goto out;
@@ -1105,10 +1267,9 @@ pnfs_update_layout(struct inode *ino,
 		goto out_unlock;
 	atomic_inc(&lo->plh_outstanding);
 
-	if (list_empty(&lo->plh_segs))
-		first = true;
-
+	first = list_empty(&lo->plh_layouts) ? true : false;
 	spin_unlock(&ino->i_lock);
+
 	if (first) {
 		/* The lo must be on the clp list if there is any
 		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
@@ -1312,13 +1473,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
 
 int pnfs_write_done_resend_to_mds(struct inode *inode,
 				struct list_head *head,
-				const struct nfs_pgio_completion_ops *compl_ops)
+				const struct nfs_pgio_completion_ops *compl_ops,
+				struct nfs_direct_req *dreq)
 {
 	struct nfs_pageio_descriptor pgio;
 	LIST_HEAD(failed);
 
 	/* Resend all requests through the MDS */
 	nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops);
+	pgio.pg_dreq = dreq;
 	while (!list_empty(head)) {
 		struct nfs_page *req = nfs_list_entry(head->next);
 
@@ -1347,13 +1510,13 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
 	dprintk("pnfs write error = %d\n", hdr->pnfs_error);
 	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
 	    PNFS_LAYOUTRET_ON_ERROR) {
-		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
 		pnfs_return_layout(hdr->inode);
 	}
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
 		data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode,
 							&hdr->pages,
-							hdr->completion_ops);
+							hdr->completion_ops,
+							hdr->dreq);
 }
 
 /*
@@ -1468,13 +1631,15 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
 
 int pnfs_read_done_resend_to_mds(struct inode *inode,
 				struct list_head *head,
-				const struct nfs_pgio_completion_ops *compl_ops)
+				const struct nfs_pgio_completion_ops *compl_ops,
+				struct nfs_direct_req *dreq)
 {
 	struct nfs_pageio_descriptor pgio;
 	LIST_HEAD(failed);
 
 	/* Resend all requests through the MDS */
 	nfs_pageio_init_read(&pgio, inode, compl_ops);
+	pgio.pg_dreq = dreq;
 	while (!list_empty(head)) {
 		struct nfs_page *req = nfs_list_entry(head->next);
 
@@ -1499,13 +1664,13 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
 	dprintk("pnfs read error = %d\n", hdr->pnfs_error);
 	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
 	    PNFS_LAYOUTRET_ON_ERROR) {
-		clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags);
 		pnfs_return_layout(hdr->inode);
 	}
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
 		data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode,
 							&hdr->pages,
-							hdr->completion_ops);
+							hdr->completion_ops,
+							hdr->dreq);
 }
 
 /*
@@ -1631,11 +1796,27 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
 
 	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
 		if (lseg->pls_range.iomode == IOMODE_RW &&
-		    test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
+		    test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
 			list_add(&lseg->pls_lc_list, listp);
 	}
 }
 
+static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
+{
+	struct pnfs_layout_segment *lseg, *tmp;
+	unsigned long *bitlock = &NFS_I(inode)->flags;
+
+	/* Matched by references in pnfs_set_layoutcommit */
+	list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
+		list_del_init(&lseg->pls_lc_list);
+		pnfs_put_lseg(lseg);
+	}
+
+	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
+	smp_mb__after_clear_bit();
+	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
+}
+
 void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
 {
 	pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
@@ -1680,6 +1861,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
 
 	if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
 		nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
+	pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
 }
 
 /*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index dbf7bba52da..a4f41810a7f 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -132,7 +132,7 @@ struct pnfs_layoutdriver_type {
 struct pnfs_layout_hdr {
 	atomic_t		plh_refcount;
 	struct list_head	plh_layouts;   /* other client layouts */
-	struct list_head	plh_bulk_recall; /* clnt list of bulk recalls */
+	struct list_head	plh_bulk_destroy;
 	struct list_head	plh_segs;      /* layout segments list */
 	nfs4_stateid		plh_stateid;
 	atomic_t		plh_outstanding; /* number of RPCs out */
@@ -149,9 +149,10 @@ struct pnfs_device {
 	struct nfs4_deviceid dev_id;
 	unsigned int  layout_type;
 	unsigned int  mincount;
+	unsigned int  maxcount;	/* gdia_maxcount */
 	struct page **pages;
 	unsigned int  pgbase;
-	unsigned int  pglen;
+	unsigned int  pglen;	/* reply buffer length */
 };
 
 #define NFS4_PNFS_GETDEVLIST_MAXNUM 16
@@ -170,7 +171,8 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
 				   const struct nfs_fh *fh,
 				   struct pnfs_devicelist *devlist);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
-				   struct pnfs_device *dev);
+				   struct pnfs_device *dev,
+				   struct rpc_cred *cred);
 extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
 extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 
@@ -196,6 +198,11 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
 void pnfs_destroy_all_layouts(struct nfs_client *);
+int pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
+		struct nfs_fsid *fsid,
+		bool is_recall);
+int pnfs_destroy_layouts_byclid(struct nfs_client *clp,
+		bool is_recall);
 void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
 void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
 			     const nfs4_stateid *new,
@@ -214,6 +221,7 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 int _pnfs_return_layout(struct inode *);
+int pnfs_commit_and_return_layout(struct inode *);
 void pnfs_ld_write_done(struct nfs_write_data *);
 void pnfs_ld_read_done(struct nfs_read_data *);
 struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
@@ -225,9 +233,11 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
 
 void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
 int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head,
-			const struct nfs_pgio_completion_ops *compl_ops);
+			const struct nfs_pgio_completion_ops *compl_ops,
+			struct nfs_direct_req *dreq);
 int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
-			const struct nfs_pgio_completion_ops *compl_ops);
+			const struct nfs_pgio_completion_ops *compl_ops,
+			struct nfs_direct_req *dreq);
 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
 
 /* nfs4_deviceid_flags */
@@ -400,6 +410,11 @@ static inline int pnfs_return_layout(struct inode *ino)
 	return 0;
 }
 
+static inline int pnfs_commit_and_return_layout(struct inode *inode)
+{
+	return 0;
+}
+
 static inline bool
 pnfs_ld_layoutret_on_setattr(struct inode *inode)
 {
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index d35b62e83ea..6da209bd940 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -77,9 +77,8 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
 		 long hash)
 {
 	struct nfs4_deviceid_node *d;
-	struct hlist_node *n;
 
-	hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
+	hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node)
 		if (d->ld == ld && d->nfs_client == clp &&
 		    !memcmp(&d->deviceid, id, sizeof(*id))) {
 			if (atomic_read(&d->ref))
@@ -248,12 +247,11 @@ static void
 _deviceid_purge_client(const struct nfs_client *clp, long hash)
 {
 	struct nfs4_deviceid_node *d;
-	struct hlist_node *n;
 	HLIST_HEAD(tmp);
 
 	spin_lock(&nfs4_deviceid_lock);
 	rcu_read_lock();
-	hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
+	hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[hash], node)
 		if (d->nfs_client == clp && atomic_read(&d->ref)) {
 			hlist_del_init_rcu(&d->node);
 			hlist_add_head(&d->tmpnode, &tmp);
@@ -291,12 +289,11 @@ void
 nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
 {
 	struct nfs4_deviceid_node *d;
-	struct hlist_node *n;
 	int i;
 
 	rcu_read_lock();
 	for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i ++){
-		hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[i], node)
+		hlist_for_each_entry_rcu(d, &nfs4_deviceid_cache[i], node)
 			if (d->nfs_client == clp)
 				set_bit(NFS_DEVICEID_INVALID, &d->flags);
 	}
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index f084dac948e..c041c41f7a5 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -98,7 +98,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
  */
 static int
 nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
-		struct nfs_fattr *fattr)
+		struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs_procedures[NFSPROC_GETATTR],
@@ -146,7 +146,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 
 static int
 nfs_proc_lookup(struct inode *dir, struct qstr *name,
-		struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+		struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+		struct nfs4_label *label)
 {
 	struct nfs_diropargs	arg = {
 		.fh		= NFS_FH(dir),
@@ -243,7 +244,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	if (status == 0)
-		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
 	nfs_free_createdata(data);
 out:
 	dprintk("NFS reply create: %d\n", status);
@@ -290,7 +291,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	}
 	if (status == 0)
-		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
 	nfs_free_createdata(data);
 out:
 	dprintk("NFS reply mknod: %d\n", status);
@@ -442,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
 	 * should fill in the data with a LOOKUP call on the wire.
 	 */
 	if (status == 0)
-		status = nfs_instantiate(dentry, fh, fattr);
+		status = nfs_instantiate(dentry, fh, fattr, NULL);
 
 out_free:
 	nfs_free_fattr(fattr);
@@ -471,7 +472,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 	if (status == 0)
-		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
 	nfs_free_createdata(data);
 out:
 	dprintk("NFS reply mkdir: %d\n", status);
@@ -662,7 +663,7 @@ nfs_proc_commit_setup(struct nfs_commit_data *data, struct rpc_message *msg)
 static int
 nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 
 	return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
 }
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index a5e5d9899d5..70a26c651f0 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -514,6 +514,8 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
 	NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
+		rpc_exit(task, -EIO);
 }
 
 static const struct rpc_call_ops nfs_read_common_ops = {
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b056b162872..71fdc0dfa0d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -31,6 +31,7 @@
 #include <linux/errno.h>
 #include <linux/unistd.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/metrics.h>
 #include <linux/sunrpc/xprtsock.h>
@@ -54,7 +55,6 @@
 #include <linux/parser.h>
 #include <linux/nsproxy.h>
 #include <linux/rcupdate.h>
-#include <linux/kthread.h>
 
 #include <asm/uaccess.h>
 
@@ -269,7 +269,7 @@ static match_table_t nfs_local_lock_tokens = {
 
 enum {
 	Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0,
-	Opt_vers_4_1,
+	Opt_vers_4_1, Opt_vers_4_2,
 
 	Opt_vers_err
 };
@@ -280,6 +280,7 @@ static match_table_t nfs_vers_tokens = {
 	{ Opt_vers_4, "4" },
 	{ Opt_vers_4_0, "4.0" },
 	{ Opt_vers_4_1, "4.1" },
+	{ Opt_vers_4_2, "4.2" },
 
 	{ Opt_vers_err, NULL }
 };
@@ -292,8 +293,9 @@ struct file_system_type nfs_fs_type = {
 	.name		= "nfs",
 	.mount		= nfs_fs_mount,
 	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
 };
+MODULE_ALIAS_FS("nfs");
 EXPORT_SYMBOL_GPL(nfs_fs_type);
 
 struct file_system_type nfs_xdev_fs_type = {
@@ -301,7 +303,7 @@ struct file_system_type nfs_xdev_fs_type = {
 	.name		= "nfs",
 	.mount		= nfs_xdev_mount,
 	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
 };
 
 const struct super_operations nfs_sops = {
@@ -331,8 +333,10 @@ struct file_system_type nfs4_fs_type = {
 	.name		= "nfs4",
 	.mount		= nfs_fs_mount,
 	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA,
 };
+MODULE_ALIAS_FS("nfs4");
+MODULE_ALIAS("nfs4");
 EXPORT_SYMBOL_GPL(nfs4_fs_type);
 
 static int __init register_nfs4_fs(void)
@@ -418,54 +422,6 @@ void nfs_sb_deactive(struct super_block *sb)
 }
 EXPORT_SYMBOL_GPL(nfs_sb_deactive);
 
-static int nfs_deactivate_super_async_work(void *ptr)
-{
-	struct super_block *sb = ptr;
-
-	deactivate_super(sb);
-	module_put_and_exit(0);
-	return 0;
-}
-
-/*
- * same effect as deactivate_super, but will do final unmount in kthread
- * context
- */
-static void nfs_deactivate_super_async(struct super_block *sb)
-{
-	struct task_struct *task;
-	char buf[INET6_ADDRSTRLEN + 1];
-	struct nfs_server *server = NFS_SB(sb);
-	struct nfs_client *clp = server->nfs_client;
-
-	if (!atomic_add_unless(&sb->s_active, -1, 1)) {
-		rcu_read_lock();
-		snprintf(buf, sizeof(buf),
-			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
-		rcu_read_unlock();
-
-		__module_get(THIS_MODULE);
-		task = kthread_run(nfs_deactivate_super_async_work, sb,
-				"%s-deactivate-super", buf);
-		if (IS_ERR(task)) {
-			pr_err("%s: kthread_run: %ld\n",
-				__func__, PTR_ERR(task));
-			/* make synchronous call and hope for the best */
-			deactivate_super(sb);
-			module_put(THIS_MODULE);
-		}
-	}
-}
-
-void nfs_sb_deactive_async(struct super_block *sb)
-{
-	struct nfs_server *server = NFS_SB(sb);
-
-	if (atomic_dec_and_test(&server->active))
-		nfs_deactivate_super_async(sb);
-}
-EXPORT_SYMBOL_GPL(nfs_sb_deactive_async);
-
 /*
  * Deliver file system statistics to userspace
  */
@@ -877,6 +833,7 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
 		seq_printf(m, "\n\tnfsv4:\t");
 		seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
 		seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
+		seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]);
 		seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
 		show_sessions(m, nfss);
 		show_pnfs(m, nfss);
@@ -965,7 +922,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
 		data->mount_server.port	= NFS_UNSPEC_PORT;
 		data->nfs_server.port	= NFS_UNSPEC_PORT;
 		data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
-		data->auth_flavors[0]	= RPC_AUTH_UNIX;
+		data->auth_flavors[0]	= RPC_AUTH_MAXFLAVOR;
 		data->auth_flavor_len	= 1;
 		data->minorversion	= 0;
 		data->need_mount	= true;
@@ -1142,6 +1099,10 @@ static int nfs_parse_version_string(char *string,
 		mnt->version = 4;
 		mnt->minorversion = 1;
 		break;
+	case Opt_vers_4_2:
+		mnt->version = 4;
+		mnt->minorversion = 2;
+		break;
 	default:
 		return 0;
 	}
@@ -1653,49 +1614,35 @@ out_security_failure:
 }
 
 /*
- * Match the requested auth flavors with the list returned by
- * the server.  Returns zero and sets the mount's authentication
- * flavor on success; returns -EACCES if server does not support
- * the requested flavor.
+ * Ensure that the specified authtype in args->auth_flavors[0] is supported by
+ * the server. Returns 0 if it's ok, and -EACCES if not.
  */
-static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
-			     struct nfs_mount_request *request)
+static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
+			rpc_authflavor_t *server_authlist, unsigned int count)
 {
-	unsigned int i, j, server_authlist_len = *(request->auth_flav_len);
-
-	/*
-	 * Certain releases of Linux's mountd return an empty
-	 * flavor list.  To prevent behavioral regression with
-	 * these servers (ie. rejecting mounts that used to
-	 * succeed), revert to pre-2.6.32 behavior (no checking)
-	 * if the returned flavor list is empty.
-	 */
-	if (server_authlist_len == 0)
-		return 0;
+	unsigned int i;
 
 	/*
-	 * We avoid sophisticated negotiating here, as there are
-	 * plenty of cases where we can get it wrong, providing
-	 * either too little or too much security.
+	 * If the sec= mount option is used, the specified flavor or AUTH_NULL
+	 * must be in the list returned by the server.
 	 *
-	 * RFC 2623, section 2.7 suggests we SHOULD prefer the
-	 * flavor listed first.  However, some servers list
-	 * AUTH_NULL first.  Our caller plants AUTH_SYS, the
-	 * preferred default, in args->auth_flavors[0] if user
-	 * didn't specify sec= mount option.
+	 * AUTH_NULL has a special meaning when it's in the server list - it
+	 * means that the server will ignore the rpc creds, so any flavor
+	 * can be used.
 	 */
-	for (i = 0; i < args->auth_flavor_len; i++)
-		for (j = 0; j < server_authlist_len; j++)
-			if (args->auth_flavors[i] == request->auth_flavs[j]) {
-				dfprintk(MOUNT, "NFS: using auth flavor %d\n",
-					request->auth_flavs[j]);
-				args->auth_flavors[0] = request->auth_flavs[j];
-				return 0;
-			}
+	for (i = 0; i < count; i++) {
+		if (args->auth_flavors[0] == server_authlist[i] ||
+		    server_authlist[i] == RPC_AUTH_NULL)
+			goto out;
+	}
 
-	dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n");
-	nfs_umount(request);
+	dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
+		args->auth_flavors[0]);
 	return -EACCES;
+
+out:
+	dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
+	return 0;
 }
 
 /*
@@ -1703,10 +1650,10 @@ static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
  * corresponding to the provided path.
  */
 static int nfs_request_mount(struct nfs_parsed_mount_data *args,
-			     struct nfs_fh *root_fh)
+			     struct nfs_fh *root_fh,
+			     rpc_authflavor_t *server_authlist,
+			     unsigned int *server_authlist_len)
 {
-	rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
-	unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
 	struct nfs_mount_request request = {
 		.sap		= (struct sockaddr *)
 						&args->mount_server.address,
@@ -1714,7 +1661,7 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
 		.protocol	= args->mount_server.protocol,
 		.fh		= root_fh,
 		.noresvport	= args->flags & NFS_MOUNT_NORESVPORT,
-		.auth_flav_len	= &server_authlist_len,
+		.auth_flav_len	= server_authlist_len,
 		.auth_flavs	= server_authlist,
 		.net		= args->net,
 	};
@@ -1758,29 +1705,92 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
 		return status;
 	}
 
+	return 0;
+}
+
+static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info,
+					struct nfs_subversion *nfs_mod)
+{
+	int status;
+	unsigned int i;
+	bool tried_auth_unix = false;
+	bool auth_null_in_list = false;
+	struct nfs_server *server = ERR_PTR(-EACCES);
+	struct nfs_parsed_mount_data *args = mount_info->parsed;
+	rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS];
+	unsigned int authlist_len = ARRAY_SIZE(authlist);
+
+	status = nfs_request_mount(args, mount_info->mntfh, authlist,
+					&authlist_len);
+	if (status)
+		return ERR_PTR(status);
+
+	/*
+	 * Was a sec= authflavor specified in the options? First, verify
+	 * whether the server supports it, and then just try to use it if so.
+	 */
+	if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
+		status = nfs_verify_authflavor(args, authlist, authlist_len);
+		dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
+		if (status)
+			return ERR_PTR(status);
+		return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+	}
+
 	/*
-	 * MNTv1 (NFSv2) does not support auth flavor negotiation.
+	 * No sec= option was provided. RFC 2623, section 2.7 suggests we
+	 * SHOULD prefer the flavor listed first. However, some servers list
+	 * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
 	 */
-	if (args->mount_server.version != NFS_MNT3_VERSION)
-		return 0;
-	return nfs_walk_authlist(args, &request);
+	for (i = 0; i < authlist_len; ++i) {
+		rpc_authflavor_t flavor;
+		struct rpcsec_gss_info info;
+
+		flavor = authlist[i];
+		switch (flavor) {
+		case RPC_AUTH_UNIX:
+			tried_auth_unix = true;
+			break;
+		case RPC_AUTH_NULL:
+			auth_null_in_list = true;
+			continue;
+		default:
+			if (rpcauth_get_gssinfo(flavor, &info) != 0)
+				continue;
+			/* Fallthrough */
+		}
+		dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
+		args->auth_flavors[0] = flavor;
+		server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
+		if (!IS_ERR(server))
+			return server;
+	}
+
+	/*
+	 * Nothing we tried so far worked. At this point, give up if we've
+	 * already tried AUTH_UNIX or if the server's list doesn't contain
+	 * AUTH_NULL
+	 */
+	if (tried_auth_unix || !auth_null_in_list)
+		return server;
+
+	/* Last chance! Try AUTH_UNIX */
+	dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
+	args->auth_flavors[0] = RPC_AUTH_UNIX;
+	return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
 }
 
 struct dentry *nfs_try_mount(int flags, const char *dev_name,
 			     struct nfs_mount_info *mount_info,
 			     struct nfs_subversion *nfs_mod)
 {
-	int status;
 	struct nfs_server *server;
 
-	if (mount_info->parsed->need_mount) {
-		status = nfs_request_mount(mount_info->parsed, mount_info->mntfh);
-		if (status)
-			return ERR_PTR(status);
-	}
+	if (mount_info->parsed->need_mount)
+		server = nfs_try_mount_request(mount_info, nfs_mod);
+	else
+		server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
 
-	/* Get a volume representation */
-	server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
 	if (IS_ERR(server))
 		return ERR_CAST(server);
 
@@ -1949,6 +1959,7 @@ static int nfs23_validate_mount_data(void *options,
 		args->namlen		= data->namlen;
 		args->bsize		= data->bsize;
 
+		args->auth_flavors[0] = RPC_AUTH_UNIX;
 		if (data->flags & NFS_MOUNT_SECFLAVOUR)
 			args->auth_flavors[0] = data->pseudoflavor;
 		if (!args->nfs_server.hostname)
@@ -2418,7 +2429,21 @@ static int nfs_bdi_register(struct nfs_server *server)
 int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
 			struct nfs_mount_info *mount_info)
 {
-	return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts);
+	int error;
+	unsigned long kflags = 0, kflags_out = 0;
+	if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
+		kflags |= SECURITY_LSM_NATIVE_LABELS;
+
+	error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts,
+						kflags, &kflags_out);
+	if (error)
+		goto err;
+
+	if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
+		!(kflags_out & SECURITY_LSM_NATIVE_LABELS))
+		NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
+err:
+	return error;
 }
 EXPORT_SYMBOL_GPL(nfs_set_sb_security);
 
@@ -2426,10 +2451,9 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot,
 			  struct nfs_mount_info *mount_info)
 {
 	/* clone any lsm security options from the parent to the new sb */
-	security_sb_clone_mnt_opts(mount_info->cloned->sb, s);
 	if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops)
 		return -ESTALE;
-	return 0;
+	return security_sb_clone_mnt_opts(mount_info->cloned->sb, s);
 }
 EXPORT_SYMBOL_GPL(nfs_clone_sb_security);
 
@@ -2645,6 +2669,7 @@ static int nfs4_validate_mount_data(void *options,
 			goto out_no_address;
 		args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
 
+		args->auth_flavors[0] = RPC_AUTH_UNIX;
 		if (data->auth_flavourlen) {
 			if (data->auth_flavourlen > 1)
 				goto out_inval_auth;
@@ -2765,6 +2790,5 @@ module_param(send_implementation_id, ushort, 0644);
 MODULE_PARM_DESC(send_implementation_id,
 		"Send implementation ID with NFSv4.1 exchange_id");
 MODULE_PARM_DESC(nfs4_unique_id, "nfs_client_id4 uniquifier string");
-MODULE_ALIAS("nfs4");
 
 #endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 3f79c77153b..60395ad3a2e 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -95,7 +95,7 @@ static void nfs_async_unlink_release(void *calldata)
 
 	nfs_dec_sillycount(data->dir);
 	nfs_free_unlinkdata(data);
-	nfs_sb_deactive_async(sb);
+	nfs_sb_deactive(sb);
 }
 
 static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
@@ -268,8 +268,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 	 * point dentry is definitely not a root, so we won't need
 	 * that anymore.
 	 */
-	if (devname_garbage)
-		kfree(devname_garbage);
+	kfree(devname_garbage);
 	return 0;
 out_unlock:
 	spin_unlock(&dentry->d_lock);
@@ -336,20 +335,14 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
 	struct inode *old_dir = data->old_dir;
 	struct inode *new_dir = data->new_dir;
 	struct dentry *old_dentry = data->old_dentry;
-	struct dentry *new_dentry = data->new_dentry;
 
 	if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
 		rpc_restart_call_prepare(task);
 		return;
 	}
 
-	if (task->tk_status != 0) {
+	if (task->tk_status != 0)
 		nfs_cancel_async_unlink(old_dentry);
-		return;
-	}
-
-	d_drop(old_dentry);
-	d_drop(new_dentry);
 }
 
 /**
@@ -486,7 +479,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 
 	dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
-		dentry->d_count);
+		d_count(dentry));
 	nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
 
 	/*
@@ -550,6 +543,18 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
 	error = rpc_wait_for_completion_task(task);
 	if (error == 0)
 		error = task->tk_status;
+	switch (error) {
+	case 0:
+		/* The rename succeeded */
+		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+		d_move(dentry, sdentry);
+		break;
+	case -ERESTARTSYS:
+		/* The result of the rename is unknown. Play it safe by
+		 * forcing a new lookup */
+		d_drop(dentry);
+		d_drop(sdentry);
+	}
 	rpc_put_task(task);
 out_dput:
 	dput(sdentry);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c483cc50b82..a2c7c28049d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1251,6 +1251,8 @@ void nfs_write_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
 	NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
+	if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
+		rpc_exit(task, -EIO);
 }
 
 void nfs_commit_prepare(struct rpc_task *task, void *calldata)
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 6940439bd60..ed628f71274 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -38,8 +38,8 @@ struct nfsacl_encode_desc {
 	unsigned int count;
 	struct posix_acl *acl;
 	int typeflag;
-	uid_t uid;
-	gid_t gid;
+	kuid_t uid;
+	kgid_t gid;
 };
 
 struct nfsacl_simple_acl {
@@ -60,14 +60,16 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
 	*p++ = htonl(entry->e_tag | nfsacl_desc->typeflag);
 	switch(entry->e_tag) {
 		case ACL_USER_OBJ:
-			*p++ = htonl(nfsacl_desc->uid);
+			*p++ = htonl(from_kuid(&init_user_ns, nfsacl_desc->uid));
 			break;
 		case ACL_GROUP_OBJ:
-			*p++ = htonl(nfsacl_desc->gid);
+			*p++ = htonl(from_kgid(&init_user_ns, nfsacl_desc->gid));
 			break;
 		case ACL_USER:
+			*p++ = htonl(from_kuid(&init_user_ns, entry->e_uid));
+			break;
 		case ACL_GROUP:
-			*p++ = htonl(entry->e_id);
+			*p++ = htonl(from_kgid(&init_user_ns, entry->e_gid));
 			break;
 		default:  /* Solaris depends on that! */
 			*p++ = 0;
@@ -148,6 +150,7 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)
 		(struct nfsacl_decode_desc *) desc;
 	__be32 *p = elem;
 	struct posix_acl_entry *entry;
+	unsigned int id;
 
 	if (!nfsacl_desc->acl) {
 		if (desc->array_len > NFS_ACL_MAX_ENTRIES)
@@ -160,14 +163,22 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)
 
 	entry = &nfsacl_desc->acl->a_entries[nfsacl_desc->count++];
 	entry->e_tag = ntohl(*p++) & ~NFS_ACL_DEFAULT;
-	entry->e_id = ntohl(*p++);
+	id = ntohl(*p++);
 	entry->e_perm = ntohl(*p++);
 
 	switch(entry->e_tag) {
-		case ACL_USER_OBJ:
 		case ACL_USER:
-		case ACL_GROUP_OBJ:
+			entry->e_uid = make_kuid(&init_user_ns, id);
+			if (!uid_valid(entry->e_uid))
+				return -EINVAL;
+			break;
 		case ACL_GROUP:
+			entry->e_gid = make_kgid(&init_user_ns, id);
+			if (!gid_valid(entry->e_gid))
+				return -EINVAL;
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
 		case ACL_OTHER:
 			if (entry->e_perm & ~S_IRWXO)
 				return -EINVAL;
@@ -190,9 +201,13 @@ cmp_acl_entry(const void *x, const void *y)
 
 	if (a->e_tag != b->e_tag)
 		return a->e_tag - b->e_tag;
-	else if (a->e_id > b->e_id)
+	else if ((a->e_tag == ACL_USER) && uid_gt(a->e_uid, b->e_uid))
+		return 1;
+	else if ((a->e_tag == ACL_USER) && uid_lt(a->e_uid, b->e_uid))
+		return -1;
+	else if ((a->e_tag == ACL_GROUP) && gid_gt(a->e_gid, b->e_gid))
 		return 1;
-	else if (a->e_id < b->e_id)
+	else if ((a->e_tag == ACL_GROUP) && gid_lt(a->e_gid, b->e_gid))
 		return -1;
 	else
 		return 0;
@@ -213,22 +228,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl)
 	sort(acl->a_entries, acl->a_count, sizeof(struct posix_acl_entry),
 	     cmp_acl_entry, NULL);
 
-	/* Clear undefined identifier fields and find the ACL_GROUP_OBJ
-	   and ACL_MASK entries. */
+	/* Find the ACL_GROUP_OBJ and ACL_MASK entries. */
 	FOREACH_ACL_ENTRY(pa, acl, pe) {
 		switch(pa->e_tag) {
 			case ACL_USER_OBJ:
-				pa->e_id = ACL_UNDEFINED_ID;
 				break;
 			case ACL_GROUP_OBJ:
-				pa->e_id = ACL_UNDEFINED_ID;
 				group_obj = pa;
 				break;
 			case ACL_MASK:
 				mask = pa;
 				/* fall through */
 			case ACL_OTHER:
-				pa->e_id = ACL_UNDEFINED_ID;
 				break;
 		}
 	}
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 8df1ea4a6ff..430b6872806 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -65,8 +65,8 @@ config NFSD_V3_ACL
 	  If unsure, say N.
 
 config NFSD_V4
-	bool "NFS server support for NFS version 4 (EXPERIMENTAL)"
-	depends on NFSD && PROC_FS && EXPERIMENTAL
+	bool "NFS server support for NFS version 4"
+	depends on NFSD && PROC_FS
 	select NFSD_V3
 	select FS_POSIX_ACL
 	select SUNRPC_GSS
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index 34e5c40af5e..8b186a4955c 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -44,8 +44,6 @@
 struct nfs4_acl *nfs4_acl_new(int);
 int nfs4_acl_get_whotype(char *, u32);
 int nfs4_acl_write_who(int who, char *p);
-int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
-		                        uid_t who, u32 mask);
 
 #define NFS4_ACL_TYPE_DEFAULT	0x01
 #define NFS4_ACL_DIR		0x02
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 34a10d78b83..06cddd57226 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -47,9 +47,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 		if (!gi)
 			goto oom;
 	} else if (flags & NFSEXP_ROOTSQUASH) {
-		if (!new->fsuid)
+		if (uid_eq(new->fsuid, GLOBAL_ROOT_UID))
 			new->fsuid = exp->ex_anon_uid;
-		if (!new->fsgid)
+		if (gid_eq(new->fsgid, GLOBAL_ROOT_GID))
 			new->fsgid = exp->ex_anon_gid;
 
 		gi = groups_alloc(rqgi->ngroups);
@@ -58,7 +58,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 
 		for (i = 0; i < rqgi->ngroups; i++) {
 			if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i)))
-				GROUP_AT(gi, i) = make_kgid(&init_user_ns, exp->ex_anon_gid);
+				GROUP_AT(gi, i) = exp->ex_anon_gid;
 			else
 				GROUP_AT(gi, i) = GROUP_AT(rqgi, i);
 		}
@@ -66,9 +66,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 		gi = get_group_info(rqgi);
 	}
 
-	if (new->fsuid == (uid_t) -1)
+	if (uid_eq(new->fsuid, INVALID_UID))
 		new->fsuid = exp->ex_anon_uid;
-	if (new->fsgid == (gid_t) -1)
+	if (gid_eq(new->fsgid, INVALID_GID))
 		new->fsgid = exp->ex_anon_gid;
 
 	ret = set_groups(new, gi);
@@ -76,7 +76,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 	if (ret < 0)
 		goto error;
 
-	if (new->fsuid)
+	if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID))
 		new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
 	else
 		new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h
index 78b3c0e9382..53325a12ba6 100644
--- a/fs/nfsd/auth.h
+++ b/fs/nfsd/auth.h
@@ -1,6 +1,5 @@
 /*
  * nfsd-specific authentication stuff.
- * uid/gid mapping not yet implemented.
  *
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
@@ -8,11 +7,6 @@
 #ifndef LINUX_NFSD_AUTH_H
 #define LINUX_NFSD_AUTH_H
 
-#define nfsd_luid(rq, uid)	((u32)(uid))
-#define nfsd_lgid(rq, gid)	((u32)(gid))
-#define nfsd_ruid(rq, uid)	((u32)(uid))
-#define nfsd_rgid(rq, gid)	((u32)(gid))
-
 /*
  * Set the current process's fsuid/fsgid etc to those of the NFS
  * client user
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 93cc9d34c45..d5c5b3e0026 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -12,6 +12,10 @@
 
 /*
  * Representation of a reply cache entry.
+ *
+ * Note that we use a sockaddr_in6 to hold the address instead of the more
+ * typical sockaddr_storage. This is for space reasons, since sockaddr_storage
+ * is much larger than a sockaddr_in6.
  */
 struct svc_cacherep {
 	struct hlist_node	c_hash;
@@ -20,11 +24,13 @@ struct svc_cacherep {
 	unsigned char		c_state,	/* unused, inprog, done */
 				c_type,		/* status, buffer */
 				c_secure : 1;	/* req came from port < 1024 */
-	struct sockaddr_in	c_addr;
+	struct sockaddr_in6	c_addr;
 	__be32			c_xid;
 	u32			c_prot;
 	u32			c_proc;
 	u32			c_vers;
+	unsigned int		c_len;
+	__wsum			c_csum;
 	unsigned long		c_timestamp;
 	union {
 		struct kvec	u_vec;
@@ -46,8 +52,7 @@ enum {
 enum {
 	RC_DROPIT,
 	RC_REPLY,
-	RC_DOIT,
-	RC_INTR
+	RC_DOIT
 };
 
 /*
@@ -67,10 +72,17 @@ enum {
  */
 #define RC_DELAY		(HZ/5)
 
+/* Cache entries expire after this time period */
+#define RC_EXPIRE		(120 * HZ)
+
+/* Checksum this amount of the request */
+#define RC_CSUMLEN		(256U)
+
 int	nfsd_reply_cache_init(void);
 void	nfsd_reply_cache_shutdown(void);
 int	nfsd_cache_lookup(struct svc_rqst *);
 void	nfsd_cache_update(struct svc_rqst *, int, __be32 *);
+int	nfsd_reply_cache_stats_open(struct inode *, struct file *);
 
 #ifdef CONFIG_NFSD_V4
 void	nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a3946cf13fc..5f38ea36e26 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -67,11 +67,6 @@ static void expkey_request(struct cache_detail *cd,
 	(*bpp)[-1] = '\n';
 }
 
-static int expkey_upcall(struct cache_detail *cd, struct cache_head *h)
-{
-	return sunrpc_cache_pipe_upcall(cd, h, expkey_request);
-}
-
 static struct svc_expkey *svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new,
 					    struct svc_expkey *old);
 static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *);
@@ -245,7 +240,7 @@ static struct cache_detail svc_expkey_cache_template = {
 	.hash_size	= EXPKEY_HASHMAX,
 	.name		= "nfsd.fh",
 	.cache_put	= expkey_put,
-	.cache_upcall	= expkey_upcall,
+	.cache_request	= expkey_request,
 	.cache_parse	= expkey_parse,
 	.cache_show	= expkey_show,
 	.match		= expkey_match,
@@ -315,6 +310,7 @@ static void svc_export_put(struct kref *ref)
 	path_put(&exp->ex_path);
 	auth_domain_put(exp->ex_client);
 	nfsd4_fslocs_free(&exp->ex_fslocs);
+	kfree(exp->ex_uuid);
 	kfree(exp);
 }
 
@@ -337,11 +333,6 @@ static void svc_export_request(struct cache_detail *cd,
 	(*bpp)[-1] = '\n';
 }
 
-static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
-{
-	return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
-}
-
 static struct svc_export *svc_export_update(struct svc_export *new,
 					    struct svc_export *old);
 static struct svc_export *svc_export_lookup(struct svc_export *);
@@ -544,13 +535,17 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		err = get_int(&mesg, &an_int);
 		if (err)
 			goto out3;
-		exp.ex_anon_uid= an_int;
+		exp.ex_anon_uid= make_kuid(&init_user_ns, an_int);
+		if (!uid_valid(exp.ex_anon_uid))
+			goto out3;
 
 		/* anon gid */
 		err = get_int(&mesg, &an_int);
 		if (err)
 			goto out3;
-		exp.ex_anon_gid= an_int;
+		exp.ex_anon_gid= make_kgid(&init_user_ns, an_int);
+		if (!gid_valid(exp.ex_anon_gid))
+			goto out3;
 
 		/* fsid */
 		err = get_int(&mesg, &an_int);
@@ -613,7 +608,7 @@ out:
 }
 
 static void exp_flags(struct seq_file *m, int flag, int fsid,
-		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs);
+		kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs);
 static void show_secinfo(struct seq_file *m, struct svc_export *exp);
 
 static int svc_export_show(struct seq_file *m,
@@ -670,6 +665,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_fslocs.locations = NULL;
 	new->ex_fslocs.locations_count = 0;
 	new->ex_fslocs.migrated = 0;
+	new->ex_uuid = NULL;
 	new->cd = item->cd;
 }
 
@@ -711,7 +707,7 @@ static struct cache_detail svc_export_cache_template = {
 	.hash_size	= EXPORT_HASHMAX,
 	.name		= "nfsd.export",
 	.cache_put	= svc_export_put,
-	.cache_upcall	= svc_export_upcall,
+	.cache_request	= svc_export_request,
 	.cache_parse	= svc_export_parse,
 	.cache_show	= svc_export_show,
 	.match		= svc_export_match,
@@ -1179,15 +1175,17 @@ static void show_secinfo(struct seq_file *m, struct svc_export *exp)
 }
 
 static void exp_flags(struct seq_file *m, int flag, int fsid,
-		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc)
+		kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc)
 {
 	show_expflags(m, flag, NFSEXP_ALLFLAGS);
 	if (flag & NFSEXP_FSID)
 		seq_printf(m, ",fsid=%d", fsid);
-	if (anonu != (uid_t)-2 && anonu != (0x10000-2))
-		seq_printf(m, ",anonuid=%u", anonu);
-	if (anong != (gid_t)-2 && anong != (0x10000-2))
-		seq_printf(m, ",anongid=%u", anong);
+	if (!uid_eq(anonu, make_kuid(&init_user_ns, (uid_t)-2)) &&
+	    !uid_eq(anonu, make_kuid(&init_user_ns, 0x10000-2)))
+		seq_printf(m, ",anonuid=%u", from_kuid(&init_user_ns, anonu));
+	if (!gid_eq(anong, make_kgid(&init_user_ns, (gid_t)-2)) &&
+	    !gid_eq(anong, make_kgid(&init_user_ns, 0x10000-2)))
+		seq_printf(m, ",anongid=%u", from_kgid(&init_user_ns, anong));
 	if (fsloc && fsloc->locations_count > 0) {
 		char *loctype = (fsloc->migrated) ? "refer" : "replicas";
 		int i;
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index e761ee95617..d620e7f8142 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -9,7 +9,7 @@
 #include <linux/debugfs.h>
 #include <linux/module.h>
 #include <linux/nsproxy.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
 #include <asm/uaccess.h>
 
 #include "state.h"
@@ -101,7 +101,7 @@ static ssize_t fault_inject_read(struct file *file, char __user *buf,
 	loff_t pos = *ppos;
 
 	if (!pos)
-		nfsd_inject_get(file->f_dentry->d_inode->i_private, &val);
+		nfsd_inject_get(file_inode(file)->i_private, &val);
 	size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val);
 
 	if (pos < 0)
@@ -133,10 +133,10 @@ static ssize_t fault_inject_write(struct file *file, const char __user *buf,
 
 	size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa));
 	if (size > 0)
-		nfsd_inject_set_client(file->f_dentry->d_inode->i_private, &sa, size);
+		nfsd_inject_set_client(file_inode(file)->i_private, &sa, size);
 	else {
 		val = simple_strtoll(write_buf, NULL, 0);
-		nfsd_inject_set(file->f_dentry->d_inode->i_private, val);
+		nfsd_inject_set(file_inode(file)->i_private, val);
 	}
 	return len; /* on success, claim we got the whole input */
 }
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
index 9d513efc01b..bf95f6b817a 100644
--- a/fs/nfsd/idmap.h
+++ b/fs/nfsd/idmap.h
@@ -54,9 +54,9 @@ static inline void nfsd_idmap_shutdown(struct net *net)
 }
 #endif
 
-__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
-__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
-int nfsd_map_uid_to_name(struct svc_rqst *, __u32, char *);
-int nfsd_map_gid_to_name(struct svc_rqst *, __u32, char *);
+__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *);
+__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *);
+int nfsd_map_uid_to_name(struct svc_rqst *, kuid_t, char *);
+int nfsd_map_gid_to_name(struct svc_rqst *, kgid_t, char *);
 
 #endif /* LINUX_NFSD_IDMAP_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 1051bebff1b..849a7c3ced2 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -80,6 +80,7 @@ struct nfsd_net {
 	 */
 	struct list_head client_lru;
 	struct list_head close_lru;
+	struct list_head del_recall_lru;
 
 	struct delayed_work laundromat_work;
 
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 9170861c804..95d76dc6c5d 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -45,6 +45,10 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
 		RETURN_STATUS(nfserr_inval);
 	resp->mask = argp->mask;
 
+	nfserr = fh_getattr(fh, &resp->stat);
+	if (nfserr)
+		goto fail;
+
 	if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
 		acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS);
 		if (IS_ERR(acl)) {
@@ -115,6 +119,9 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
 		nfserr = nfserrno( nfsd_set_posix_acl(
 			fh, ACL_TYPE_DEFAULT, argp->acl_default) );
 	}
+	if (!nfserr) {
+		nfserr = fh_getattr(fh, &resp->stat);
+	}
 
 	/* argp->acl_{access,default} may have been allocated in
 	   nfssvc_decode_setaclargs. */
@@ -129,10 +136,15 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
 static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp,
 		struct nfsd_fhandle *argp, struct nfsd_attrstat *resp)
 {
+	__be32 nfserr;
 	dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));
 
 	fh_copy(&resp->fh, &argp->fh);
-	return fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+	if (nfserr)
+		return nfserr;
+	nfserr = fh_getattr(&resp->fh, &resp->stat);
+	return nfserr;
 }
 
 /*
@@ -150,6 +162,9 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessarg
 	fh_copy(&resp->fh, &argp->fh);
 	resp->access = argp->access;
 	nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL);
+	if (nfserr)
+		return nfserr;
+	nfserr = fh_getattr(&resp->fh, &resp->stat);
 	return nfserr;
 }
 
@@ -243,7 +258,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
 		return 0;
 	inode = dentry->d_inode;
 
-	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
+	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	*p++ = htonl(resp->mask);
 	if (!xdr_ressize_check(rqstp, p))
 		return 0;
@@ -274,7 +289,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
 static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd_attrstat *resp)
 {
-	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
+	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	return xdr_ressize_check(rqstp, p);
 }
 
@@ -282,7 +297,7 @@ static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p,
 static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_accessres *resp)
 {
-	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
+	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
 	*p++ = htonl(resp->access);
 	return xdr_ressize_check(rqstp, p);
 }
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 1fc02dfdc5c..40128991313 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -43,7 +43,6 @@ static __be32
 nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
 					   struct nfsd3_attrstat *resp)
 {
-	int	err;
 	__be32	nfserr;
 
 	dprintk("nfsd: GETATTR(3)  %s\n",
@@ -55,9 +54,7 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
 	if (nfserr)
 		RETURN_STATUS(nfserr);
 
-	err = vfs_getattr(resp->fh.fh_export->ex_path.mnt,
-			  resp->fh.fh_dentry, &resp->stat);
-	nfserr = nfserrno(err);
+	nfserr = fh_getattr(&resp->fh, &resp->stat);
 
 	RETURN_STATUS(nfserr);
 }
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 324c0baf7cd..14d9ecb96cf 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -11,6 +11,7 @@
 #include "xdr3.h"
 #include "auth.h"
 #include "netns.h"
+#include "vfs.h"
 
 #define NFSDDBG_FACILITY		NFSDDBG_XDR
 
@@ -105,12 +106,14 @@ decode_sattr3(__be32 *p, struct iattr *iap)
 		iap->ia_mode = ntohl(*p++);
 	}
 	if (*p++) {
-		iap->ia_valid |= ATTR_UID;
-		iap->ia_uid = ntohl(*p++);
+		iap->ia_uid = make_kuid(&init_user_ns, ntohl(*p++));
+		if (uid_valid(iap->ia_uid))
+			iap->ia_valid |= ATTR_UID;
 	}
 	if (*p++) {
-		iap->ia_valid |= ATTR_GID;
-		iap->ia_gid = ntohl(*p++);
+		iap->ia_gid = make_kgid(&init_user_ns, ntohl(*p++));
+		if (gid_valid(iap->ia_gid))
+			iap->ia_valid |= ATTR_GID;
 	}
 	if (*p++) {
 		u64	newsize;
@@ -167,8 +170,8 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 	*p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
 	*p++ = htonl((u32) stat->mode);
 	*p++ = htonl((u32) stat->nlink);
-	*p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
-	*p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
+	*p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
+	*p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
 	if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) {
 		p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN);
 	} else {
@@ -204,10 +207,10 @@ encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct dentry *dentry = fhp->fh_dentry;
 	if (dentry && dentry->d_inode) {
-	        int err;
+	        __be32 err;
 		struct kstat stat;
 
-		err = vfs_getattr(fhp->fh_export->ex_path.mnt, dentry, &stat);
+		err = fh_getattr(fhp, &stat);
 		if (!err) {
 			*p++ = xdr_one;		/* attributes follow */
 			lease_get_mtime(dentry->d_inode, &stat.mtime);
@@ -254,13 +257,12 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
  */
 void fill_post_wcc(struct svc_fh *fhp)
 {
-	int err;
+	__be32 err;
 
 	if (fhp->fh_post_saved)
 		printk("nfsd: inode locked twice during operation.\n");
 
-	err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
-			&fhp->fh_post_attr);
+	err = fh_getattr(fhp, &fhp->fh_post_attr);
 	fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
 	if (err) {
 		fhp->fh_post_saved = 0;
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 9c51aff02ae..8a50b3c1809 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -264,7 +264,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
 			ace->flag = eflag;
 			ace->access_mask = deny_mask_from_posix(deny, flags);
 			ace->whotype = NFS4_ACL_WHO_NAMED;
-			ace->who = pa->e_id;
+			ace->who_uid = pa->e_uid;
 			ace++;
 			acl->naces++;
 		}
@@ -273,7 +273,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
 		ace->access_mask = mask_from_posix(pa->e_perm & pas.mask,
 						   flags);
 		ace->whotype = NFS4_ACL_WHO_NAMED;
-		ace->who = pa->e_id;
+		ace->who_uid = pa->e_uid;
 		ace++;
 		acl->naces++;
 		pa++;
@@ -300,7 +300,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
 		ace->access_mask = mask_from_posix(pa->e_perm & pas.mask,
 						   flags);
 		ace->whotype = NFS4_ACL_WHO_NAMED;
-		ace->who = pa->e_id;
+		ace->who_gid = pa->e_gid;
 		ace++;
 		acl->naces++;
 		pa++;
@@ -329,7 +329,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
 			ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
 			ace->access_mask = deny_mask_from_posix(deny, flags);
 			ace->whotype = NFS4_ACL_WHO_NAMED;
-			ace->who = pa->e_id;
+			ace->who_gid = pa->e_gid;
 			ace++;
 			acl->naces++;
 		}
@@ -345,6 +345,18 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
 	acl->naces++;
 }
 
+static bool
+pace_gt(struct posix_acl_entry *pace1, struct posix_acl_entry *pace2)
+{
+	if (pace1->e_tag != pace2->e_tag)
+		return pace1->e_tag > pace2->e_tag;
+	if (pace1->e_tag == ACL_USER)
+		return uid_gt(pace1->e_uid, pace2->e_uid);
+	if (pace1->e_tag == ACL_GROUP)
+		return gid_gt(pace1->e_gid, pace2->e_gid);
+	return false;
+}
+
 static void
 sort_pacl_range(struct posix_acl *pacl, int start, int end) {
 	int sorted = 0, i;
@@ -355,8 +367,8 @@ sort_pacl_range(struct posix_acl *pacl, int start, int end) {
 	while (!sorted) {
 		sorted = 1;
 		for (i = start; i < end; i++) {
-			if (pacl->a_entries[i].e_id
-					> pacl->a_entries[i+1].e_id) {
+			if (pace_gt(&pacl->a_entries[i],
+				    &pacl->a_entries[i+1])) {
 				sorted = 0;
 				tmp = pacl->a_entries[i];
 				pacl->a_entries[i] = pacl->a_entries[i+1];
@@ -398,7 +410,10 @@ struct posix_ace_state {
 };
 
 struct posix_user_ace_state {
-	uid_t uid;
+	union {
+		kuid_t uid;
+		kgid_t gid;
+	};
 	struct posix_ace_state perms;
 };
 
@@ -521,7 +536,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
 	if (error)
 		goto out_err;
 	low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags);
-	pace->e_id = ACL_UNDEFINED_ID;
 
 	for (i=0; i < state->users->n; i++) {
 		pace++;
@@ -531,7 +545,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
 			goto out_err;
 		low_mode_from_nfs4(state->users->aces[i].perms.allow,
 					&pace->e_perm, flags);
-		pace->e_id = state->users->aces[i].uid;
+		pace->e_uid = state->users->aces[i].uid;
 		add_to_mask(state, &state->users->aces[i].perms);
 	}
 
@@ -541,7 +555,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
 	if (error)
 		goto out_err;
 	low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags);
-	pace->e_id = ACL_UNDEFINED_ID;
 	add_to_mask(state, &state->group);
 
 	for (i=0; i < state->groups->n; i++) {
@@ -552,14 +565,13 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
 			goto out_err;
 		low_mode_from_nfs4(state->groups->aces[i].perms.allow,
 					&pace->e_perm, flags);
-		pace->e_id = state->groups->aces[i].uid;
+		pace->e_gid = state->groups->aces[i].gid;
 		add_to_mask(state, &state->groups->aces[i].perms);
 	}
 
 	pace++;
 	pace->e_tag = ACL_MASK;
 	low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
-	pace->e_id = ACL_UNDEFINED_ID;
 
 	pace++;
 	pace->e_tag = ACL_OTHER;
@@ -567,7 +579,6 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
 	if (error)
 		goto out_err;
 	low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags);
-	pace->e_id = ACL_UNDEFINED_ID;
 
 	return pacl;
 out_err:
@@ -587,12 +598,13 @@ static inline void deny_bits(struct posix_ace_state *astate, u32 mask)
 	astate->deny |= mask & ~astate->allow;
 }
 
-static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array *a, uid_t uid)
+static int find_uid(struct posix_acl_state *state, kuid_t uid)
 {
+	struct posix_ace_state_array *a = state->users;
 	int i;
 
 	for (i = 0; i < a->n; i++)
-		if (a->aces[i].uid == uid)
+		if (uid_eq(a->aces[i].uid, uid))
 			return i;
 	/* Not found: */
 	a->n++;
@@ -603,6 +615,23 @@ static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array
 	return i;
 }
 
+static int find_gid(struct posix_acl_state *state, kgid_t gid)
+{
+	struct posix_ace_state_array *a = state->groups;
+	int i;
+
+	for (i = 0; i < a->n; i++)
+		if (gid_eq(a->aces[i].gid, gid))
+			return i;
+	/* Not found: */
+	a->n++;
+	a->aces[i].gid = gid;
+	a->aces[i].perms.allow = state->everyone.allow;
+	a->aces[i].perms.deny  = state->everyone.deny;
+
+	return i;
+}
+
 static void deny_bits_array(struct posix_ace_state_array *a, u32 mask)
 {
 	int i;
@@ -636,7 +665,7 @@ static void process_one_v4_ace(struct posix_acl_state *state,
 		}
 		break;
 	case ACL_USER:
-		i = find_uid(state, state->users, ace->who);
+		i = find_uid(state, ace->who_uid);
 		if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
 			allow_bits(&state->users->aces[i].perms, mask);
 		} else {
@@ -658,7 +687,7 @@ static void process_one_v4_ace(struct posix_acl_state *state,
 		}
 		break;
 	case ACL_GROUP:
-		i = find_uid(state, state->groups, ace->who);
+		i = find_gid(state, ace->who_gid);
 		if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
 			allow_bits(&state->groups->aces[i].perms, mask);
 		} else {
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 99bc85ff021..7f05cd140de 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -37,6 +37,7 @@
 #include "nfsd.h"
 #include "state.h"
 #include "netns.h"
+#include "xdr4cb.h"
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -53,30 +54,6 @@ enum {
 	NFSPROC4_CLNT_CB_SEQUENCE,
 };
 
-#define NFS4_MAXTAGLEN		20
-
-#define NFS4_enc_cb_null_sz		0
-#define NFS4_dec_cb_null_sz		0
-#define cb_compound_enc_hdr_sz		4
-#define cb_compound_dec_hdr_sz		(3 + (NFS4_MAXTAGLEN >> 2))
-#define sessionid_sz			(NFS4_MAX_SESSIONID_LEN >> 2)
-#define cb_sequence_enc_sz		(sessionid_sz + 4 +             \
-					1 /* no referring calls list yet */)
-#define cb_sequence_dec_sz		(op_dec_sz + sessionid_sz + 4)
-
-#define op_enc_sz			1
-#define op_dec_sz			2
-#define enc_nfs4_fh_sz			(1 + (NFS4_FHSIZE >> 2))
-#define enc_stateid_sz			(NFS4_STATEID_SIZE >> 2)
-#define NFS4_enc_cb_recall_sz		(cb_compound_enc_hdr_sz +       \
-					cb_sequence_enc_sz +            \
-					1 + enc_stateid_sz +            \
-					enc_nfs4_fh_sz)
-
-#define NFS4_dec_cb_recall_sz		(cb_compound_dec_hdr_sz  +      \
-					cb_sequence_dec_sz +            \
-					op_dec_sz)
-
 struct nfs4_cb_compound_hdr {
 	/* args */
 	u32		ident;	/* minorversion 0 only */
@@ -817,8 +794,7 @@ static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task)
 static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfsd4_callback *cb = calldata;
-	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
-	struct nfs4_client *clp = dp->dl_stid.sc_client;
+	struct nfs4_client *clp = cb->cb_clp;
 	u32 minorversion = clp->cl_minorversion;
 
 	cb->cb_minorversion = minorversion;
@@ -839,8 +815,7 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 {
 	struct nfsd4_callback *cb = calldata;
-	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
-	struct nfs4_client *clp = dp->dl_stid.sc_client;
+	struct nfs4_client *clp = cb->cb_clp;
 
 	dprintk("%s: minorversion=%d\n", __func__,
 		clp->cl_minorversion);
@@ -863,7 +838,7 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
 {
 	struct nfsd4_callback *cb = calldata;
 	struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
-	struct nfs4_client *clp = dp->dl_stid.sc_client;
+	struct nfs4_client *clp = cb->cb_clp;
 	struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
 
 	nfsd4_cb_done(task, calldata);
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index a1f10c0a625..4832fd819f8 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -65,7 +65,7 @@ MODULE_PARM_DESC(nfs4_disable_idmapping,
 struct ent {
 	struct cache_head h;
 	int               type;		       /* User / Group */
-	uid_t             id;
+	u32               id;
 	char              name[IDMAP_NAMESZ];
 	char              authname[IDMAP_NAMESZ];
 };
@@ -140,12 +140,6 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
 }
 
 static int
-idtoname_upcall(struct cache_detail *cd, struct cache_head *ch)
-{
-	return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request);
-}
-
-static int
 idtoname_match(struct cache_head *ca, struct cache_head *cb)
 {
 	struct ent *a = container_of(ca, struct ent, h);
@@ -192,7 +186,7 @@ static struct cache_detail idtoname_cache_template = {
 	.hash_size	= ENT_HASHMAX,
 	.name		= "nfs4.idtoname",
 	.cache_put	= ent_put,
-	.cache_upcall	= idtoname_upcall,
+	.cache_request	= idtoname_request,
 	.cache_parse	= idtoname_parse,
 	.cache_show	= idtoname_show,
 	.warn_no_listener = warn_no_idmapd,
@@ -321,12 +315,6 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
 }
 
 static int
-nametoid_upcall(struct cache_detail *cd, struct cache_head *ch)
-{
-	return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request);
-}
-
-static int
 nametoid_match(struct cache_head *ca, struct cache_head *cb)
 {
 	struct ent *a = container_of(ca, struct ent, h);
@@ -365,7 +353,7 @@ static struct cache_detail nametoid_cache_template = {
 	.hash_size	= ENT_HASHMAX,
 	.name		= "nfs4.nametoid",
 	.cache_put	= ent_put,
-	.cache_upcall	= nametoid_upcall,
+	.cache_request	= nametoid_request,
 	.cache_parse	= nametoid_parse,
 	.cache_show	= nametoid_show,
 	.warn_no_listener = warn_no_idmapd,
@@ -540,7 +528,7 @@ rqst_authname(struct svc_rqst *rqstp)
 
 static __be32
 idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen,
-		uid_t *id)
+		u32 *id)
 {
 	struct ent *item, key = {
 		.type = type,
@@ -564,7 +552,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
 }
 
 static int
-idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
+idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name)
 {
 	struct ent *item, key = {
 		.id = id,
@@ -587,7 +575,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
 }
 
 static bool
-numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id)
+numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id)
 {
 	int ret;
 	char buf[11];
@@ -603,7 +591,7 @@ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namel
 }
 
 static __be32
-do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id)
+do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id)
 {
 	if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
 		if (numeric_name_to_id(rqstp, type, name, namelen, id))
@@ -616,7 +604,7 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u
 }
 
 static int
-do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
+do_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name)
 {
 	if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
 		return sprintf(name, "%u", id);
@@ -625,26 +613,40 @@ do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
 
 __be32
 nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen,
-		__u32 *id)
+		kuid_t *uid)
 {
-	return do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, id);
+	__be32 status;
+	u32 id = -1;
+	status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id);
+	*uid = make_kuid(&init_user_ns, id);
+	if (!uid_valid(*uid))
+		status = nfserr_badowner;
+	return status;
 }
 
 __be32
 nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
-		__u32 *id)
+		kgid_t *gid)
 {
-	return do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, id);
+	__be32 status;
+	u32 id = -1;
+	status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id);
+	*gid = make_kgid(&init_user_ns, id);
+	if (!gid_valid(*gid))
+		status = nfserr_badowner;
+	return status;
 }
 
 int
-nfsd_map_uid_to_name(struct svc_rqst *rqstp, __u32 id, char *name)
+nfsd_map_uid_to_name(struct svc_rqst *rqstp, kuid_t uid, char *name)
 {
+	u32 id = from_kuid(&init_user_ns, uid);
 	return do_id_to_name(rqstp, IDMAP_TYPE_USER, id, name);
 }
 
 int
-nfsd_map_gid_to_name(struct svc_rqst *rqstp, __u32 id, char *name)
+nfsd_map_gid_to_name(struct svc_rqst *rqstp, kgid_t gid, char *name)
 {
+	u32 id = from_kgid(&init_user_ns, gid);
 	return do_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name);
 }
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9d1c5dba2bb..27d74a29451 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -191,9 +191,18 @@ static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
 	return nfserr_symlink;
 }
 
+static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh *resfh)
+{
+	if (nfsd4_has_session(cstate))
+		return;
+	fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
+			&resfh->fh_handle);
+}
+
 static __be32
-do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
+do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
 {
+	struct svc_fh *current_fh = &cstate->current_fh;
 	struct svc_fh *resfh;
 	int accmode;
 	__be32 status;
@@ -252,9 +261,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 	if (is_create_with_attrs(open) && open->op_acl != NULL)
 		do_set_nfs4_acl(rqstp, resfh, open->op_acl, open->op_bmval);
 
-	/* set reply cache */
-	fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
-			&resfh->fh_handle);
+	nfsd4_set_open_owner_reply_cache(cstate, open, resfh);
 	accmode = NFSD_MAY_NOP;
 	if (open->op_created)
 		accmode |= NFSD_MAY_OWNER_OVERRIDE;
@@ -268,9 +275,11 @@ out:
 }
 
 static __be32
-do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
+do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
 {
+	struct svc_fh *current_fh = &cstate->current_fh;
 	__be32 status;
+	int accmode = 0;
 
 	/* We don't know the target directory, and therefore can not
 	* set the change info
@@ -278,15 +287,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 
 	memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
 
-	/* set replay cache */
-	fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
-			&current_fh->fh_handle);
+	nfsd4_set_open_owner_reply_cache(cstate, open, current_fh);
 
 	open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
 		(open->op_iattr.ia_size == 0);
+	/*
+	 * In the delegation case, the client is telling us about an
+	 * open that it *already* performed locally, some time ago.  We
+	 * should let it succeed now if possible.
+	 *
+	 * In the case of a CLAIM_FH open, on the other hand, the client
+	 * may be counting on us to enforce permissions (the Linux 4.1
+	 * client uses this for normal opens, for example).
+	 */
+	if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH)
+		accmode = NFSD_MAY_OWNER_OVERRIDE;
 
-	status = do_open_permission(rqstp, current_fh, open,
-				    NFSD_MAY_OWNER_OVERRIDE);
+	status = do_open_permission(rqstp, current_fh, open, accmode);
 
 	return status;
 }
@@ -351,6 +368,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 	if (status)
 		goto out;
+	if (open->op_xdr_error) {
+		status = open->op_xdr_error;
+		goto out;
+	}
 
 	status = nfsd4_check_open_attributes(rqstp, cstate, open);
 	if (status)
@@ -368,8 +389,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_DELEGATE_CUR:
 		case NFS4_OPEN_CLAIM_NULL:
-			status = do_open_lookup(rqstp, &cstate->current_fh,
-						open);
+			status = do_open_lookup(rqstp, cstate, open);
 			if (status)
 				goto out;
 			break;
@@ -382,8 +402,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 				goto out;
 		case NFS4_OPEN_CLAIM_FH:
 		case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
-			status = do_open_fhandle(rqstp, &cstate->current_fh,
-						 open);
+			status = do_open_fhandle(rqstp, cstate, open);
 			if (status)
 				goto out;
 			break;
@@ -409,14 +428,33 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	WARN_ON(status && open->op_created);
 out:
 	nfsd4_cleanup_open_state(open, status);
-	if (open->op_openowner)
+	if (open->op_openowner && !nfsd4_has_session(cstate))
 		cstate->replay_owner = &open->op_openowner->oo_owner;
-	else
+	nfsd4_bump_seqid(cstate, status);
+	if (!cstate->replay_owner)
 		nfs4_unlock_state();
 	return status;
 }
 
 /*
+ * OPEN is the only seqid-mutating operation whose decoding can fail
+ * with a seqid-mutating error (specifically, decoding of user names in
+ * the attributes).  Therefore we have to do some processing to look up
+ * the stateowner so that we can bump the seqid.
+ */
+static __be32 nfsd4_open_omfg(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_op *op)
+{
+	struct nfsd4_open *open = (struct nfsd4_open *)&op->u;
+
+	if (!seqid_mutating_err(ntohl(op->status)))
+		return op->status;
+	if (nfsd4_has_session(cstate))
+		return op->status;
+	open->op_xdr_error = op->status;
+	return nfsd4_open(rqstp, cstate, open);
+}
+
+/*
  * filehandle-manipulating ops.
  */
 static __be32
@@ -786,21 +824,11 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
 			     rename->rn_snamelen, &cstate->current_fh,
 			     rename->rn_tname, rename->rn_tnamelen);
-
-	/* the underlying filesystem returns different error's than required
-	 * by NFSv4. both save_fh and current_fh have been verified.. */
-	if (status == nfserr_isdir)
-		status = nfserr_exist;
-	else if ((status == nfserr_notdir) &&
-                  (S_ISDIR(cstate->save_fh.fh_dentry->d_inode->i_mode) &&
-                   S_ISDIR(cstate->current_fh.fh_dentry->d_inode->i_mode)))
-		status = nfserr_exist;
-
-	if (!status) {
-		set_change_info(&rename->rn_sinfo, &cstate->current_fh);
-		set_change_info(&rename->rn_tinfo, &cstate->save_fh);
-	}
-	return status;
+	if (status)
+		return status;
+	set_change_info(&rename->rn_sinfo, &cstate->current_fh);
+	set_change_info(&rename->rn_tinfo, &cstate->save_fh);
+	return nfs_ok;
 }
 
 static __be32
@@ -931,14 +959,14 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfs4_lock_state();
 	status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
 					cstate, stateid, WR_STATE, &filp);
-	if (filp)
-		get_file(filp);
-	nfs4_unlock_state();
-
 	if (status) {
+		nfs4_unlock_state();
 		dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
 		return status;
 	}
+	if (filp)
+		get_file(filp);
+	nfs4_unlock_state();
 
 	cnt = write->wr_buflen;
 	write->wr_how_written = write->wr_stable_how;
@@ -993,14 +1021,15 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (!buf)
 		return nfserr_jukebox;
 
+	p = buf;
 	status = nfsd4_encode_fattr(&cstate->current_fh,
 				    cstate->current_fh.fh_export,
-				    cstate->current_fh.fh_dentry, buf,
-				    &count, verify->ve_bmval,
+				    cstate->current_fh.fh_dentry, &p,
+				    count, verify->ve_bmval,
 				    rqstp, 0);
 
 	/* this means that nfsd4_encode_fattr() ran out of space */
-	if (status == nfserr_resource && count == 0)
+	if (status == nfserr_resource)
 		status = nfserr_not_same;
 	if (status)
 		goto out_kfree;
@@ -1243,8 +1272,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		 * for example, if there is a miscellaneous XDR error
 		 * it will be set to nfserr_bad_xdr.
 		 */
-		if (op->status)
+		if (op->status) {
+			if (op->opnum == OP_OPEN)
+				op->status = nfsd4_open_omfg(rqstp, cstate, op);
 			goto encode_op;
+		}
 
 		/* We must be able to encode a successful response to
 		 * this operation, with enough room left over to encode a
@@ -1281,12 +1313,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		if (op->status)
 			goto encode_op;
 
-		if (opdesc->op_func) {
-			if (opdesc->op_get_currentstateid)
-				opdesc->op_get_currentstateid(cstate, &op->u);
-			op->status = opdesc->op_func(rqstp, cstate, &op->u);
-		} else
-			BUG_ON(op->status == nfs_ok);
+		if (opdesc->op_get_currentstateid)
+			opdesc->op_get_currentstateid(cstate, &op->u);
+		op->status = opdesc->op_func(rqstp, cstate, &op->u);
 
 		if (!op->status) {
 			if (opdesc->op_set_currentstateid)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index ba6fdd4a045..105a3b080d1 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -73,8 +73,8 @@ nfs4_save_creds(const struct cred **original_creds)
 	if (!new)
 		return -ENOMEM;
 
-	new->fsuid = 0;
-	new->fsgid = 0;
+	new->fsuid = GLOBAL_ROOT_UID;
+	new->fsgid = GLOBAL_ROOT_GID;
 	*original_creds = override_creds(new);
 	put_cred(new);
 	return 0;
@@ -146,7 +146,7 @@ out_no_tfm:
  * then disable recovery tracking.
  */
 static void
-legacy_recdir_name_error(int error)
+legacy_recdir_name_error(struct nfs4_client *clp, int error)
 {
 	printk(KERN_ERR "NFSD: unable to generate recoverydir "
 			"name (%d).\n", error);
@@ -159,9 +159,7 @@ legacy_recdir_name_error(int error)
 	if (error == -ENOENT) {
 		printk(KERN_ERR "NFSD: disabling legacy clientid tracking. "
 			"Reboot recovery will not function correctly!\n");
-
-		/* the argument is ignored by the legacy exit function */
-		nfsd4_client_tracking_exit(NULL);
+		nfsd4_client_tracking_exit(clp->net);
 	}
 }
 
@@ -184,7 +182,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 
 	status = nfs4_make_rec_clidname(dname, &clp->cl_name);
 	if (status)
-		return legacy_recdir_name_error(status);
+		return legacy_recdir_name_error(clp, status);
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
@@ -242,11 +240,16 @@ struct name_list {
 	struct list_head list;
 };
 
+struct nfs4_dir_ctx {
+	struct dir_context ctx;
+	struct list_head names;
+};
+
 static int
 nfsd4_build_namelist(void *arg, const char *name, int namlen,
 		loff_t offset, u64 ino, unsigned int d_type)
 {
-	struct list_head *names = arg;
+	struct nfs4_dir_ctx *ctx = arg;
 	struct name_list *entry;
 
 	if (namlen != HEXDIR_LEN - 1)
@@ -256,7 +259,7 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen,
 		return -ENOMEM;
 	memcpy(entry->name, name, HEXDIR_LEN - 1);
 	entry->name[HEXDIR_LEN - 1] = '\0';
-	list_add(&entry->list, names);
+	list_add(&entry->list, &ctx->names);
 	return 0;
 }
 
@@ -265,7 +268,10 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
 {
 	const struct cred *original_cred;
 	struct dentry *dir = nn->rec_file->f_path.dentry;
-	LIST_HEAD(names);
+	struct nfs4_dir_ctx ctx = {
+		.ctx.actor = nfsd4_build_namelist,
+		.names = LIST_HEAD_INIT(ctx.names)
+	};
 	int status;
 
 	status = nfs4_save_creds(&original_cred);
@@ -278,11 +284,11 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
 		return status;
 	}
 
-	status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names);
+	status = iterate_dir(nn->rec_file, &ctx.ctx);
 	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
-	while (!list_empty(&names)) {
+	while (!list_empty(&ctx.names)) {
 		struct name_list *entry;
-		entry = list_entry(names.next, struct name_list, list);
+		entry = list_entry(ctx.names.next, struct name_list, list);
 		if (!status) {
 			struct dentry *dentry;
 			dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
@@ -341,7 +347,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 
 	status = nfs4_make_rec_clidname(dname, &clp->cl_name);
 	if (status)
-		return legacy_recdir_name_error(status);
+		return legacy_recdir_name_error(clp, status);
 
 	status = mnt_want_write_file(nn->rec_file);
 	if (status)
@@ -601,7 +607,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
 
 	status = nfs4_make_rec_clidname(dname, &clp->cl_name);
 	if (status) {
-		legacy_recdir_name_error(status);
+		legacy_recdir_name_error(clp, status);
 		return status;
 	}
 
@@ -1185,6 +1191,12 @@ bin_to_hex_dup(const unsigned char *src, int srclen)
 static int
 nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net)
 {
+	/* XXX: The usermode helper s not working in container yet. */
+	if (net != &init_net) {
+		WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
+			"tracking in a container!\n");
+		return -EINVAL;
+	}
 	return nfsd4_umh_cltrack_upcall("init", NULL, NULL);
 }
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ac8ed96c419..f17051838b4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -40,8 +40,9 @@
 #include <linux/pagemap.h>
 #include <linux/ratelimit.h>
 #include <linux/sunrpc/svcauth_gss.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
 #include "xdr4.h"
+#include "xdr4cb.h"
 #include "vfs.h"
 #include "current_stateid.h"
 
@@ -94,17 +95,32 @@ nfs4_lock_state(void)
 	mutex_lock(&client_mutex);
 }
 
-static void free_session(struct kref *);
+static void free_session(struct nfsd4_session *);
 
-/* Must be called under the client_lock */
-static void nfsd4_put_session_locked(struct nfsd4_session *ses)
+void nfsd4_put_session(struct nfsd4_session *ses)
 {
-	kref_put(&ses->se_ref, free_session);
+	atomic_dec(&ses->se_ref);
 }
 
-static void nfsd4_get_session(struct nfsd4_session *ses)
+static bool is_session_dead(struct nfsd4_session *ses)
 {
-	kref_get(&ses->se_ref);
+	return ses->se_flags & NFS4_SESSION_DEAD;
+}
+
+static __be32 mark_session_dead_locked(struct nfsd4_session *ses)
+{
+	if (atomic_read(&ses->se_ref))
+		return nfserr_jukebox;
+	ses->se_flags |= NFS4_SESSION_DEAD;
+	return nfs_ok;
+}
+
+static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses)
+{
+	if (is_session_dead(ses))
+		return nfserr_badsession;
+	atomic_inc(&ses->se_ref);
+	return nfs_ok;
 }
 
 void
@@ -113,6 +129,90 @@ nfs4_unlock_state(void)
 	mutex_unlock(&client_mutex);
 }
 
+static bool is_client_expired(struct nfs4_client *clp)
+{
+	return clp->cl_time == 0;
+}
+
+static __be32 mark_client_expired_locked(struct nfs4_client *clp)
+{
+	if (atomic_read(&clp->cl_refcount))
+		return nfserr_jukebox;
+	clp->cl_time = 0;
+	return nfs_ok;
+}
+
+static __be32 mark_client_expired(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	__be32 ret;
+
+	spin_lock(&nn->client_lock);
+	ret = mark_client_expired_locked(clp);
+	spin_unlock(&nn->client_lock);
+	return ret;
+}
+
+static __be32 get_client_locked(struct nfs4_client *clp)
+{
+	if (is_client_expired(clp))
+		return nfserr_expired;
+	atomic_inc(&clp->cl_refcount);
+	return nfs_ok;
+}
+
+/* must be called under the client_lock */
+static inline void
+renew_client_locked(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (is_client_expired(clp)) {
+		WARN_ON(1);
+		printk("%s: client (clientid %08x/%08x) already expired\n",
+			__func__,
+			clp->cl_clientid.cl_boot,
+			clp->cl_clientid.cl_id);
+		return;
+	}
+
+	dprintk("renewing client (clientid %08x/%08x)\n",
+			clp->cl_clientid.cl_boot,
+			clp->cl_clientid.cl_id);
+	list_move_tail(&clp->cl_lru, &nn->client_lru);
+	clp->cl_time = get_seconds();
+}
+
+static inline void
+renew_client(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	spin_lock(&nn->client_lock);
+	renew_client_locked(clp);
+	spin_unlock(&nn->client_lock);
+}
+
+static void put_client_renew_locked(struct nfs4_client *clp)
+{
+	if (!atomic_dec_and_test(&clp->cl_refcount))
+		return;
+	if (!is_client_expired(clp))
+		renew_client_locked(clp);
+}
+
+void put_client_renew(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))
+		return;
+	if (!is_client_expired(clp))
+		renew_client_locked(clp);
+	spin_unlock(&nn->client_lock);
+}
+
+
 static inline u32
 opaque_hashval(const void *ptr, int nbytes)
 {
@@ -126,8 +226,6 @@ opaque_hashval(const void *ptr, int nbytes)
 	return x;
 }
 
-static struct list_head del_recall_lru;
-
 static void nfsd4_free_file(struct nfs4_file *f)
 {
 	kmem_cache_free(file_slab, f);
@@ -137,7 +235,7 @@ static inline void
 put_nfs4_file(struct nfs4_file *fi)
 {
 	if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
-		list_del(&fi->fi_hash);
+		hlist_del(&fi->fi_hash);
 		spin_unlock(&recall_lock);
 		iput(fi->fi_inode);
 		nfsd4_free_file(fi);
@@ -151,7 +249,7 @@ get_nfs4_file(struct nfs4_file *fi)
 }
 
 static int num_delegations;
-unsigned int max_delegations;
+unsigned long max_delegations;
 
 /*
  * Open owner state (share locks)
@@ -181,7 +279,7 @@ static unsigned int file_hashval(struct inode *ino)
 	return hash_ptr(ino, FILE_HASH_BITS);
 }
 
-static struct list_head file_hashtbl[FILE_HASH_SIZE];
+static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
 
 static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
 {
@@ -210,13 +308,7 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
 {
 	if (atomic_dec_and_test(&fp->fi_access[oflag])) {
 		nfs4_file_put_fd(fp, oflag);
-		/*
-		 * It's also safe to get rid of the RDWR open *if*
-		 * we no longer have need of the other kind of access
-		 * or if we already have the other kind of open:
-		 */
-		if (fp->fi_fds[1-oflag]
-			|| atomic_read(&fp->fi_access[1 - oflag]) == 0)
+		if (atomic_read(&fp->fi_access[1 - oflag]) == 0)
 			nfs4_file_put_fd(fp, O_RDWR);
 	}
 }
@@ -230,21 +322,27 @@ static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
 		__nfs4_file_put_access(fp, oflag);
 }
 
-static inline int get_new_stid(struct nfs4_stid *stid)
+static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct
+kmem_cache *slab)
 {
-	static int min_stateid = 0;
-	struct idr *stateids = &stid->sc_client->cl_stateids;
-	int new_stid;
-	int error;
+	struct idr *stateids = &cl->cl_stateids;
+	struct nfs4_stid *stid;
+	int new_id;
+
+	stid = kmem_cache_alloc(slab, GFP_KERNEL);
+	if (!stid)
+		return NULL;
+
+	new_id = idr_alloc_cyclic(stateids, stid, 0, 0, GFP_KERNEL);
+	if (new_id < 0)
+		goto out_free;
+	stid->sc_client = cl;
+	stid->sc_type = 0;
+	stid->sc_stateid.si_opaque.so_id = new_id;
+	stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
+	/* Will be incremented before return to client: */
+	stid->sc_stateid.si_generation = 0;
 
-	error = idr_get_new_above(stateids, stid, min_stateid, &new_stid);
-	/*
-	 * Note: the necessary preallocation was done in
-	 * nfs4_alloc_stateid().  The idr code caps the number of
-	 * preallocations that can exist at a time, but the state lock
-	 * prevents anyone from using ours before we get here:
-	 */
-	WARN_ON_ONCE(error);
 	/*
 	 * It shouldn't be a problem to reuse an opaque stateid value.
 	 * I don't think it is for 4.1.  But with 4.0 I worry that, for
@@ -254,40 +352,10 @@ static inline int get_new_stid(struct nfs4_stid *stid)
 	 * amount of time until an id is reused, by ensuring they always
 	 * "increase" (mod INT_MAX):
 	 */
-
-	min_stateid = new_stid+1;
-	if (min_stateid == INT_MAX)
-		min_stateid = 0;
-	return new_stid;
-}
-
-static void init_stid(struct nfs4_stid *stid, struct nfs4_client *cl, unsigned char type)
-{
-	stateid_t *s = &stid->sc_stateid;
-	int new_id;
-
-	stid->sc_type = type;
-	stid->sc_client = cl;
-	s->si_opaque.so_clid = cl->cl_clientid;
-	new_id = get_new_stid(stid);
-	s->si_opaque.so_id = (u32)new_id;
-	/* Will be incremented before return to client: */
-	s->si_generation = 0;
-}
-
-static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab)
-{
-	struct idr *stateids = &cl->cl_stateids;
-
-	if (!idr_pre_get(stateids, GFP_KERNEL))
-		return NULL;
-	/*
-	 * Note: if we fail here (or any time between now and the time
-	 * we actually get the new idr), we won't need to undo the idr
-	 * preallocation, since the idr code caps the number of
-	 * preallocated entries.
-	 */
-	return kmem_cache_alloc(slab, GFP_KERNEL);
+	return stid;
+out_free:
+	kmem_cache_free(slab, stid);
+	return NULL;
 }
 
 static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
@@ -316,7 +384,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
 	dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
 	if (dp == NULL)
 		return dp;
-	init_stid(&dp->dl_stid, clp, NFS4_DELEG_STID);
+	dp->dl_stid.sc_type = NFS4_DELEG_STID;
 	/*
 	 * delegation seqid's are never incremented.  The 4.1 special
 	 * meaning of seqid 0 isn't meaningful, really, but let's avoid
@@ -337,12 +405,17 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
 	return dp;
 }
 
+static void remove_stid(struct nfs4_stid *s)
+{
+	struct idr *stateids = &s->sc_client->cl_stateids;
+
+	idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
+}
+
 void
 nfs4_put_delegation(struct nfs4_delegation *dp)
 {
 	if (atomic_dec_and_test(&dp->dl_count)) {
-		dprintk("NFSD: freeing dp %p\n",dp);
-		put_nfs4_file(dp->dl_file);
 		kmem_cache_free(deleg_slab, dp);
 		num_delegations--;
 	}
@@ -360,25 +433,52 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
 
 static void unhash_stid(struct nfs4_stid *s)
 {
-	struct idr *stateids = &s->sc_client->cl_stateids;
-
-	idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
+	s->sc_type = 0;
 }
 
 /* Called under the state lock. */
 static void
 unhash_delegation(struct nfs4_delegation *dp)
 {
-	unhash_stid(&dp->dl_stid);
 	list_del_init(&dp->dl_perclnt);
 	spin_lock(&recall_lock);
 	list_del_init(&dp->dl_perfile);
 	list_del_init(&dp->dl_recall_lru);
 	spin_unlock(&recall_lock);
 	nfs4_put_deleg_lease(dp->dl_file);
+	put_nfs4_file(dp->dl_file);
+	dp->dl_file = NULL;
+}
+
+
+
+static void destroy_revoked_delegation(struct nfs4_delegation *dp)
+{
+	list_del_init(&dp->dl_recall_lru);
+	remove_stid(&dp->dl_stid);
 	nfs4_put_delegation(dp);
 }
 
+static void destroy_delegation(struct nfs4_delegation *dp)
+{
+	unhash_delegation(dp);
+	remove_stid(&dp->dl_stid);
+	nfs4_put_delegation(dp);
+}
+
+static void revoke_delegation(struct nfs4_delegation *dp)
+{
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
+
+	if (clp->cl_minorversion == 0)
+		destroy_delegation(dp);
+	else {
+		unhash_delegation(dp);
+		dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
+		list_add(&dp->dl_recall_lru, &clp->cl_revoked);
+	}
+}
+
 /* 
  * SETCLIENTID state 
  */
@@ -519,6 +619,7 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp)
 
 static void free_generic_stateid(struct nfs4_ol_stateid *stp)
 {
+	remove_stid(&stp->st_stid);
 	kmem_cache_free(stateid_slab, stp);
 }
 
@@ -635,6 +736,28 @@ dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
 }
 #endif
 
+/*
+ * Bump the seqid on cstate->replay_owner, and clear replay_owner if it
+ * won't be used for replay.
+ */
+void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr)
+{
+	struct nfs4_stateowner *so = cstate->replay_owner;
+
+	if (nfserr == nfserr_replay_me)
+		return;
+
+	if (!seqid_mutating_err(ntohl(nfserr))) {
+		cstate->replay_owner = NULL;
+		return;
+	}
+	if (!so)
+		return;
+	if (so->so_is_open_owner)
+		release_last_closed_stateid(openowner(so));
+	so->so_seqid++;
+	return;
+}
 
 static void
 gen_sessionid(struct nfsd4_session *ses)
@@ -675,17 +798,15 @@ free_session_slots(struct nfsd4_session *ses)
  * We don't actually need to cache the rpc and session headers, so we
  * can allocate a little less for each slot:
  */
-static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
-{
-	return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
-}
-
-static int nfsd4_sanitize_slot_size(u32 size)
+static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca)
 {
-	size -= NFSD_MIN_HDR_SEQ_SZ; /* We don't cache the rpc header */
-	size = min_t(u32, size, NFSD_SLOT_CACHE_SIZE);
+	u32 size;
 
-	return size;
+	if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ)
+		size = 0;
+	else
+		size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+	return size + sizeof(struct nfsd4_slot);
 }
 
 /*
@@ -693,15 +814,15 @@ static int nfsd4_sanitize_slot_size(u32 size)
  * re-negotiate active sessions and reduce their slot usage to make
  * room for new connections. For now we just fail the create session.
  */
-static int nfsd4_get_drc_mem(int slotsize, u32 num)
+static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca)
 {
+	u32 slotsize = slot_bytes(ca);
+	u32 num = ca->maxreqs;
 	int avail;
 
-	num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
-
 	spin_lock(&nfsd_drc_lock);
-	avail = min_t(int, NFSD_MAX_MEM_PER_SESSION,
-			nfsd_drc_max_mem - nfsd_drc_mem_used);
+	avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
+		    nfsd_drc_max_mem - nfsd_drc_mem_used);
 	num = min_t(int, num, avail / slotsize);
 	nfsd_drc_mem_used += num * slotsize;
 	spin_unlock(&nfsd_drc_lock);
@@ -709,15 +830,19 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num)
 	return num;
 }
 
-static void nfsd4_put_drc_mem(int slotsize, int num)
+static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca)
 {
+	int slotsize = slot_bytes(ca);
+
 	spin_lock(&nfsd_drc_lock);
-	nfsd_drc_mem_used -= slotsize * num;
+	nfsd_drc_mem_used -= slotsize * ca->maxreqs;
 	spin_unlock(&nfsd_drc_lock);
 }
 
-static struct nfsd4_session *__alloc_session(int slotsize, int numslots)
+static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs)
 {
+	int numslots = attrs->maxreqs;
+	int slotsize = slot_bytes(attrs);
 	struct nfsd4_session *new;
 	int mem, i;
 
@@ -730,8 +855,7 @@ static struct nfsd4_session *__alloc_session(int slotsize, int numslots)
 		return NULL;
 	/* allocate each struct nfsd4_slot and data cache in one piece */
 	for (i = 0; i < numslots; i++) {
-		mem = sizeof(struct nfsd4_slot) + slotsize;
-		new->se_slots[i] = kzalloc(mem, GFP_KERNEL);
+		new->se_slots[i] = kzalloc(slotsize, GFP_KERNEL);
 		if (!new->se_slots[i])
 			goto out_free;
 	}
@@ -743,21 +867,6 @@ out_free:
 	return NULL;
 }
 
-static void init_forechannel_attrs(struct nfsd4_channel_attrs *new,
-				   struct nfsd4_channel_attrs *req,
-				   int numslots, int slotsize,
-				   struct nfsd_net *nn)
-{
-	u32 maxrpc = nn->nfsd_serv->sv_max_mesg;
-
-	new->maxreqs = numslots;
-	new->maxresp_cached = min_t(u32, req->maxresp_cached,
-					slotsize + NFSD_MIN_HDR_SEQ_SZ);
-	new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
-	new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
-	new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
-}
-
 static void free_conn(struct nfsd4_conn *c)
 {
 	svc_xprt_put(c->cn_xprt);
@@ -774,8 +883,8 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
 		list_del(&c->cn_persession);
 		free_conn(c);
 	}
-	spin_unlock(&clp->cl_lock);
 	nfsd4_probe_callback(clp);
+	spin_unlock(&clp->cl_lock);
 }
 
 static struct nfsd4_conn *alloc_conn(struct svc_rqst *rqstp, u32 flags)
@@ -859,59 +968,20 @@ static void nfsd4_del_conns(struct nfsd4_session *s)
 
 static void __free_session(struct nfsd4_session *ses)
 {
-	nfsd4_put_drc_mem(slot_bytes(&ses->se_fchannel), ses->se_fchannel.maxreqs);
 	free_session_slots(ses);
 	kfree(ses);
 }
 
-static void free_session(struct kref *kref)
+static void free_session(struct nfsd4_session *ses)
 {
-	struct nfsd4_session *ses;
-	struct nfsd_net *nn;
-
-	ses = container_of(kref, struct nfsd4_session, se_ref);
-	nn = net_generic(ses->se_client->net, nfsd_net_id);
+	struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
 
 	lockdep_assert_held(&nn->client_lock);
 	nfsd4_del_conns(ses);
+	nfsd4_put_drc_mem(&ses->se_fchannel);
 	__free_session(ses);
 }
 
-void nfsd4_put_session(struct nfsd4_session *ses)
-{
-	struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
-
-	spin_lock(&nn->client_lock);
-	nfsd4_put_session_locked(ses);
-	spin_unlock(&nn->client_lock);
-}
-
-static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan,
-					   struct nfsd_net *nn)
-{
-	struct nfsd4_session *new;
-	int numslots, slotsize;
-	/*
-	 * Note decreasing slot size below client's request may
-	 * make it difficult for client to function correctly, whereas
-	 * decreasing the number of slots will (just?) affect
-	 * performance.  When short on memory we therefore prefer to
-	 * decrease number of slots instead of their size.
-	 */
-	slotsize = nfsd4_sanitize_slot_size(fchan->maxresp_cached);
-	numslots = nfsd4_get_drc_mem(slotsize, fchan->maxreqs);
-	if (numslots < 1)
-		return NULL;
-
-	new = __alloc_session(slotsize, numslots);
-	if (!new) {
-		nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
-		return NULL;
-	}
-	init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn);
-	return new;
-}
-
 static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
 {
 	int idx;
@@ -926,7 +996,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
 	new->se_flags = cses->flags;
 	new->se_cb_prog = cses->callback_prog;
 	new->se_cb_sec = cses->cb_sec;
-	kref_init(&new->se_ref);
+	atomic_set(&new->se_ref, 0);
 	idx = hash_sessionid(&new->se_sessionid);
 	spin_lock(&nn->client_lock);
 	list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
@@ -934,7 +1004,8 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
 	list_add(&new->se_perclnt, &clp->cl_sessions);
 	spin_unlock(&clp->cl_lock);
 	spin_unlock(&nn->client_lock);
-
+	memcpy(&new->se_fchannel, &cses->fore_channel,
+			sizeof(struct nfsd4_channel_attrs));
 	if (cses->flags & SESSION4_BACK_CHAN) {
 		struct sockaddr *sa = svc_addr(rqstp);
 		/*
@@ -981,38 +1052,6 @@ unhash_session(struct nfsd4_session *ses)
 	spin_unlock(&ses->se_client->cl_lock);
 }
 
-/* must be called under the client_lock */
-static inline void
-renew_client_locked(struct nfs4_client *clp)
-{
-	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-
-	if (is_client_expired(clp)) {
-		WARN_ON(1);
-		printk("%s: client (clientid %08x/%08x) already expired\n",
-			__func__,
-			clp->cl_clientid.cl_boot,
-			clp->cl_clientid.cl_id);
-		return;
-	}
-
-	dprintk("renewing client (clientid %08x/%08x)\n", 
-			clp->cl_clientid.cl_boot, 
-			clp->cl_clientid.cl_id);
-	list_move_tail(&clp->cl_lru, &nn->client_lru);
-	clp->cl_time = get_seconds();
-}
-
-static inline void
-renew_client(struct nfs4_client *clp)
-{
-	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-
-	spin_lock(&nn->client_lock);
-	renew_client_locked(clp);
-	spin_unlock(&nn->client_lock);
-}
-
 /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
 static int
 STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
@@ -1048,7 +1087,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 static inline void
 free_client(struct nfs4_client *clp)
 {
-	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id);
 
 	lockdep_assert_held(&nn->client_lock);
 	while (!list_empty(&clp->cl_sessions)) {
@@ -1056,36 +1095,21 @@ free_client(struct nfs4_client *clp)
 		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
 				se_perclnt);
 		list_del(&ses->se_perclnt);
-		nfsd4_put_session_locked(ses);
+		WARN_ON_ONCE(atomic_read(&ses->se_ref));
+		free_session(ses);
 	}
 	free_svc_cred(&clp->cl_cred);
 	kfree(clp->cl_name.data);
+	idr_destroy(&clp->cl_stateids);
 	kfree(clp);
 }
 
-void
-release_session_client(struct nfsd4_session *session)
-{
-	struct nfs4_client *clp = session->se_client;
-	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-
-	if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))
-		return;
-	if (is_client_expired(clp)) {
-		free_client(clp);
-		session->se_client = NULL;
-	} else
-		renew_client_locked(clp);
-	spin_unlock(&nn->client_lock);
-}
-
 /* must be called under the client_lock */
 static inline void
 unhash_client_locked(struct nfs4_client *clp)
 {
 	struct nfsd4_session *ses;
 
-	mark_client_expired(clp);
 	list_del(&clp->cl_lru);
 	spin_lock(&clp->cl_lock);
 	list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
@@ -1111,7 +1135,7 @@ destroy_client(struct nfs4_client *clp)
 	spin_unlock(&recall_lock);
 	while (!list_empty(&reaplist)) {
 		dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
-		unhash_delegation(dp);
+		destroy_delegation(dp);
 	}
 	while (!list_empty(&clp->cl_openowners)) {
 		oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
@@ -1127,8 +1151,8 @@ destroy_client(struct nfs4_client *clp)
 		rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
 	spin_lock(&nn->client_lock);
 	unhash_client_locked(clp);
-	if (atomic_read(&clp->cl_refcount) == 0)
-		free_client(clp);
+	WARN_ON_ONCE(atomic_read(&clp->cl_refcount));
+	free_client(clp);
 	spin_unlock(&nn->client_lock);
 }
 
@@ -1202,7 +1226,7 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2)
 	if (g1->ngroups != g2->ngroups)
 		return false;
 	for (i=0; i<g1->ngroups; i++)
-		if (GROUP_AT(g1, i) != GROUP_AT(g2, i))
+		if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i)))
 			return false;
 	return true;
 }
@@ -1227,8 +1251,8 @@ static bool
 same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
 {
 	if ((is_gss_cred(cr1) != is_gss_cred(cr2))
-		|| (cr1->cr_uid != cr2->cr_uid)
-		|| (cr1->cr_gid != cr2->cr_gid)
+		|| (!uid_eq(cr1->cr_uid, cr2->cr_uid))
+		|| (!gid_eq(cr1->cr_gid, cr2->cr_gid))
 		|| !groups_equal(cr1->cr_group_info, cr2->cr_group_info))
 		return false;
 	if (cr1->cr_principal == cr2->cr_principal)
@@ -1258,7 +1282,12 @@ static void gen_confirm(struct nfs4_client *clp)
 
 static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
 {
-	return idr_find(&cl->cl_stateids, t->si_opaque.so_id);
+	struct nfs4_stid *ret;
+
+	ret = idr_find(&cl->cl_stateids, t->si_opaque.so_id);
+	if (!ret || !ret->sc_type)
+		return NULL;
+	return ret;
 }
 
 static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
@@ -1302,6 +1331,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_lru);
 	INIT_LIST_HEAD(&clp->cl_callbacks);
+	INIT_LIST_HEAD(&clp->cl_revoked);
 	spin_lock_init(&clp->cl_lock);
 	nfsd4_init_callback(&clp->cl_cb_null);
 	clp->cl_time = get_seconds();
@@ -1383,12 +1413,12 @@ move_to_confirmed(struct nfs4_client *clp)
 }
 
 static struct nfs4_client *
-find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
+find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions)
 {
 	struct nfs4_client *clp;
 	unsigned int idhashval = clientid_hashval(clid->cl_id);
 
-	list_for_each_entry(clp, &nn->conf_id_hashtbl[idhashval], cl_idhash) {
+	list_for_each_entry(clp, &tbl[idhashval], cl_idhash) {
 		if (same_clid(&clp->cl_clientid, clid)) {
 			if ((bool)clp->cl_minorversion != sessions)
 				return NULL;
@@ -1400,19 +1430,19 @@ find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
 }
 
 static struct nfs4_client *
+find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
+{
+	struct list_head *tbl = nn->conf_id_hashtbl;
+
+	return find_client_in_id_table(tbl, clid, sessions);
+}
+
+static struct nfs4_client *
 find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
 {
-	struct nfs4_client *clp;
-	unsigned int idhashval = clientid_hashval(clid->cl_id);
+	struct list_head *tbl = nn->unconf_id_hashtbl;
 
-	list_for_each_entry(clp, &nn->unconf_id_hashtbl[idhashval], cl_idhash) {
-		if (same_clid(&clp->cl_clientid, clid)) {
-			if ((bool)clp->cl_minorversion != sessions)
-				return NULL;
-			return clp;
-		}
-	}
-	return NULL;
+	return find_client_in_id_table(tbl, clid, sessions);
 }
 
 static bool clp_used_exchangeid(struct nfs4_client *clp)
@@ -1616,6 +1646,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 	default:				/* checked by xdr code */
 		WARN_ON_ONCE(1);
 	case SP4_SSV:
+		return nfserr_encr_alg_unsupp;
 	case SP4_MACH_CRED:
 		return nfserr_serverfault;	/* no excuse :-/ */
 	}
@@ -1757,10 +1788,55 @@ nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
 				/* seqid, slotID, slotID, slotID, status */ \
 			5 ) * sizeof(__be32))
 
-static bool check_forechannel_attrs(struct nfsd4_channel_attrs fchannel)
+static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn)
+{
+	u32 maxrpc = nn->nfsd_serv->sv_max_mesg;
+
+	if (ca->maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ)
+		return nfserr_toosmall;
+	if (ca->maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ)
+		return nfserr_toosmall;
+	ca->headerpadsz = 0;
+	ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc);
+	ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc);
+	ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND);
+	ca->maxresp_cached = min_t(u32, ca->maxresp_cached,
+			NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ);
+	ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION);
+	/*
+	 * Note decreasing slot size below client's request may make it
+	 * difficult for client to function correctly, whereas
+	 * decreasing the number of slots will (just?) affect
+	 * performance.  When short on memory we therefore prefer to
+	 * decrease number of slots instead of their size.  Clients that
+	 * request larger slots than they need will get poor results:
+	 */
+	ca->maxreqs = nfsd4_get_drc_mem(ca);
+	if (!ca->maxreqs)
+		return nfserr_jukebox;
+
+	return nfs_ok;
+}
+
+static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
 {
-	return fchannel.maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ
-		|| fchannel.maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ;
+	ca->headerpadsz = 0;
+
+	/*
+	 * These RPC_MAX_HEADER macros are overkill, especially since we
+	 * don't even do gss on the backchannel yet.  But this is still
+	 * less than 1k.  Tighten up this estimate in the unlikely event
+	 * it turns out to be a problem for some client:
+	 */
+	if (ca->maxreq_sz < NFS4_enc_cb_recall_sz + RPC_MAX_HEADER_WITH_AUTH)
+		return nfserr_toosmall;
+	if (ca->maxresp_sz < NFS4_dec_cb_recall_sz + RPC_MAX_REPHEADER_WITH_AUTH)
+		return nfserr_toosmall;
+	ca->maxresp_cached = 0;
+	if (ca->maxops < 2)
+		return nfserr_toosmall;
+
+	return nfs_ok;
 }
 
 __be32
@@ -1778,12 +1854,16 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 
 	if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
 		return nfserr_inval;
-	if (check_forechannel_attrs(cr_ses->fore_channel))
-		return nfserr_toosmall;
-	new = alloc_session(&cr_ses->fore_channel, nn);
-	if (!new)
-		return nfserr_jukebox;
+	status = check_forechannel_attrs(&cr_ses->fore_channel, nn);
+	if (status)
+		return status;
+	status = check_backchannel_attrs(&cr_ses->back_channel);
+	if (status)
+		return status;
 	status = nfserr_jukebox;
+	new = alloc_session(&cr_ses->fore_channel);
+	if (!new)
+		goto out_release_drc_mem;
 	conn = alloc_conn_from_crses(rqstp, cr_ses);
 	if (!conn)
 		goto out_free_session;
@@ -1791,6 +1871,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	nfs4_lock_state();
 	unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
 	conf = find_confirmed_client(&cr_ses->clientid, true, nn);
+	WARN_ON_ONCE(conf && unconf);
 
 	if (conf) {
 		cs_slot = &conf->cl_cs_slot;
@@ -1817,8 +1898,12 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 			goto out_free_conn;
 		}
 		old = find_confirmed_client_by_name(&unconf->cl_name, nn);
-		if (old)
+		if (old) {
+			status = mark_client_expired(old);
+			if (status)
+				goto out_free_conn;
 			expire_client(old);
+		}
 		move_to_confirmed(unconf);
 		conf = unconf;
 	} else {
@@ -1837,22 +1922,21 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 
 	memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
 	       NFS4_MAX_SESSIONID_LEN);
-	memcpy(&cr_ses->fore_channel, &new->se_fchannel,
-		sizeof(struct nfsd4_channel_attrs));
 	cs_slot->sl_seqid++;
 	cr_ses->seqid = cs_slot->sl_seqid;
 
 	/* cache solo and embedded create sessions under the state lock */
 	nfsd4_cache_create_session(cr_ses, cs_slot, status);
-out:
 	nfs4_unlock_state();
-	dprintk("%s returns %d\n", __func__, ntohl(status));
 	return status;
 out_free_conn:
+	nfs4_unlock_state();
 	free_conn(conn);
 out_free_session:
 	__free_session(new);
-	goto out;
+out_release_drc_mem:
+	nfsd4_put_drc_mem(&cr_ses->fore_channel);
+	return status;
 }
 
 static __be32 nfsd4_map_bcts_dir(u32 *dir)
@@ -1890,30 +1974,30 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
 {
 	__be32 status;
 	struct nfsd4_conn *conn;
+	struct nfsd4_session *session;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
 	if (!nfsd4_last_compound_op(rqstp))
 		return nfserr_not_only_op;
+	nfs4_lock_state();
 	spin_lock(&nn->client_lock);
-	cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp));
-	/* Sorta weird: we only need the refcnt'ing because new_conn acquires
-	 * client_lock iself: */
-	if (cstate->session) {
-		nfsd4_get_session(cstate->session);
-		atomic_inc(&cstate->session->se_client->cl_refcount);
-	}
+	session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp));
 	spin_unlock(&nn->client_lock);
-	if (!cstate->session)
-		return nfserr_badsession;
-
+	status = nfserr_badsession;
+	if (!session)
+		goto out;
 	status = nfsd4_map_bcts_dir(&bcts->dir);
 	if (status)
-		return status;
+		goto out;
 	conn = alloc_conn(rqstp, bcts->dir);
+	status = nfserr_jukebox;
 	if (!conn)
-		return nfserr_jukebox;
-	nfsd4_init_conn(rqstp, conn, cstate->session);
-	return nfs_ok;
+		goto out;
+	nfsd4_init_conn(rqstp, conn, session);
+	status = nfs_ok;
+out:
+	nfs4_unlock_state();
+	return status;
 }
 
 static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
@@ -1929,42 +2013,36 @@ nfsd4_destroy_session(struct svc_rqst *r,
 		      struct nfsd4_destroy_session *sessionid)
 {
 	struct nfsd4_session *ses;
-	__be32 status = nfserr_badsession;
+	__be32 status;
 	struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
 
-	/* Notes:
-	 * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid
-	 * - Should we return nfserr_back_chan_busy if waiting for
-	 *   callbacks on to-be-destroyed session?
-	 * - Do we need to clear any callback info from previous session?
-	 */
-
+	nfs4_lock_state();
+	status = nfserr_not_only_op;
 	if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
 		if (!nfsd4_last_compound_op(r))
-			return nfserr_not_only_op;
+			goto out;
 	}
 	dump_sessionid(__func__, &sessionid->sessionid);
 	spin_lock(&nn->client_lock);
 	ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r));
-	if (!ses) {
-		spin_unlock(&nn->client_lock);
-		goto out;
-	}
-
+	status = nfserr_badsession;
+	if (!ses)
+		goto out_client_lock;
+	status = mark_session_dead_locked(ses);
+	if (status)
+		goto out_client_lock;
 	unhash_session(ses);
 	spin_unlock(&nn->client_lock);
 
-	nfs4_lock_state();
 	nfsd4_probe_callback_sync(ses->se_client);
-	nfs4_unlock_state();
 
 	spin_lock(&nn->client_lock);
-	nfsd4_del_conns(ses);
-	nfsd4_put_session_locked(ses);
-	spin_unlock(&nn->client_lock);
+	free_session(ses);
 	status = nfs_ok;
+out_client_lock:
+	spin_unlock(&nn->client_lock);
 out:
-	dprintk("%s returns %d\n", __func__, ntohl(status));
+	nfs4_unlock_state();
 	return status;
 }
 
@@ -2024,6 +2102,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 {
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	struct nfsd4_session *session;
+	struct nfs4_client *clp;
 	struct nfsd4_slot *slot;
 	struct nfsd4_conn *conn;
 	__be32 status;
@@ -2044,19 +2123,26 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	status = nfserr_badsession;
 	session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp));
 	if (!session)
-		goto out;
+		goto out_no_session;
+	clp = session->se_client;
+	status = get_client_locked(clp);
+	if (status)
+		goto out_no_session;
+	status = nfsd4_get_session_locked(session);
+	if (status)
+		goto out_put_client;
 
 	status = nfserr_too_many_ops;
 	if (nfsd4_session_too_many_ops(rqstp, session))
-		goto out;
+		goto out_put_session;
 
 	status = nfserr_req_too_big;
 	if (nfsd4_request_too_big(rqstp, session))
-		goto out;
+		goto out_put_session;
 
 	status = nfserr_badslot;
 	if (seq->slotid >= session->se_fchannel.maxreqs)
-		goto out;
+		goto out_put_session;
 
 	slot = session->se_slots[seq->slotid];
 	dprintk("%s: slotid %d\n", __func__, seq->slotid);
@@ -2071,7 +2157,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	if (status == nfserr_replay_cache) {
 		status = nfserr_seq_misordered;
 		if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED))
-			goto out;
+			goto out_put_session;
 		cstate->slot = slot;
 		cstate->session = session;
 		/* Return the cached reply status and set cstate->status
@@ -2081,7 +2167,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 		goto out;
 	}
 	if (status)
-		goto out;
+		goto out_put_session;
 
 	nfsd4_sequence_check_conn(conn, session);
 	conn = NULL;
@@ -2098,27 +2184,27 @@ nfsd4_sequence(struct svc_rqst *rqstp,
 	cstate->session = session;
 
 out:
-	/* Hold a session reference until done processing the compound. */
-	if (cstate->session) {
-		struct nfs4_client *clp = session->se_client;
-
-		nfsd4_get_session(cstate->session);
-		atomic_inc(&clp->cl_refcount);
-		switch (clp->cl_cb_state) {
-		case NFSD4_CB_DOWN:
-			seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
-			break;
-		case NFSD4_CB_FAULT:
-			seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
-			break;
-		default:
-			seq->status_flags = 0;
-		}
+	switch (clp->cl_cb_state) {
+	case NFSD4_CB_DOWN:
+		seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
+		break;
+	case NFSD4_CB_FAULT:
+		seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
+		break;
+	default:
+		seq->status_flags = 0;
 	}
+	if (!list_empty(&clp->cl_revoked))
+		seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
+out_no_session:
 	kfree(conn);
 	spin_unlock(&nn->client_lock);
-	dprintk("%s: return %d\n", __func__, ntohl(status));
 	return status;
+out_put_session:
+	nfsd4_put_session(session);
+out_put_client:
+	put_client_renew_locked(clp);
+	goto out_no_session;
 }
 
 __be32
@@ -2131,17 +2217,12 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
 	nfs4_lock_state();
 	unconf = find_unconfirmed_client(&dc->clientid, true, nn);
 	conf = find_confirmed_client(&dc->clientid, true, nn);
+	WARN_ON_ONCE(conf && unconf);
 
 	if (conf) {
 		clp = conf;
 
-		if (!is_client_expired(conf) && client_has_state(conf)) {
-			status = nfserr_clientid_busy;
-			goto out;
-		}
-
-		/* rfc5661 18.50.3 */
-		if (cstate->session && conf == cstate->session->se_client) {
+		if (client_has_state(conf)) {
 			status = nfserr_clientid_busy;
 			goto out;
 		}
@@ -2155,7 +2236,6 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
 	expire_client(clp);
 out:
 	nfs4_unlock_state();
-	dprintk("%s return %d\n", __func__, ntohl(status));
 	return status;
 }
 
@@ -2293,8 +2373,12 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		expire_client(unconf);
 	} else { /* case 3: normal case; new or rebooted client */
 		conf = find_confirmed_client_by_name(&unconf->cl_name, nn);
-		if (conf)
+		if (conf) {
+			status = mark_client_expired(conf);
+			if (status)
+				goto out;
 			expire_client(conf);
+		}
 		move_to_confirmed(unconf);
 		nfsd4_probe_callback(unconf);
 	}
@@ -2314,7 +2398,6 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
 	unsigned int hashval = file_hashval(ino);
 
 	atomic_set(&fp->fi_ref, 1);
-	INIT_LIST_HEAD(&fp->fi_hash);
 	INIT_LIST_HEAD(&fp->fi_stateids);
 	INIT_LIST_HEAD(&fp->fi_delegations);
 	fp->fi_inode = igrab(ino);
@@ -2323,7 +2406,7 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
 	memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
 	memset(fp->fi_access, 0, sizeof(fp->fi_access));
 	spin_lock(&recall_lock);
-	list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+	hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]);
 	spin_unlock(&recall_lock);
 }
 
@@ -2443,9 +2526,8 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
 
 static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
 	struct nfs4_openowner *oo = open->op_openowner;
-	struct nfs4_client *clp = oo->oo_owner.so_client;
 
-	init_stid(&stp->st_stid, clp, NFS4_OPEN_STID);
+	stp->st_stid.sc_type = NFS4_OPEN_STID;
 	INIT_LIST_HEAD(&stp->st_lockowners);
 	list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
 	list_add(&stp->st_perfile, &fp->fi_stateids);
@@ -2510,7 +2592,7 @@ find_file(struct inode *ino)
 	struct nfs4_file *fp;
 
 	spin_lock(&recall_lock);
-	list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
+	hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
 		if (fp->fi_inode == ino) {
 			get_nfs4_file(fp);
 			spin_unlock(&recall_lock);
@@ -2533,8 +2615,6 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
 	struct nfs4_ol_stateid *stp;
 	__be32 ret;
 
-	dprintk("NFSD: nfs4_share_conflict\n");
-
 	fp = find_file(ino);
 	if (!fp)
 		return nfs_ok;
@@ -2553,6 +2633,9 @@ out:
 
 static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 {
+	struct nfs4_client *clp = dp->dl_stid.sc_client;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
 	/* We're assuming the state code never drops its reference
 	 * without first removing the lease.  Since we're in this lease
 	 * callback (and since the lease code is serialized by the kernel
@@ -2560,15 +2643,15 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 	 * it's safe to take a reference: */
 	atomic_inc(&dp->dl_count);
 
-	list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
+	list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
 
-	/* only place dl_time is set. protected by lock_flocks*/
+	/* Only place dl_time is set; protected by i_lock: */
 	dp->dl_time = get_seconds();
 
 	nfsd4_cb_recall(dp);
 }
 
-/* Called from break_lease() with lock_flocks() held. */
+/* Called from break_lease() with i_lock held. */
 static void nfsd_break_deleg_cb(struct file_lock *fl)
 {
 	struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
@@ -2706,7 +2789,7 @@ static bool nfsd4_is_deleg_cur(struct nfsd4_open *open)
 }
 
 static __be32
-nfs4_check_deleg(struct nfs4_client *cl, struct nfs4_file *fp, struct nfsd4_open *open,
+nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
 		struct nfs4_delegation **dp)
 {
 	int flags;
@@ -3031,7 +3114,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	if (fp) {
 		if ((status = nfs4_check_open(fp, open, &stp)))
 			goto out;
-		status = nfs4_check_deleg(cl, fp, open, &dp);
+		status = nfs4_check_deleg(cl, open, &dp);
 		if (status)
 			goto out;
 	} else {
@@ -3209,13 +3292,12 @@ nfs4_laundromat(struct nfsd_net *nn)
 				clientid_val = t;
 			break;
 		}
-		if (atomic_read(&clp->cl_refcount)) {
+		if (mark_client_expired_locked(clp)) {
 			dprintk("NFSD: client in use (clientid %08x)\n",
 				clp->cl_clientid.cl_id);
 			continue;
 		}
-		unhash_client_locked(clp);
-		list_add(&clp->cl_lru, &reaplist);
+		list_move(&clp->cl_lru, &reaplist);
 	}
 	spin_unlock(&nn->client_lock);
 	list_for_each_safe(pos, next, &reaplist) {
@@ -3225,7 +3307,7 @@ nfs4_laundromat(struct nfsd_net *nn)
 		expire_client(clp);
 	}
 	spin_lock(&recall_lock);
-	list_for_each_safe(pos, next, &del_recall_lru) {
+	list_for_each_safe(pos, next, &nn->del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
 		if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn)
 			continue;
@@ -3240,7 +3322,7 @@ nfs4_laundromat(struct nfsd_net *nn)
 	spin_unlock(&recall_lock);
 	list_for_each_safe(pos, next, &reaplist) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
-		unhash_delegation(dp);
+		revoke_delegation(dp);
 	}
 	test_val = nn->nfsd4_lease;
 	list_for_each_safe(pos, next, &nn->close_lru) {
@@ -3283,16 +3365,6 @@ static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *s
 	return nfs_ok;
 }
 
-static int
-STALE_STATEID(stateid_t *stateid, struct nfsd_net *nn)
-{
-	if (stateid->si_opaque.so_clid.cl_boot == nn->boot_time)
-		return 0;
-	dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
-		STATEID_VAL(stateid));
-	return 1;
-}
-
 static inline int
 access_permit_read(struct nfs4_ol_stateid *stp)
 {
@@ -3409,13 +3481,24 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
 	status = check_stateid_generation(stateid, &s->sc_stateid, 1);
 	if (status)
 		return status;
-	if (!(s->sc_type & (NFS4_OPEN_STID | NFS4_LOCK_STID)))
+	switch (s->sc_type) {
+	case NFS4_DELEG_STID:
+		return nfs_ok;
+	case NFS4_REVOKED_DELEG_STID:
+		return nfserr_deleg_revoked;
+	case NFS4_OPEN_STID:
+	case NFS4_LOCK_STID:
+		ols = openlockstateid(s);
+		if (ols->st_stateowner->so_is_open_owner
+	    			&& !(openowner(ols->st_stateowner)->oo_flags
+						& NFS4_OO_CONFIRMED))
+			return nfserr_bad_stateid;
 		return nfs_ok;
-	ols = openlockstateid(s);
-	if (ols->st_stateowner->so_is_open_owner
-	    && !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
+	default:
+		printk("unknown stateid type %x\n", s->sc_type);
+	case NFS4_CLOSED_STID:
 		return nfserr_bad_stateid;
-	return nfs_ok;
+	}
 }
 
 static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
@@ -3423,19 +3506,20 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
 				   struct nfsd_net *nn)
 {
 	struct nfs4_client *cl;
+	__be32 status;
 
 	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
 		return nfserr_bad_stateid;
-	if (STALE_STATEID(stateid, nn))
+	status = lookup_clientid(&stateid->si_opaque.so_clid, sessions,
+							nn, &cl);
+	if (status == nfserr_stale_clientid)
 		return nfserr_stale_stateid;
-	cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions, nn);
-	if (!cl)
-		return nfserr_expired;
+	if (status)
+		return status;
 	*s = find_stateid_by_type(cl, stateid, typemask);
 	if (!*s)
 		return nfserr_bad_stateid;
 	return nfs_ok;
-
 }
 
 /*
@@ -3545,6 +3629,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
 	stateid_t *stateid = &free_stateid->fr_stateid;
 	struct nfs4_stid *s;
+	struct nfs4_delegation *dp;
 	struct nfs4_client *cl = cstate->session->se_client;
 	__be32 ret = nfserr_bad_stateid;
 
@@ -3566,6 +3651,11 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		else
 			ret = nfserr_locks_held;
 		break;
+	case NFS4_REVOKED_DELEG_STID:
+		dp = delegstateid(s);
+		destroy_revoked_delegation(dp);
+		ret = nfs_ok;
+		break;
 	default:
 		ret = nfserr_bad_stateid;
 	}
@@ -3590,10 +3680,12 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
 	status = nfsd4_check_seqid(cstate, sop, seqid);
 	if (status)
 		return status;
-	if (stp->st_stid.sc_type == NFS4_CLOSED_STID)
+	if (stp->st_stid.sc_type == NFS4_CLOSED_STID
+		|| stp->st_stid.sc_type == NFS4_REVOKED_DELEG_STID)
 		/*
 		 * "Closed" stateid's exist *only* to return
-		 * nfserr_replay_me from the previous step.
+		 * nfserr_replay_me from the previous step, and
+		 * revoked delegations are kept only for free_stateid.
 		 */
 		return nfserr_bad_stateid;
 	status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
@@ -3623,7 +3715,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
 	if (status)
 		return status;
 	*stpp = openlockstateid(s);
-	cstate->replay_owner = (*stpp)->st_stateowner;
+	if (!nfsd4_has_session(cstate))
+		cstate->replay_owner = (*stpp)->st_stateowner;
 
 	return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);
 }
@@ -3681,6 +3774,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfsd4_client_record_create(oo->oo_owner.so_client);
 	status = nfs_ok;
 out:
+	nfsd4_bump_seqid(cstate, status);
 	if (!cstate->replay_owner)
 		nfs4_unlock_state();
 	return status;
@@ -3764,31 +3858,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 	memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 	status = nfs_ok;
 out:
+	nfsd4_bump_seqid(cstate, status);
 	if (!cstate->replay_owner)
 		nfs4_unlock_state();
 	return status;
 }
 
-void nfsd4_purge_closed_stateid(struct nfs4_stateowner *so)
-{
-	struct nfs4_openowner *oo;
-	struct nfs4_ol_stateid *s;
-
-	if (!so->so_is_open_owner)
-		return;
-	oo = openowner(so);
-	s = oo->oo_last_closed_stid;
-	if (!s)
-		return;
-	if (!(oo->oo_flags & NFS4_OO_PURGE_CLOSE)) {
-		/* Release the last_closed_stid on the next seqid bump: */
-		oo->oo_flags |= NFS4_OO_PURGE_CLOSE;
-		return;
-	}
-	oo->oo_flags &= ~NFS4_OO_PURGE_CLOSE;
-	release_last_closed_stateid(oo);
-}
-
 static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 {
 	unhash_open_stateid(s);
@@ -3817,28 +3892,30 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 					&close->cl_stateid,
 					NFS4_OPEN_STID|NFS4_CLOSED_STID,
 					&stp, nn);
+	nfsd4_bump_seqid(cstate, status);
 	if (status)
 		goto out; 
 	oo = openowner(stp->st_stateowner);
-	status = nfs_ok;
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 
 	nfsd4_close_open_stateid(stp);
-	release_last_closed_stateid(oo);
-	oo->oo_last_closed_stid = stp;
+
+	if (cstate->minorversion) {
+		unhash_stid(&stp->st_stid);
+		free_generic_stateid(stp);
+	} else
+		oo->oo_last_closed_stid = stp;
 
 	if (list_empty(&oo->oo_owner.so_stateids)) {
-		if (cstate->minorversion) {
+		if (cstate->minorversion)
 			release_openowner(oo);
-			cstate->replay_owner = NULL;
-		} else {
+		else {
 			/*
 			 * In the 4.0 case we need to keep the owners around a
 			 * little while to handle CLOSE replay.
 			 */
-			if (list_empty(&oo->oo_owner.so_stateids))
-				move_to_close_lru(oo, SVC_NET(rqstp));
+			move_to_close_lru(oo, SVC_NET(rqstp));
 		}
 	}
 out:
@@ -3870,7 +3947,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		goto out;
 
-	unhash_delegation(dp);
+	destroy_delegation(dp);
 out:
 	nfs4_unlock_state();
 
@@ -4031,7 +4108,7 @@ alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct
 	stp = nfs4_alloc_stateid(clp);
 	if (stp == NULL)
 		return NULL;
-	init_stid(&stp->st_stid, clp, NFS4_LOCK_STID);
+	stp->st_stid.sc_type = NFS4_LOCK_STID;
 	list_add(&stp->st_perfile, &fp->fi_stateids);
 	list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
 	stp->st_stateowner = &lo->lo_owner;
@@ -4248,6 +4325,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 out:
 	if (status && new_state)
 		release_lockowner(lock_sop);
+	nfsd4_bump_seqid(cstate, status);
 	if (!cstate->replay_owner)
 		nfs4_unlock_state();
 	if (file_lock)
@@ -4357,6 +4435,7 @@ __be32
 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    struct nfsd4_locku *locku)
 {
+	struct nfs4_lockowner *lo;
 	struct nfs4_ol_stateid *stp;
 	struct file *filp = NULL;
 	struct file_lock *file_lock = NULL;
@@ -4389,9 +4468,10 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		status = nfserr_jukebox;
 		goto out;
 	}
+	lo = lockowner(stp->st_stateowner);
 	locks_init_lock(file_lock);
 	file_lock->fl_type = F_UNLCK;
-	file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
+	file_lock->fl_owner = (fl_owner_t)lo;
 	file_lock->fl_pid = current->tgid;
 	file_lock->fl_file = filp;
 	file_lock->fl_flags = FL_POSIX;
@@ -4402,21 +4482,21 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 						locku->lu_length);
 	nfs4_transform_lock_offset(file_lock);
 
-	/*
-	*  Try to unlock the file in the VFS.
-	*/
 	err = vfs_lock_file(filp, F_SETLK, file_lock, NULL);
 	if (err) {
 		dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
 		goto out_nfserr;
 	}
-	/*
-	* OK, unlock succeeded; the only thing left to do is update the stateid.
-	*/
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 
+	if (nfsd4_has_session(cstate) && !check_for_locks(stp->st_file, lo)) {
+		WARN_ON_ONCE(cstate->replay_owner);
+		release_lockowner(lo);
+	}
+
 out:
+	nfsd4_bump_seqid(cstate, status);
 	if (!cstate->replay_owner)
 		nfs4_unlock_state();
 	if (file_lock)
@@ -4440,7 +4520,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
 	struct inode *inode = filp->fi_inode;
 	int status = 0;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
 		if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
 			status = 1;
@@ -4448,7 +4528,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
 		}
 	}
 out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return status;
 }
 
@@ -4609,6 +4689,8 @@ nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn)
 
 u64 nfsd_forget_client(struct nfs4_client *clp, u64 max)
 {
+	if (mark_client_expired(clp))
+		return 0;
 	expire_client(clp);
 	return 1;
 }
@@ -4715,7 +4797,7 @@ u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max)
 	spin_unlock(&recall_lock);
 
 	list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
-		unhash_delegation(dp);
+		revoke_delegation(dp);
 
 	return count;
 }
@@ -4787,12 +4869,6 @@ struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_
 void
 nfs4_state_init(void)
 {
-	int i;
-
-	for (i = 0; i < FILE_HASH_SIZE; i++) {
-		INIT_LIST_HEAD(&file_hashtbl[i]);
-	}
-	INIT_LIST_HEAD(&del_recall_lru);
 }
 
 /*
@@ -4856,6 +4932,7 @@ static int nfs4_state_create_net(struct net *net)
 	nn->unconf_name_tree = RB_ROOT;
 	INIT_LIST_HEAD(&nn->client_lru);
 	INIT_LIST_HEAD(&nn->close_lru);
+	INIT_LIST_HEAD(&nn->del_recall_lru);
 	spin_lock_init(&nn->client_lock);
 
 	INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
@@ -4913,16 +4990,6 @@ nfs4_state_start_net(struct net *net)
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	int ret;
 
-	/*
-	 * FIXME: For now, we hang most of the pernet global stuff off of
-	 * init_net until nfsd is fully containerized. Eventually, we'll
-	 * need to pass a net pointer into this function, take a reference
-	 * to that instead and then do most of the rest of this on a per-net
-	 * basis.
-	 */
-	if (net != &init_net)
-		return -EINVAL;
-
 	ret = nfs4_state_create_net(net);
 	if (ret)
 		return ret;
@@ -4978,16 +5045,14 @@ nfs4_state_shutdown_net(struct net *net)
 
 	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
-	list_for_each_safe(pos, next, &del_recall_lru) {
+	list_for_each_safe(pos, next, &nn->del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
-		if (dp->dl_stid.sc_client->net != net)
-			continue;
 		list_move(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&recall_lock);
 	list_for_each_safe(pos, next, &reaplist) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
-		unhash_delegation(dp);
+		destroy_delegation(dp);
 	}
 
 	nfsd4_client_tracking_exit(net);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 0dc11586682..6cd86e0fe45 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -264,7 +264,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 		iattr->ia_valid |= ATTR_SIZE;
 	}
 	if (bmval[0] & FATTR4_WORD0_ACL) {
-		int nace;
+		u32 nace;
 		struct nfs4_ace *ace;
 
 		READ_BUF(4); len += 4;
@@ -293,13 +293,13 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 			ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
 			status = nfs_ok;
 			if (ace->whotype != NFS4_ACL_WHO_NAMED)
-				ace->who = 0;
+				;
 			else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
 				status = nfsd_map_name_to_gid(argp->rqstp,
-						buf, dummy32, &ace->who);
+						buf, dummy32, &ace->who_gid);
 			else
 				status = nfsd_map_name_to_uid(argp->rqstp,
-						buf, dummy32, &ace->who);
+						buf, dummy32, &ace->who_uid);
 			if (status)
 				return status;
 		}
@@ -344,10 +344,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 			   all 32 bits of 'nseconds'. */
 			READ_BUF(12);
 			len += 12;
-			READ32(dummy32);
-			if (dummy32)
-				return nfserr_inval;
-			READ32(iattr->ia_atime.tv_sec);
+			READ64(iattr->ia_atime.tv_sec);
 			READ32(iattr->ia_atime.tv_nsec);
 			if (iattr->ia_atime.tv_nsec >= (u32)1000000000)
 				return nfserr_inval;
@@ -370,10 +367,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 			   all 32 bits of 'nseconds'. */
 			READ_BUF(12);
 			len += 12;
-			READ32(dummy32);
-			if (dummy32)
-				return nfserr_inval;
-			READ32(iattr->ia_mtime.tv_sec);
+			READ64(iattr->ia_mtime.tv_sec);
 			READ32(iattr->ia_mtime.tv_nsec);
 			if (iattr->ia_mtime.tv_nsec >= (u32)1000000000)
 				return nfserr_inval;
@@ -464,9 +458,16 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
 			READ32(dummy);
 			READ_BUF(dummy * 4);
 			if (cbs->flavor == (u32)(-1)) {
-				cbs->uid = uid;
-				cbs->gid = gid;
-				cbs->flavor = RPC_AUTH_UNIX;
+				kuid_t kuid = make_kuid(&init_user_ns, uid);
+				kgid_t kgid = make_kgid(&init_user_ns, gid);
+				if (uid_valid(kuid) && gid_valid(kgid)) {
+					cbs->uid = kuid;
+					cbs->gid = kgid;
+					cbs->flavor = RPC_AUTH_UNIX;
+				} else {
+					dprintk("RPC_AUTH_UNIX with invalid"
+						"uid or gid ignoring!\n");
+				}
 			}
 			break;
 		case RPC_AUTH_GSS:
@@ -797,6 +798,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 	open->op_iattr.ia_valid = 0;
 	open->op_openowner = NULL;
 
+	open->op_xdr_error = 0;
 	/* seqid, share_access, share_deny, clientid, ownerlen */
 	READ_BUF(4);
 	READ32(open->op_seqid);
@@ -1685,36 +1687,6 @@ static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
 } while (0)
 #define ADJUST_ARGS()		resp->p = p
 
-/*
- * Header routine to setup seqid operation replay cache
- */
-#define ENCODE_SEQID_OP_HEAD					\
-	__be32 *save;						\
-								\
-	save = resp->p;
-
-/*
- * Routine for encoding the result of a "seqid-mutating" NFSv4 operation.  This
- * is where sequence id's are incremented, and the replay cache is filled.
- * Note that we increment sequence id's here, at the last moment, so we're sure
- * we know whether the error to be returned is a sequence id mutating error.
- */
-
-static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, __be32 nfserr)
-{
-	struct nfs4_stateowner *stateowner = resp->cstate.replay_owner;
-
-	if (seqid_mutating_err(ntohl(nfserr)) && stateowner) {
-		stateowner->so_seqid++;
-		stateowner->so_replay.rp_status = nfserr;
-		stateowner->so_replay.rp_buflen =
-			  (char *)resp->p - (char *)save;
-		memcpy(stateowner->so_replay.rp_buf, save,
-			stateowner->so_replay.rp_buflen);
-		nfsd4_purge_closed_stateid(stateowner);
-	}
-}
-
 /* Encode as an array of strings the string given with components
  * separated @sep, escaped with esc_enter and esc_exit.
  */
@@ -1926,7 +1898,7 @@ static u32 nfs4_file_type(umode_t mode)
 }
 
 static __be32
-nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
+nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, kuid_t uid, kgid_t gid,
 			__be32 **p, int *buflen)
 {
 	int status;
@@ -1935,10 +1907,10 @@ nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
 		return nfserr_resource;
 	if (whotype != NFS4_ACL_WHO_NAMED)
 		status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1));
-	else if (group)
-		status = nfsd_map_gid_to_name(rqstp, id, (u8 *)(*p + 1));
+	else if (gid_valid(gid))
+		status = nfsd_map_gid_to_name(rqstp, gid, (u8 *)(*p + 1));
 	else
-		status = nfsd_map_uid_to_name(rqstp, id, (u8 *)(*p + 1));
+		status = nfsd_map_uid_to_name(rqstp, uid, (u8 *)(*p + 1));
 	if (status < 0)
 		return nfserrno(status);
 	*p = xdr_encode_opaque(*p, NULL, status);
@@ -1948,22 +1920,33 @@ nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
 }
 
 static inline __be32
-nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, __be32 **p, int *buflen)
+nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t user, __be32 **p, int *buflen)
 {
-	return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen);
+	return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, user, INVALID_GID,
+				 p, buflen);
 }
 
 static inline __be32
-nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, __be32 **p, int *buflen)
+nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t group, __be32 **p, int *buflen)
 {
-	return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen);
+	return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, INVALID_UID, group,
+				 p, buflen);
 }
 
 static inline __be32
-nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
+nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
 		__be32 **p, int *buflen)
 {
-	return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen);
+	kuid_t uid = INVALID_UID;
+	kgid_t gid = INVALID_GID;
+
+	if (ace->whotype == NFS4_ACL_WHO_NAMED) {
+		if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
+			gid = ace->who_gid;
+		else
+			uid = ace->who_uid;
+	}
+	return nfsd4_encode_name(rqstp, ace->whotype, uid, gid, p, buflen);
 }
 
 #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
@@ -1997,7 +1980,7 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
 		if (path.dentry != path.mnt->mnt_root)
 			break;
 	}
-	err = vfs_getattr(path.mnt, path.dentry, stat);
+	err = vfs_getattr(&path, stat);
 	path_put(&path);
 	return err;
 }
@@ -2006,12 +1989,11 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
  * Note: @fhp can be NULL; in this case, we might have to compose the filehandle
  * ourselves.
  *
- * @countp is the buffer size in _words_; upon successful return this becomes
- * replaced with the number of words written.
+ * countp is the buffer size in _words_
  */
 __be32
 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
-		struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval,
+		struct dentry *dentry, __be32 **buffer, int count, u32 *bmval,
 		struct svc_rqst *rqstp, int ignore_crossmnt)
 {
 	u32 bmval0 = bmval[0];
@@ -2020,12 +2002,12 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	struct kstat stat;
 	struct svc_fh tempfh;
 	struct kstatfs statfs;
-	int buflen = *countp << 2;
+	int buflen = count << 2;
 	__be32 *attrlenp;
 	u32 dummy;
 	u64 dummy64;
 	u32 rdattr_err = 0;
-	__be32 *p = buffer;
+	__be32 *p = *buffer;
 	__be32 status;
 	int err;
 	int aclsupport = 0;
@@ -2050,7 +2032,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 			goto out;
 	}
 
-	err = vfs_getattr(exp->ex_path.mnt, dentry, &stat);
+	err = vfs_getattr(&path, &stat);
 	if (err)
 		goto out_nfserr;
 	if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL |
@@ -2224,9 +2206,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 			WRITE32(ace->type);
 			WRITE32(ace->flag);
 			WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL);
-			status = nfsd4_encode_aclname(rqstp, ace->whotype,
-				ace->who, ace->flag & NFS4_ACE_IDENTIFIER_GROUP,
-				&p, &buflen);
+			status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen);
 			if (status == nfserr_resource)
 				goto out_resource;
 			if (status)
@@ -2386,8 +2366,7 @@ out_acl:
 	if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
 		if ((buflen -= 12) < 0)
 			goto out_resource;
-		WRITE32(0);
-		WRITE32(stat.atime.tv_sec);
+		WRITE64((s64)stat.atime.tv_sec);
 		WRITE32(stat.atime.tv_nsec);
 	}
 	if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
@@ -2400,15 +2379,13 @@ out_acl:
 	if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
 		if ((buflen -= 12) < 0)
 			goto out_resource;
-		WRITE32(0);
-		WRITE32(stat.ctime.tv_sec);
+		WRITE64((s64)stat.ctime.tv_sec);
 		WRITE32(stat.ctime.tv_nsec);
 	}
 	if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
 		if ((buflen -= 12) < 0)
 			goto out_resource;
-		WRITE32(0);
-		WRITE32(stat.mtime.tv_sec);
+		WRITE64((s64)stat.mtime.tv_sec);
 		WRITE32(stat.mtime.tv_nsec);
 	}
 	if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
@@ -2431,7 +2408,7 @@ out_acl:
 	}
 
 	*attrlenp = htonl((char *)p - (char *)attrlenp - 4);
-	*countp = p - buffer;
+	*buffer = p;
 	status = nfs_ok;
 
 out:
@@ -2443,7 +2420,6 @@ out_nfserr:
 	status = nfserrno(err);
 	goto out;
 out_resource:
-	*countp = 0;
 	status = nfserr_resource;
 	goto out;
 out_serverfault:
@@ -2462,7 +2438,7 @@ static inline int attributes_need_mount(u32 *bmval)
 
 static __be32
 nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
-		const char *name, int namlen, __be32 *p, int *buflen)
+		const char *name, int namlen, __be32 **p, int buflen)
 {
 	struct svc_export *exp = cd->rd_fhp->fh_export;
 	struct dentry *dentry;
@@ -2568,10 +2544,9 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
 	p = xdr_encode_hyper(p, NFS_OFFSET_MAX);    /* offset of next entry */
 	p = xdr_encode_array(p, name, namlen);      /* name length & name */
 
-	nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, p, &buflen);
+	nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, &p, buflen);
 	switch (nfserr) {
 	case nfs_ok:
-		p += buflen;
 		break;
 	case nfserr_resource:
 		nfserr = nfserr_toosmall;
@@ -2648,12 +2623,9 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
 static __be32
 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
 {
-	ENCODE_SEQID_OP_HEAD;
-
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &close->cl_stateid);
 
-	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -2698,10 +2670,8 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
 
 	buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2);
 	nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry,
-				    resp->p, &buflen, getattr->ga_bmval,
+				    &resp->p, buflen, getattr->ga_bmval,
 				    resp->rqstp, 0);
-	if (!nfserr)
-		resp->p += buflen;
 	return nfserr;
 }
 
@@ -2751,14 +2721,11 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie
 static __be32
 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
 {
-	ENCODE_SEQID_OP_HEAD;
-
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &lock->lk_resp_stateid);
 	else if (nfserr == nfserr_denied)
 		nfsd4_encode_lock_denied(resp, &lock->lk_denied);
 
-	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -2773,12 +2740,9 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
 static __be32
 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
 {
-	ENCODE_SEQID_OP_HEAD;
-
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &locku->lu_stateid);
 
-	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -2801,7 +2765,6 @@ static __be32
 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
 {
 	__be32 *p;
-	ENCODE_SEQID_OP_HEAD;
 
 	if (nfserr)
 		goto out;
@@ -2873,31 +2836,24 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
 	}
 	/* XXX save filehandle here */
 out:
-	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
 static __be32
 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
 {
-	ENCODE_SEQID_OP_HEAD;
-
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
 
-	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
 static __be32
 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
 {
-	ENCODE_SEQID_OP_HEAD;
-
 	if (!nfserr)
 		nfsd4_encode_stateid(resp, &od->od_stateid);
 
-	encode_seqid_op_tail(resp, save, nfserr);
 	return nfserr;
 }
 
@@ -3127,13 +3083,13 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 
 static __be32
 nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
-			 __be32 nfserr,struct svc_export *exp)
+			 __be32 nfserr, struct svc_export *exp)
 {
-	int i = 0;
-	u32 nflavs;
+	u32 i, nflavs, supported;
 	struct exp_flavor_info *flavs;
 	struct exp_flavor_info def_flavs[2];
-	__be32 *p;
+	__be32 *p, *flavorsp;
+	static bool report = true;
 
 	if (nfserr)
 		goto out;
@@ -3157,34 +3113,40 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
 		}
 	}
 
+	supported = 0;
 	RESERVE_SPACE(4);
-	WRITE32(nflavs);
+	flavorsp = p++;		/* to be backfilled later */
 	ADJUST_ARGS();
+
 	for (i = 0; i < nflavs; i++) {
-		u32 flav = flavs[i].pseudoflavor;
-		struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav);
+		rpc_authflavor_t pf = flavs[i].pseudoflavor;
+		struct rpcsec_gss_info info;
 
-		if (gm) {
-			RESERVE_SPACE(4);
+		if (rpcauth_get_gssinfo(pf, &info) == 0) {
+			supported++;
+			RESERVE_SPACE(4 + 4 + info.oid.len + 4 + 4);
 			WRITE32(RPC_AUTH_GSS);
+			WRITE32(info.oid.len);
+			WRITEMEM(info.oid.data, info.oid.len);
+			WRITE32(info.qop);
+			WRITE32(info.service);
 			ADJUST_ARGS();
-			RESERVE_SPACE(4 + gm->gm_oid.len);
-			WRITE32(gm->gm_oid.len);
-			WRITEMEM(gm->gm_oid.data, gm->gm_oid.len);
-			ADJUST_ARGS();
+		} else if (pf < RPC_AUTH_MAXFLAVOR) {
+			supported++;
 			RESERVE_SPACE(4);
-			WRITE32(0); /* qop */
+			WRITE32(pf);
 			ADJUST_ARGS();
-			RESERVE_SPACE(4);
-			WRITE32(gss_pseudoflavor_to_service(gm, flav));
-			ADJUST_ARGS();
-			gss_mech_put(gm);
 		} else {
-			RESERVE_SPACE(4);
-			WRITE32(flav);
-			ADJUST_ARGS();
+			if (report)
+				pr_warn("NFS: SECINFO: security flavor %u "
+					"is not supported\n", pf);
 		}
 	}
+
+	if (nflavs != supported)
+		report = false;
+	*flavorsp = htonl(supported);
+
 out:
 	if (exp)
 		exp_put(exp);
@@ -3555,6 +3517,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
 void
 nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 {
+	struct nfs4_stateowner *so = resp->cstate.replay_owner;
 	__be32 *statp;
 	__be32 *p;
 
@@ -3571,6 +3534,11 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 	/* nfsd4_check_drc_limit guarantees enough room for error status */
 	if (!op->status)
 		op->status = nfsd4_check_resp_size(resp, 0);
+	if (so) {
+		so->so_replay.rp_status = op->status;
+		so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1);
+		memcpy(so->so_replay.rp_buf, statp+1, so->so_replay.rp_buflen);
+	}
 status:
 	/*
 	 * Note: We write the status directly, instead of using WRITE32(),
@@ -3672,7 +3640,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 			cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
 		}
 		/* Renew the clientid on success and on replay */
-		release_session_client(cs->session);
+		put_client_renew(cs->session->se_client);
 		nfsd4_put_session(cs->session);
 	}
 	return 1;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 2cbac34a55d..e76244edd74 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -9,34 +9,63 @@
  */
 
 #include <linux/slab.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/highmem.h>
+#include <linux/log2.h>
+#include <linux/hash.h>
+#include <net/checksum.h>
 
 #include "nfsd.h"
 #include "cache.h"
 
-/* Size of reply cache. Common values are:
- * 4.3BSD:	128
- * 4.4BSD:	256
- * Solaris2:	1024
- * DEC Unix:	512-4096
+#define NFSDDBG_FACILITY	NFSDDBG_REPCACHE
+
+/*
+ * We use this value to determine the number of hash buckets from the max
+ * cache size, the idea being that when the cache is at its maximum number
+ * of entries, then this should be the average number of entries per bucket.
  */
-#define CACHESIZE		1024
-#define HASHSIZE		64
+#define TARGET_BUCKET_SIZE	64
 
 static struct hlist_head *	cache_hash;
 static struct list_head 	lru_head;
-static int			cache_disabled = 1;
+static struct kmem_cache	*drc_slab;
+
+/* max number of entries allowed in the cache */
+static unsigned int		max_drc_entries;
+
+/* number of significant bits in the hash value */
+static unsigned int		maskbits;
 
 /*
- * Calculate the hash index from an XID.
+ * Stats and other tracking of on the duplicate reply cache. All of these and
+ * the "rc" fields in nfsdstats are protected by the cache_lock
  */
-static inline u32 request_hash(u32 xid)
-{
-	u32 h = xid;
-	h ^= (xid >> 24);
-	return h & (HASHSIZE-1);
-}
+
+/* total number of entries */
+static unsigned int		num_drc_entries;
+
+/* cache misses due only to checksum comparison failures */
+static unsigned int		payload_misses;
+
+/* amount of memory (in bytes) currently consumed by the DRC */
+static unsigned int		drc_mem_usage;
+
+/* longest hash chain seen */
+static unsigned int		longest_chain;
+
+/* size of cache when we saw the longest hash chain */
+static unsigned int		longest_chain_cachesize;
 
 static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
+static void	cache_cleaner_func(struct work_struct *unused);
+static int 	nfsd_reply_cache_shrink(struct shrinker *shrink,
+					struct shrink_control *sc);
+
+static struct shrinker nfsd_reply_cache_shrinker = {
+	.shrink	= nfsd_reply_cache_shrink,
+	.seeks	= 1,
+};
 
 /*
  * locking for the reply cache:
@@ -44,30 +73,104 @@ static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
  * Otherwise, it when accessing _prev or _next, the lock must be held.
  */
 static DEFINE_SPINLOCK(cache_lock);
+static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
 
-int nfsd_reply_cache_init(void)
+/*
+ * Put a cap on the size of the DRC based on the amount of available
+ * low memory in the machine.
+ *
+ *  64MB:    8192
+ * 128MB:   11585
+ * 256MB:   16384
+ * 512MB:   23170
+ *   1GB:   32768
+ *   2GB:   46340
+ *   4GB:   65536
+ *   8GB:   92681
+ *  16GB:  131072
+ *
+ * ...with a hard cap of 256k entries. In the worst case, each entry will be
+ * ~1k, so the above numbers should give a rough max of the amount of memory
+ * used in k.
+ */
+static unsigned int
+nfsd_cache_size_limit(void)
+{
+	unsigned int limit;
+	unsigned long low_pages = totalram_pages - totalhigh_pages;
+
+	limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10);
+	return min_t(unsigned int, limit, 256*1024);
+}
+
+/*
+ * Compute the number of hash buckets we need. Divide the max cachesize by
+ * the "target" max bucket size, and round up to next power of two.
+ */
+static unsigned int
+nfsd_hashsize(unsigned int limit)
+{
+	return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
+}
+
+static struct svc_cacherep *
+nfsd_reply_cache_alloc(void)
 {
 	struct svc_cacherep	*rp;
-	int			i;
 
-	INIT_LIST_HEAD(&lru_head);
-	i = CACHESIZE;
-	while (i) {
-		rp = kmalloc(sizeof(*rp), GFP_KERNEL);
-		if (!rp)
-			goto out_nomem;
-		list_add(&rp->c_lru, &lru_head);
+	rp = kmem_cache_alloc(drc_slab, GFP_KERNEL);
+	if (rp) {
 		rp->c_state = RC_UNUSED;
 		rp->c_type = RC_NOCACHE;
+		INIT_LIST_HEAD(&rp->c_lru);
 		INIT_HLIST_NODE(&rp->c_hash);
-		i--;
 	}
+	return rp;
+}
+
+static void
+nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
+{
+	if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
+		drc_mem_usage -= rp->c_replvec.iov_len;
+		kfree(rp->c_replvec.iov_base);
+	}
+	if (!hlist_unhashed(&rp->c_hash))
+		hlist_del(&rp->c_hash);
+	list_del(&rp->c_lru);
+	--num_drc_entries;
+	drc_mem_usage -= sizeof(*rp);
+	kmem_cache_free(drc_slab, rp);
+}
 
-	cache_hash = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
+static void
+nfsd_reply_cache_free(struct svc_cacherep *rp)
+{
+	spin_lock(&cache_lock);
+	nfsd_reply_cache_free_locked(rp);
+	spin_unlock(&cache_lock);
+}
+
+int nfsd_reply_cache_init(void)
+{
+	unsigned int hashsize;
+
+	INIT_LIST_HEAD(&lru_head);
+	max_drc_entries = nfsd_cache_size_limit();
+	num_drc_entries = 0;
+	hashsize = nfsd_hashsize(max_drc_entries);
+	maskbits = ilog2(hashsize);
+
+	register_shrinker(&nfsd_reply_cache_shrinker);
+	drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep),
+					0, 0, NULL);
+	if (!drc_slab)
+		goto out_nomem;
+
+	cache_hash = kcalloc(hashsize, sizeof(struct hlist_head), GFP_KERNEL);
 	if (!cache_hash)
 		goto out_nomem;
 
-	cache_disabled = 0;
 	return 0;
 out_nomem:
 	printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
@@ -79,27 +182,33 @@ void nfsd_reply_cache_shutdown(void)
 {
 	struct svc_cacherep	*rp;
 
+	unregister_shrinker(&nfsd_reply_cache_shrinker);
+	cancel_delayed_work_sync(&cache_cleaner);
+
 	while (!list_empty(&lru_head)) {
 		rp = list_entry(lru_head.next, struct svc_cacherep, c_lru);
-		if (rp->c_state == RC_DONE && rp->c_type == RC_REPLBUFF)
-			kfree(rp->c_replvec.iov_base);
-		list_del(&rp->c_lru);
-		kfree(rp);
+		nfsd_reply_cache_free_locked(rp);
 	}
 
-	cache_disabled = 1;
-
 	kfree (cache_hash);
 	cache_hash = NULL;
+
+	if (drc_slab) {
+		kmem_cache_destroy(drc_slab);
+		drc_slab = NULL;
+	}
 }
 
 /*
- * Move cache entry to end of LRU list
+ * Move cache entry to end of LRU list, and queue the cleaner to run if it's
+ * not already scheduled.
  */
 static void
 lru_put_end(struct svc_cacherep *rp)
 {
+	rp->c_timestamp = jiffies;
 	list_move_tail(&rp->c_lru, &lru_head);
+	schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
 }
 
 /*
@@ -109,89 +218,247 @@ static void
 hash_refile(struct svc_cacherep *rp)
 {
 	hlist_del_init(&rp->c_hash);
-	hlist_add_head(&rp->c_hash, cache_hash + request_hash(rp->c_xid));
+	hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits));
+}
+
+static inline bool
+nfsd_cache_entry_expired(struct svc_cacherep *rp)
+{
+	return rp->c_state != RC_INPROG &&
+	       time_after(jiffies, rp->c_timestamp + RC_EXPIRE);
+}
+
+/*
+ * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
+ * Also prune the oldest ones when the total exceeds the max number of entries.
+ */
+static void
+prune_cache_entries(void)
+{
+	struct svc_cacherep *rp, *tmp;
+
+	list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) {
+		if (!nfsd_cache_entry_expired(rp) &&
+		    num_drc_entries <= max_drc_entries)
+			break;
+		nfsd_reply_cache_free_locked(rp);
+	}
+
+	/*
+	 * Conditionally rearm the job. If we cleaned out the list, then
+	 * cancel any pending run (since there won't be any work to do).
+	 * Otherwise, we rearm the job or modify the existing one to run in
+	 * RC_EXPIRE since we just ran the pruner.
+	 */
+	if (list_empty(&lru_head))
+		cancel_delayed_work(&cache_cleaner);
+	else
+		mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
+}
+
+static void
+cache_cleaner_func(struct work_struct *unused)
+{
+	spin_lock(&cache_lock);
+	prune_cache_entries();
+	spin_unlock(&cache_lock);
+}
+
+static int
+nfsd_reply_cache_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+	unsigned int num;
+
+	spin_lock(&cache_lock);
+	if (sc->nr_to_scan)
+		prune_cache_entries();
+	num = num_drc_entries;
+	spin_unlock(&cache_lock);
+
+	return num;
+}
+
+/*
+ * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
+ */
+static __wsum
+nfsd_cache_csum(struct svc_rqst *rqstp)
+{
+	int idx;
+	unsigned int base;
+	__wsum csum;
+	struct xdr_buf *buf = &rqstp->rq_arg;
+	const unsigned char *p = buf->head[0].iov_base;
+	size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len,
+				RC_CSUMLEN);
+	size_t len = min(buf->head[0].iov_len, csum_len);
+
+	/* rq_arg.head first */
+	csum = csum_partial(p, len, 0);
+	csum_len -= len;
+
+	/* Continue into page array */
+	idx = buf->page_base / PAGE_SIZE;
+	base = buf->page_base & ~PAGE_MASK;
+	while (csum_len) {
+		p = page_address(buf->pages[idx]) + base;
+		len = min_t(size_t, PAGE_SIZE - base, csum_len);
+		csum = csum_partial(p, len, csum);
+		csum_len -= len;
+		base = 0;
+		++idx;
+	}
+	return csum;
+}
+
+static bool
+nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
+{
+	/* Check RPC header info first */
+	if (rqstp->rq_xid != rp->c_xid || rqstp->rq_proc != rp->c_proc ||
+	    rqstp->rq_prot != rp->c_prot || rqstp->rq_vers != rp->c_vers ||
+	    rqstp->rq_arg.len != rp->c_len ||
+	    !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
+	    rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
+		return false;
+
+	/* compare checksum of NFS data */
+	if (csum != rp->c_csum) {
+		++payload_misses;
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Search the request hash for an entry that matches the given rqstp.
+ * Must be called with cache_lock held. Returns the found entry or
+ * NULL on failure.
+ */
+static struct svc_cacherep *
+nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
+{
+	struct svc_cacherep	*rp, *ret = NULL;
+	struct hlist_head 	*rh;
+	unsigned int		entries = 0;
+
+	rh = &cache_hash[hash_32(rqstp->rq_xid, maskbits)];
+	hlist_for_each_entry(rp, rh, c_hash) {
+		++entries;
+		if (nfsd_cache_match(rqstp, csum, rp)) {
+			ret = rp;
+			break;
+		}
+	}
+
+	/* tally hash chain length stats */
+	if (entries > longest_chain) {
+		longest_chain = entries;
+		longest_chain_cachesize = num_drc_entries;
+	} else if (entries == longest_chain) {
+		/* prefer to keep the smallest cachesize possible here */
+		longest_chain_cachesize = min(longest_chain_cachesize,
+						num_drc_entries);
+	}
+
+	return ret;
 }
 
 /*
  * Try to find an entry matching the current call in the cache. When none
- * is found, we grab the oldest unlocked entry off the LRU list.
- * Note that no operation within the loop may sleep.
+ * is found, we try to grab the oldest expired entry off the LRU list. If
+ * a suitable one isn't there, then drop the cache_lock and allocate a
+ * new one, then search again in case one got inserted while this thread
+ * didn't hold the lock.
  */
 int
 nfsd_cache_lookup(struct svc_rqst *rqstp)
 {
-	struct hlist_node	*hn;
-	struct hlist_head 	*rh;
-	struct svc_cacherep	*rp;
+	struct svc_cacherep	*rp, *found;
 	__be32			xid = rqstp->rq_xid;
 	u32			proto =  rqstp->rq_prot,
 				vers = rqstp->rq_vers,
 				proc = rqstp->rq_proc;
+	__wsum			csum;
 	unsigned long		age;
 	int type = rqstp->rq_cachetype;
-	int rtn;
+	int rtn = RC_DOIT;
 
 	rqstp->rq_cacherep = NULL;
-	if (cache_disabled || type == RC_NOCACHE) {
+	if (type == RC_NOCACHE) {
 		nfsdstats.rcnocache++;
-		return RC_DOIT;
+		return rtn;
 	}
 
-	spin_lock(&cache_lock);
-	rtn = RC_DOIT;
+	csum = nfsd_cache_csum(rqstp);
 
-	rh = &cache_hash[request_hash(xid)];
-	hlist_for_each_entry(rp, hn, rh, c_hash) {
-		if (rp->c_state != RC_UNUSED &&
-		    xid == rp->c_xid && proc == rp->c_proc &&
-		    proto == rp->c_prot && vers == rp->c_vers &&
-		    time_before(jiffies, rp->c_timestamp + 120*HZ) &&
-		    memcmp((char*)&rqstp->rq_addr, (char*)&rp->c_addr, sizeof(rp->c_addr))==0) {
-			nfsdstats.rchits++;
-			goto found_entry;
+	/*
+	 * Since the common case is a cache miss followed by an insert,
+	 * preallocate an entry. First, try to reuse the first entry on the LRU
+	 * if it works, then go ahead and prune the LRU list.
+	 */
+	spin_lock(&cache_lock);
+	if (!list_empty(&lru_head)) {
+		rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru);
+		if (nfsd_cache_entry_expired(rp) ||
+		    num_drc_entries >= max_drc_entries) {
+			lru_put_end(rp);
+			prune_cache_entries();
+			goto search_cache;
 		}
 	}
-	nfsdstats.rcmisses++;
 
-	/* This loop shouldn't take more than a few iterations normally */
-	{
-	int	safe = 0;
-	list_for_each_entry(rp, &lru_head, c_lru) {
-		if (rp->c_state != RC_INPROG)
-			break;
-		if (safe++ > CACHESIZE) {
-			printk("nfsd: loop in repcache LRU list\n");
-			cache_disabled = 1;
-			goto out;
-		}
-	}
+	/* No expired ones available, allocate a new one. */
+	spin_unlock(&cache_lock);
+	rp = nfsd_reply_cache_alloc();
+	spin_lock(&cache_lock);
+	if (likely(rp)) {
+		++num_drc_entries;
+		drc_mem_usage += sizeof(*rp);
 	}
 
-	/* All entries on the LRU are in-progress. This should not happen */
-	if (&rp->c_lru == &lru_head) {
-		static int	complaints;
+search_cache:
+	found = nfsd_cache_search(rqstp, csum);
+	if (found) {
+		if (likely(rp))
+			nfsd_reply_cache_free_locked(rp);
+		rp = found;
+		goto found_entry;
+	}
 
-		printk(KERN_WARNING "nfsd: all repcache entries locked!\n");
-		if (++complaints > 5) {
-			printk(KERN_WARNING "nfsd: disabling repcache.\n");
-			cache_disabled = 1;
-		}
+	if (!rp) {
+		dprintk("nfsd: unable to allocate DRC entry!\n");
 		goto out;
 	}
 
+	/*
+	 * We're keeping the one we just allocated. Are we now over the
+	 * limit? Prune one off the tip of the LRU in trade for the one we
+	 * just allocated if so.
+	 */
+	if (num_drc_entries >= max_drc_entries)
+		nfsd_reply_cache_free_locked(list_first_entry(&lru_head,
+						struct svc_cacherep, c_lru));
+
+	nfsdstats.rcmisses++;
 	rqstp->rq_cacherep = rp;
 	rp->c_state = RC_INPROG;
 	rp->c_xid = xid;
 	rp->c_proc = proc;
-	memcpy(&rp->c_addr, svc_addr_in(rqstp), sizeof(rp->c_addr));
+	rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp));
+	rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));
 	rp->c_prot = proto;
 	rp->c_vers = vers;
-	rp->c_timestamp = jiffies;
+	rp->c_len = rqstp->rq_arg.len;
+	rp->c_csum = csum;
 
 	hash_refile(rp);
+	lru_put_end(rp);
 
 	/* release any buffer */
 	if (rp->c_type == RC_REPLBUFF) {
+		drc_mem_usage -= rp->c_replvec.iov_len;
 		kfree(rp->c_replvec.iov_base);
 		rp->c_replvec.iov_base = NULL;
 	}
@@ -201,9 +468,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
 	return rtn;
 
 found_entry:
+	nfsdstats.rchits++;
 	/* We found a matching entry which is either in progress or done. */
 	age = jiffies - rp->c_timestamp;
-	rp->c_timestamp = jiffies;
 	lru_put_end(rp);
 
 	rtn = RC_DROPIT;
@@ -232,7 +499,7 @@ found_entry:
 		break;
 	default:
 		printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type);
-		rp->c_state = RC_UNUSED;
+		nfsd_reply_cache_free_locked(rp);
 	}
 
 	goto out;
@@ -257,11 +524,12 @@ found_entry:
 void
 nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
 {
-	struct svc_cacherep *rp;
+	struct svc_cacherep *rp = rqstp->rq_cacherep;
 	struct kvec	*resv = &rqstp->rq_res.head[0], *cachv;
 	int		len;
+	size_t		bufsize = 0;
 
-	if (!(rp = rqstp->rq_cacherep) || cache_disabled)
+	if (!rp)
 		return;
 
 	len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
@@ -269,7 +537,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
 
 	/* Don't cache excessive amounts of data and XDR failures */
 	if (!statp || len > (256 >> 2)) {
-		rp->c_state = RC_UNUSED;
+		nfsd_reply_cache_free(rp);
 		return;
 	}
 
@@ -281,23 +549,25 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
 		break;
 	case RC_REPLBUFF:
 		cachv = &rp->c_replvec;
-		cachv->iov_base = kmalloc(len << 2, GFP_KERNEL);
+		bufsize = len << 2;
+		cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
 		if (!cachv->iov_base) {
-			spin_lock(&cache_lock);
-			rp->c_state = RC_UNUSED;
-			spin_unlock(&cache_lock);
+			nfsd_reply_cache_free(rp);
 			return;
 		}
-		cachv->iov_len = len << 2;
-		memcpy(cachv->iov_base, statp, len << 2);
+		cachv->iov_len = bufsize;
+		memcpy(cachv->iov_base, statp, bufsize);
 		break;
+	case RC_NOCACHE:
+		nfsd_reply_cache_free(rp);
+		return;
 	}
 	spin_lock(&cache_lock);
+	drc_mem_usage += bufsize;
 	lru_put_end(rp);
 	rp->c_secure = rqstp->rq_secure;
 	rp->c_type = cachetype;
 	rp->c_state = RC_DONE;
-	rp->c_timestamp = jiffies;
 	spin_unlock(&cache_lock);
 	return;
 }
@@ -321,3 +591,30 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
 	vec->iov_len += data->iov_len;
 	return 1;
 }
+
+/*
+ * Note that fields may be added, removed or reordered in the future. Programs
+ * scraping this file for info should test the labels to ensure they're
+ * getting the correct field.
+ */
+static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
+{
+	spin_lock(&cache_lock);
+	seq_printf(m, "max entries:           %u\n", max_drc_entries);
+	seq_printf(m, "num entries:           %u\n", num_drc_entries);
+	seq_printf(m, "hash buckets:          %u\n", 1 << maskbits);
+	seq_printf(m, "mem usage:             %u\n", drc_mem_usage);
+	seq_printf(m, "cache hits:            %u\n", nfsdstats.rchits);
+	seq_printf(m, "cache misses:          %u\n", nfsdstats.rcmisses);
+	seq_printf(m, "not cached:            %u\n", nfsdstats.rcnocache);
+	seq_printf(m, "payload misses:        %u\n", payload_misses);
+	seq_printf(m, "longest chain len:     %u\n", longest_chain);
+	seq_printf(m, "cachesize at longest:  %u\n", longest_chain_cachesize);
+	spin_unlock(&cache_lock);
+	return 0;
+}
+
+int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, nfsd_reply_cache_stats_show, NULL);
+}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 74934284d9a..7f555179bf8 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -10,7 +10,7 @@
 
 #include <linux/sunrpc/svcsock.h>
 #include <linux/lockd/lockd.h>
-#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/gss_api.h>
 #include <linux/sunrpc/gss_krb5_enctypes.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
@@ -35,6 +35,7 @@ enum {
 	NFSD_Threads,
 	NFSD_Pool_Threads,
 	NFSD_Pool_Stats,
+	NFSD_Reply_Cache_Stats,
 	NFSD_Versions,
 	NFSD_Ports,
 	NFSD_MaxBlkSize,
@@ -85,7 +86,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 
 static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)
 {
-	ino_t ino =  file->f_path.dentry->d_inode->i_ino;
+	ino_t ino =  file_inode(file)->i_ino;
 	char *data;
 	ssize_t rv;
 
@@ -125,11 +126,11 @@ static const struct file_operations transaction_ops = {
 	.llseek		= default_llseek,
 };
 
-static int exports_open(struct inode *inode, struct file *file)
+static int exports_net_open(struct net *net, struct file *file)
 {
 	int err;
 	struct seq_file *seq;
-	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	err = seq_open(file, &nfs_exports_op);
 	if (err)
@@ -140,8 +141,26 @@ static int exports_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static const struct file_operations exports_operations = {
-	.open		= exports_open,
+static int exports_proc_open(struct inode *inode, struct file *file)
+{
+	return exports_net_open(current->nsproxy->net_ns, file);
+}
+
+static const struct file_operations exports_proc_operations = {
+	.open		= exports_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+	.owner		= THIS_MODULE,
+};
+
+static int exports_nfsd_open(struct inode *inode, struct file *file)
+{
+	return exports_net_open(inode->i_sb->s_fs_info, file);
+}
+
+static const struct file_operations exports_nfsd_operations = {
+	.open		= exports_nfsd_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= seq_release,
@@ -159,7 +178,7 @@ static int export_features_open(struct inode *inode, struct file *file)
 	return single_open(file, export_features_show, NULL);
 }
 
-static struct file_operations export_features_operations = {
+static const struct file_operations export_features_operations = {
 	.open		= export_features_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -178,7 +197,7 @@ static int supported_enctypes_open(struct inode *inode, struct file *file)
 	return single_open(file, supported_enctypes_show, NULL);
 }
 
-static struct file_operations supported_enctypes_ops = {
+static const struct file_operations supported_enctypes_ops = {
 	.open		= supported_enctypes_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -194,6 +213,13 @@ static const struct file_operations pool_stats_operations = {
 	.owner		= THIS_MODULE,
 };
 
+static struct file_operations reply_cache_stats_operations = {
+	.open		= nfsd_reply_cache_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 /*----------------------------------------------------------------------------*/
 /*
  * payload - write methods
@@ -220,6 +246,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 	struct sockaddr *sap = (struct sockaddr *)&address;
 	size_t salen = sizeof(address);
 	char *fo_path;
+	struct net *net = file->f_dentry->d_sb->s_fs_info;
 
 	/* sanity check */
 	if (size == 0)
@@ -232,7 +259,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 	if (qword_get(&buf, fo_path, size) < 0)
 		return -EINVAL;
 
-	if (rpc_pton(&init_net, fo_path, size, sap, salen) == 0)
+	if (rpc_pton(net, fo_path, size, sap, salen) == 0)
 		return -EINVAL;
 
 	return nlmsvc_unlock_all_by_ip(sap);
@@ -317,6 +344,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 	int len;
 	struct auth_domain *dom;
 	struct knfsd_fh fh;
+	struct net *net = file->f_dentry->d_sb->s_fs_info;
 
 	if (size == 0)
 		return -EINVAL;
@@ -352,7 +380,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 	if (!dom)
 		return -ENOMEM;
 
-	len = exp_rootfh(&init_net, dom, path, &fh,  maxsize);
+	len = exp_rootfh(net, dom, path, &fh,  maxsize);
 	auth_domain_put(dom);
 	if (len)
 		return len;
@@ -396,7 +424,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
 {
 	char *mesg = buf;
 	int rv;
-	struct net *net = &init_net;
+	struct net *net = file->f_dentry->d_sb->s_fs_info;
 
 	if (size > 0) {
 		int newthreads;
@@ -447,7 +475,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
 	int len;
 	int npools;
 	int *nthreads;
-	struct net *net = &init_net;
+	struct net *net = file->f_dentry->d_sb->s_fs_info;
 
 	mutex_lock(&nfsd_mutex);
 	npools = nfsd_nrpools(net);
@@ -510,7 +538,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 	unsigned minor;
 	ssize_t tlen = 0;
 	char *sep;
-	struct net *net = &init_net;
+	struct net *net = file->f_dentry->d_sb->s_fs_info;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	if (size>0) {
@@ -534,7 +562,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 			else
 				num = simple_strtol(vers, &minorp, 0);
 			if (*minorp == '.') {
-				if (num < 4)
+				if (num != 4)
 					return -EINVAL;
 				minor = simple_strtoul(minorp+1, NULL, 0);
 				if (minor == 0)
@@ -792,7 +820,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size,
 static ssize_t write_ports(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
-	struct net *net = &init_net;
+	struct net *net = file->f_dentry->d_sb->s_fs_info;
 
 	mutex_lock(&nfsd_mutex);
 	rv = __write_ports(file, buf, size, net);
@@ -827,7 +855,7 @@ int nfsd_max_blksize;
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
 {
 	char *mesg = buf;
-	struct net *net = &init_net;
+	struct net *net = file->f_dentry->d_sb->s_fs_info;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	if (size > 0) {
@@ -923,7 +951,8 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size,
  */
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
 {
-	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+	struct net *net = file->f_dentry->d_sb->s_fs_info;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn);
 }
 
@@ -939,7 +968,8 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
  */
 static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
 {
-	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+	struct net *net = file->f_dentry->d_sb->s_fs_info;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn);
 }
 
@@ -995,7 +1025,8 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
-	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+	struct net *net = file->f_dentry->d_sb->s_fs_info;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	mutex_lock(&nfsd_mutex);
 	rv = __write_recoverydir(file, buf, size, nn);
@@ -1013,7 +1044,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
 static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 {
 	static struct tree_descr nfsd_files[] = {
-		[NFSD_List] = {"exports", &exports_operations, S_IRUGO},
+		[NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO},
 		[NFSD_Export_features] = {"export_features",
 					&export_features_operations, S_IRUGO},
 		[NFSD_FO_UnlockIP] = {"unlock_ip",
@@ -1024,6 +1055,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
+		[NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO},
 		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
@@ -1037,21 +1069,37 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 #endif
 		/* last one */ {""}
 	};
-	return simple_fill_super(sb, 0x6e667364, nfsd_files);
+	struct net *net = data;
+	int ret;
+
+	ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
+	if (ret)
+		return ret;
+	sb->s_fs_info = get_net(net);
+	return 0;
 }
 
 static struct dentry *nfsd_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	return mount_single(fs_type, flags, data, nfsd_fill_super);
+	return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super);
+}
+
+static void nfsd_umount(struct super_block *sb)
+{
+	struct net *net = sb->s_fs_info;
+
+	kill_litter_super(sb);
+	put_net(net);
 }
 
 static struct file_system_type nfsd_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfsd",
 	.mount		= nfsd_mount,
-	.kill_sb	= kill_litter_super,
+	.kill_sb	= nfsd_umount,
 };
+MODULE_ALIAS_FS("nfsd");
 
 #ifdef CONFIG_PROC_FS
 static int create_proc_exports_entry(void)
@@ -1061,9 +1109,12 @@ static int create_proc_exports_entry(void)
 	entry = proc_mkdir("fs/nfs", NULL);
 	if (!entry)
 		return -ENOMEM;
-	entry = proc_create("exports", 0, entry, &exports_operations);
-	if (!entry)
+	entry = proc_create("exports", 0, entry,
+				 &exports_proc_operations);
+	if (!entry) {
+		remove_proc_entry("fs/nfs", NULL);
 		return -ENOMEM;
+	}
 	return 0;
 }
 #else /* CONFIG_PROC_FS */
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index de23db255c6..c0d93170585 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -56,8 +56,8 @@ extern struct svc_version	nfsd_version2, nfsd_version3,
 extern u32			nfsd_supported_minorversion;
 extern struct mutex		nfsd_mutex;
 extern spinlock_t		nfsd_drc_lock;
-extern unsigned int		nfsd_drc_max_mem;
-extern unsigned int		nfsd_drc_mem_used;
+extern unsigned long		nfsd_drc_max_mem;
+extern unsigned long		nfsd_drc_mem_used;
 
 extern const struct seq_operations nfs_exports_op;
 
@@ -106,7 +106,7 @@ static inline int nfsd_v4client(struct svc_rqst *rq)
  * NFSv4 State
  */
 #ifdef CONFIG_NFSD_V4
-extern unsigned int max_delegations;
+extern unsigned long max_delegations;
 void nfs4_state_init(void);
 int nfsd4_init_slabs(void);
 void nfsd4_free_slabs(void);
@@ -243,6 +243,12 @@ void		nfsd_lockd_shutdown(void);
 #define nfserr_reject_deleg		cpu_to_be32(NFS4ERR_REJECT_DELEG)
 #define nfserr_returnconflict		cpu_to_be32(NFS4ERR_RETURNCONFLICT)
 #define nfserr_deleg_revoked		cpu_to_be32(NFS4ERR_DELEG_REVOKED)
+#define nfserr_partner_notsupp		cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
+#define nfserr_partner_no_auth		cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
+#define nfserr_metadata_notsupp		cpu_to_be32(NFS4ERR_METADATA_NOTSUPP)
+#define nfserr_offload_denied		cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
+#define nfserr_wrong_lfs		cpu_to_be32(NFS4ERR_WRONG_LFS)
+#define nfserr_badlabel		cpu_to_be32(NFS4ERR_BADLABEL)
 
 /* error codes for internal use */
 /* if a request fails due to kmalloc failure, it gets dropped.
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index aad6d457b9e..54c6b3d3cc7 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -26,17 +26,13 @@ static __be32
 nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp)
 {
 	if (err) return err;
-	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt,
-				    resp->fh.fh_dentry,
-				    &resp->stat));
+	return fh_getattr(&resp->fh, &resp->stat);
 }
 static __be32
 nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp)
 {
 	if (err) return err;
-	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt,
-				    resp->fh.fh_dentry,
-				    &resp->stat));
+	return fh_getattr(&resp->fh, &resp->stat);
 }
 /*
  * Get a file's attributes
@@ -150,9 +146,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
 				  &resp->count);
 
 	if (nfserr) return nfserr;
-	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_path.mnt,
-				    resp->fh.fh_dentry,
-				    &resp->stat));
+	return fh_getattr(&resp->fh, &resp->stat);
 }
 
 /*
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index cee62ab9d4a..262df5ccbf5 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -59,8 +59,8 @@ DEFINE_MUTEX(nfsd_mutex);
  * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
  */
 spinlock_t	nfsd_drc_lock;
-unsigned int	nfsd_drc_max_mem;
-unsigned int	nfsd_drc_mem_used;
+unsigned long	nfsd_drc_max_mem;
+unsigned long	nfsd_drc_mem_used;
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 static struct svc_stat	nfsd_acl_svcstats;
@@ -342,7 +342,7 @@ static void set_max_drc(void)
 					>> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
 	nfsd_drc_mem_used = 0;
 	spin_lock_init(&nfsd_drc_lock);
-	dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem);
+	dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem);
 }
 
 static int nfsd_get_default_max_blksize(void)
@@ -652,7 +652,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 
 	/* Check whether we have this call in the cache. */
 	switch (nfsd_cache_lookup(rqstp)) {
-	case RC_INTR:
 	case RC_DROPIT:
 		return 0;
 	case RC_REPLY:
@@ -703,8 +702,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 int nfsd_pool_stats_open(struct inode *inode, struct file *file)
 {
 	int ret;
-	struct net *net = &init_net;
-	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id);
 
 	mutex_lock(&nfsd_mutex);
 	if (nn->nfsd_serv == NULL) {
@@ -721,7 +719,7 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
 int nfsd_pool_stats_release(struct inode *inode, struct file *file)
 {
 	int ret = seq_release(inode, file);
-	struct net *net = &init_net;
+	struct net *net = inode->i_sb->s_fs_info;
 
 	mutex_lock(&nfsd_mutex);
 	/* this function really, really should have been called svc_put() */
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 979b4210697..9c769a47ac5 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -4,6 +4,7 @@
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
 
+#include "vfs.h"
 #include "xdr.h"
 #include "auth.h"
 
@@ -100,12 +101,14 @@ decode_sattr(__be32 *p, struct iattr *iap)
 		iap->ia_mode = tmp;
 	}
 	if ((tmp = ntohl(*p++)) != (u32)-1) {
-		iap->ia_valid |= ATTR_UID;
-		iap->ia_uid = tmp;
+		iap->ia_uid = make_kuid(&init_user_ns, tmp);
+		if (uid_valid(iap->ia_uid))
+			iap->ia_valid |= ATTR_UID;
 	}
 	if ((tmp = ntohl(*p++)) != (u32)-1) {
-		iap->ia_valid |= ATTR_GID;
-		iap->ia_gid = tmp;
+		iap->ia_gid = make_kgid(&init_user_ns, tmp);
+		if (gid_valid(iap->ia_gid))
+			iap->ia_valid |= ATTR_GID;
 	}
 	if ((tmp = ntohl(*p++)) != (u32)-1) {
 		iap->ia_valid |= ATTR_SIZE;
@@ -151,8 +154,8 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 	*p++ = htonl(nfs_ftypes[type >> 12]);
 	*p++ = htonl((u32) stat->mode);
 	*p++ = htonl((u32) stat->nlink);
-	*p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
-	*p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
+	*p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
+	*p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
 
 	if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) {
 		*p++ = htonl(NFS_MAXPATHLEN);
@@ -194,11 +197,9 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 }
 
 /* Helper function for NFSv2 ACL code */
-__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
+__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat)
 {
-	struct kstat stat;
-	vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, &stat);
-	return encode_fattr(rqstp, p, fhp, &stat);
+	return encode_fattr(rqstp, p, fhp, stat);
 }
 
 /*
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index d1c229feed5..274e2a114e0 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -79,6 +79,8 @@ struct nfs4_stid {
 #define NFS4_DELEG_STID 4
 /* For an open stateid kept around *only* to process close replays: */
 #define NFS4_CLOSED_STID 8
+/* For a deleg stateid kept around only to process free_stateid's: */
+#define NFS4_REVOKED_DELEG_STID 16
 	unsigned char sc_type;
 	stateid_t sc_stateid;
 	struct nfs4_client *sc_client;
@@ -152,8 +154,8 @@ struct nfsd4_channel_attrs {
 
 struct nfsd4_cb_sec {
 	u32	flavor; /* (u32)(-1) used to mean "no valid flavor" */
-	u32	uid;
-	u32	gid;
+	kuid_t	uid;
+	kgid_t	gid;
 };
 
 struct nfsd4_create_session {
@@ -194,9 +196,11 @@ struct nfsd4_conn {
 };
 
 struct nfsd4_session {
-	struct kref		se_ref;
+	atomic_t		se_ref;
 	struct list_head	se_hash;	/* hash by sessionid */
 	struct list_head	se_perclnt;
+/* See SESSION4_PERSIST, etc. for standard flags; this is internal-only: */
+#define NFS4_SESSION_DEAD	0x010
 	u32			se_flags;
 	struct nfs4_client	*se_client;
 	struct nfs4_sessionid	se_sessionid;
@@ -236,6 +240,7 @@ struct nfs4_client {
 	struct list_head	cl_openowners;
 	struct idr		cl_stateids;	/* stateid lookup */
 	struct list_head	cl_delegations;
+	struct list_head	cl_revoked;	/* unacknowledged, revoked 4.1 state */
 	struct list_head        cl_lru;         /* tail queue */
 	struct xdr_netobj	cl_name; 	/* id generated by client */
 	nfs4_verifier		cl_verifier; 	/* generated by client */
@@ -286,18 +291,6 @@ struct nfs4_client {
 	struct net		*net;
 };
 
-static inline void
-mark_client_expired(struct nfs4_client *clp)
-{
-	clp->cl_time = 0;
-}
-
-static inline bool
-is_client_expired(struct nfs4_client *clp)
-{
-	return clp->cl_time == 0;
-}
-
 /* struct nfs4_client_reset
  * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
  * upon lease reset, or from upcall to state_daemon (to read in state
@@ -365,7 +358,6 @@ struct nfs4_openowner {
 	struct nfs4_ol_stateid *oo_last_closed_stid;
 	time_t			oo_time; /* time of placement on so_close_lru */
 #define NFS4_OO_CONFIRMED   1
-#define NFS4_OO_PURGE_CLOSE 2
 #define NFS4_OO_NEW         4
 	unsigned char		oo_flags;
 };
@@ -373,7 +365,7 @@ struct nfs4_openowner {
 struct nfs4_lockowner {
 	struct nfs4_stateowner	lo_owner; /* must be first element */
 	struct list_head	lo_owner_ino_hash; /* hash by owner,file */
-	struct list_head        lo_perstateid; /* for lockowners only */
+	struct list_head        lo_perstateid;
 	struct list_head	lo_list; /* for temporary uses */
 };
 
@@ -390,7 +382,7 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
 /* nfs4_file: a file opened by some number of (open) nfs4_stateowners. */
 struct nfs4_file {
 	atomic_t		fi_ref;
-	struct list_head        fi_hash;    /* hash by "struct inode *" */
+	struct hlist_node       fi_hash;    /* hash by "struct inode *" */
 	struct list_head        fi_stateids;
 	struct list_head	fi_delegations;
 	/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
@@ -486,8 +478,7 @@ extern void nfs4_put_delegation(struct nfs4_delegation *dp);
 extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
 							struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
-extern void release_session_client(struct nfsd4_session *);
-extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
+extern void put_client_renew(struct nfs4_client *clp);
 
 /* nfs4recover operations */
 extern int nfsd4_client_tracking_init(struct net *net);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d586117fa94..a6bc8a7423d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -401,8 +401,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 
 	/* Revoke setuid/setgid on chown */
 	if (!S_ISDIR(inode->i_mode) &&
-	    (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
-	     ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) {
+	    (((iap->ia_valid & ATTR_UID) && !uid_eq(iap->ia_uid, inode->i_uid)) ||
+	     ((iap->ia_valid & ATTR_GID) && !gid_eq(iap->ia_gid, inode->i_gid)))) {
 		iap->ia_valid |= ATTR_KILL_PRIV;
 		if (iap->ia_valid & ATTR_MODE) {
 			/* we're setting mode too, just clear the s*id bits */
@@ -979,7 +979,7 @@ static void kill_suid(struct dentry *dentry)
  */
 static int wait_for_concurrent_writes(struct file *file)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	static ino_t last_ino;
 	static dev_t last_dev;
 	int err = 0;
@@ -1013,6 +1013,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	int			host_err;
 	int			stable = *stablep;
 	int			use_wgather;
+	loff_t			pos = offset;
 
 	dentry = file->f_path.dentry;
 	inode = dentry->d_inode;
@@ -1025,7 +1026,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
 	/* Write the data. */
 	oldfs = get_fs(); set_fs(KERNEL_DS);
-	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
+	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos);
 	set_fs(oldfs);
 	if (host_err < 0)
 		goto out_nfserr;
@@ -1070,7 +1071,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (err)
 		return err;
 
-	inode = file->f_path.dentry->d_inode;
+	inode = file_inode(file);
 
 	/* Get readahead parameters */
 	ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
@@ -1205,7 +1206,7 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
 	 * send along the gid on create when it tries to implement
 	 * setgid directories via NFS:
 	 */
-	if (current_fsuid() != 0)
+	if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID))
 		iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
 	if (iap->ia_valid)
 		return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
@@ -1757,10 +1758,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	tdentry = tfhp->fh_dentry;
 	tdir = tdentry->d_inode;
 
-	err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
-	if (ffhp->fh_export != tfhp->fh_export)
-		goto out;
-
 	err = nfserr_perm;
 	if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
 		goto out;
@@ -1801,6 +1798,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	host_err = -EXDEV;
 	if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
 		goto out_dput_new;
+	if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
+		goto out_dput_new;
 
 	host_err = nfsd_break_lease(odentry->d_inode);
 	if (host_err)
@@ -1913,6 +1912,7 @@ struct buffered_dirent {
 };
 
 struct readdir_data {
+	struct dir_context ctx;
 	char		*dirent;
 	size_t		used;
 	int		full;
@@ -1944,27 +1944,29 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
 static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
 				    struct readdir_cd *cdp, loff_t *offsetp)
 {
-	struct readdir_data buf;
 	struct buffered_dirent *de;
 	int host_err;
 	int size;
 	loff_t offset;
+	struct readdir_data buf = {
+		.ctx.actor = nfsd_buffered_filldir,
+		.dirent = (void *)__get_free_page(GFP_KERNEL)
+	};
 
-	buf.dirent = (void *)__get_free_page(GFP_KERNEL);
 	if (!buf.dirent)
 		return nfserrno(-ENOMEM);
 
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file->f_path.dentry->d_inode;
+		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
 		buf.used = 0;
 		buf.full = 0;
 
-		host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf);
+		host_err = iterate_dir(file, &buf.ctx);
 		if (buf.full)
 			host_err = 0;
 
@@ -2150,7 +2152,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	 * with NFSv3.
 	 */
 	if ((acc & NFSD_MAY_OWNER_OVERRIDE) &&
-	    inode->i_uid == current_fsuid())
+	    uid_eq(inode->i_uid, current_fsuid()))
 		return 0;
 
 	/* This assumes  NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 359594c393d..5b5894159f2 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -6,6 +6,7 @@
 #define LINUX_NFSD_VFS_H
 
 #include "nfsfh.h"
+#include "nfsd.h"
 
 /*
  * Flags for nfsd_permission
@@ -125,4 +126,11 @@ static inline void fh_drop_write(struct svc_fh *fh)
 	}
 }
 
+static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat)
+{
+	struct path p = {.mnt = fh->fh_export->ex_path.mnt,
+			 .dentry = fh->fh_dentry};
+	return nfserrno(vfs_getattr(&p, stat));
+}
+
 #endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index 53b1863dd8f..4f0481d6380 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -167,7 +167,7 @@ int nfssvc_encode_entry(void *, const char *name,
 int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);
 
 /* Helper functions for NFSv2 ACL code */
-__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp);
+__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat);
 __be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp);
 
 #endif /* LINUX_NFSD_H */
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 7df980eb056..b6d5542a4ac 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -136,6 +136,7 @@ struct nfsd3_accessres {
 	__be32			status;
 	struct svc_fh		fh;
 	__u32			access;
+	struct kstat		stat;
 };
 
 struct nfsd3_readlinkres {
@@ -225,6 +226,7 @@ struct nfsd3_getaclres {
 	int			mask;
 	struct posix_acl	*acl_access;
 	struct posix_acl	*acl_default;
+	struct kstat		stat;
 };
 
 /* dummy type for release */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 0889bfb43dc..3b271d2092b 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -184,7 +184,6 @@ struct nfsd4_lock {
 #define lk_old_lock_stateid     v.old.lock_stateid
 #define lk_old_lock_seqid       v.old.lock_seqid
 
-#define lk_rflags       u.ok.rflags
 #define lk_resp_stateid u.ok.stateid
 #define lk_denied       u.denied
 
@@ -237,6 +236,7 @@ struct nfsd4_open {
 	u32		op_share_deny;      /* request */
 	u32		op_deleg_want;      /* request */
 	stateid_t	op_stateid;         /* response */
+	__be32		op_xdr_error;       /* see nfsd4_open_omfg() */
 	u32		op_recall;          /* recall */
 	struct nfsd4_change_info  op_cinfo; /* response */
 	u32		op_rflags;          /* response */
@@ -563,7 +563,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
 void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
 void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
 __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
-		       struct dentry *dentry, __be32 *buffer, int *countp,
+		       struct dentry *dentry, __be32 **buffer, int countp,
 		       u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
 extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *,
@@ -623,6 +623,7 @@ extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *, struct nfsd4_test_stateid *test_stateid);
 extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid);
+extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr);
 #endif
 
 /*
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
new file mode 100644
index 00000000000..c5c55dfb91a
--- /dev/null
+++ b/fs/nfsd/xdr4cb.h
@@ -0,0 +1,23 @@
+#define NFS4_MAXTAGLEN		20
+
+#define NFS4_enc_cb_null_sz		0
+#define NFS4_dec_cb_null_sz		0
+#define cb_compound_enc_hdr_sz		4
+#define cb_compound_dec_hdr_sz		(3 + (NFS4_MAXTAGLEN >> 2))
+#define sessionid_sz			(NFS4_MAX_SESSIONID_LEN >> 2)
+#define cb_sequence_enc_sz		(sessionid_sz + 4 +             \
+					1 /* no referring calls list yet */)
+#define cb_sequence_dec_sz		(op_dec_sz + sessionid_sz + 4)
+
+#define op_enc_sz			1
+#define op_dec_sz			2
+#define enc_nfs4_fh_sz			(1 + (NFS4_FHSIZE >> 2))
+#define enc_stateid_sz			(NFS4_STATEID_SIZE >> 2)
+#define NFS4_enc_cb_recall_sz		(cb_compound_enc_hdr_sz +       \
+					cb_sequence_enc_sz +            \
+					1 + enc_stateid_sz +            \
+					enc_nfs4_fh_sz)
+
+#define NFS4_dec_cb_recall_sz		(cb_compound_dec_hdr_sz  +      \
+					cb_sequence_dec_sz +            \
+					op_dec_sz)
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 251da07b2a1..80da8eb2739 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,6 +1,5 @@
 config NILFS2_FS
-	tristate "NILFS2 file system support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	tristate "NILFS2 file system support"
 	select CRC32
 	help
 	  NILFS2 is a log-structured file system (LFS) supporting continuous
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index eed4d7b2624..741fd02e044 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -398,6 +398,69 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
 }
 
 /**
+ * nilfs_palloc_count_desc_blocks - count descriptor blocks number
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: descriptor blocks number [out]
+ */
+static int nilfs_palloc_count_desc_blocks(struct inode *inode,
+					    unsigned long *desc_blocks)
+{
+	unsigned long blknum;
+	int ret;
+
+	ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum);
+	if (likely(!ret))
+		*desc_blocks = DIV_ROUND_UP(
+			blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block);
+	return ret;
+}
+
+/**
+ * nilfs_palloc_mdt_file_can_grow - check potential opportunity for
+ *					MDT file growing
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: known current descriptor blocks count
+ */
+static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode,
+						    unsigned long desc_blocks)
+{
+	return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) <
+			nilfs_palloc_groups_count(inode);
+}
+
+/**
+ * nilfs_palloc_count_max_entries - count max number of entries that can be
+ *					described by descriptor blocks count
+ * @inode: inode of metadata file using this allocator
+ * @nused: current number of used entries
+ * @nmaxp: max number of entries [out]
+ */
+int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp)
+{
+	unsigned long desc_blocks = 0;
+	u64 entries_per_desc_block, nmax;
+	int err;
+
+	err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks);
+	if (unlikely(err))
+		return err;
+
+	entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) *
+				nilfs_palloc_groups_per_desc_block(inode);
+	nmax = entries_per_desc_block * desc_blocks;
+
+	if (nused == nmax &&
+			nilfs_palloc_mdt_file_can_grow(inode, desc_blocks))
+		nmax += entries_per_desc_block;
+
+	if (nused > nmax)
+		return -ERANGE;
+
+	*nmaxp = nmax;
+	return 0;
+}
+
+/**
  * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
  * @inode: inode of metadata file using this allocator
  * @req: nilfs_palloc_req structure exchanged for the allocation
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index fb723810054..4bd6451b570 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -48,6 +48,8 @@ int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
 void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
 				   const struct buffer_head *, void *);
 
+int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *);
+
 /**
  * nilfs_palloc_req - persistent allocator request and reply
  * @pr_entry_nr: entry number (vblocknr or inode number)
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index df1a7fb238d..197a63e9d10 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -256,22 +256,18 @@ static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
 	de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
 }
 
-static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int nilfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = filp->f_dentry->d_inode;
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 /*	unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
-	unsigned char *types = NULL;
-	int ret;
 
 	if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
-		goto success;
-
-	types = nilfs_filetype_table;
+		return 0;
 
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
@@ -281,9 +277,8 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (IS_ERR(page)) {
 			nilfs_error(sb, __func__, "bad page in #%lu",
 				    inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
-			ret = -EIO;
-			goto done;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
+			return -EIO;
 		}
 		kaddr = page_address(page);
 		de = (struct nilfs_dir_entry *)(kaddr + offset);
@@ -293,35 +288,28 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			if (de->rec_len == 0) {
 				nilfs_error(sb, __func__,
 					    "zero-length directory entry");
-				ret = -EIO;
 				nilfs_put_page(page);
-				goto done;
+				return -EIO;
 			}
 			if (de->inode) {
-				int over;
-				unsigned char d_type = DT_UNKNOWN;
+				unsigned char t;
 
-				if (types && de->file_type < NILFS_FT_MAX)
-					d_type = types[de->file_type];
+				if (de->file_type < NILFS_FT_MAX)
+					t = nilfs_filetype_table[de->file_type];
+				else
+					t = DT_UNKNOWN;
 
-				offset = (char *)de - kaddr;
-				over = filldir(dirent, de->name, de->name_len,
-						(n<<PAGE_CACHE_SHIFT) | offset,
-						le64_to_cpu(de->inode), d_type);
-				if (over) {
+				if (!dir_emit(ctx, de->name, de->name_len,
+						le64_to_cpu(de->inode), t)) {
 					nilfs_put_page(page);
-					goto success;
+					return 0;
 				}
 			}
-			filp->f_pos += nilfs_rec_len_from_disk(de->rec_len);
+			ctx->pos += nilfs_rec_len_from_disk(de->rec_len);
 		}
 		nilfs_put_page(page);
 	}
-
-success:
-	ret = 0;
-done:
-	return ret;
+	return 0;
 }
 
 /*
@@ -678,7 +666,7 @@ not_empty:
 const struct file_operations nilfs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= nilfs_readdir,
+	.iterate	= nilfs_readdir,
 	.unlocked_ioctl	= nilfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= nilfs_compat_ioctl,
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 61946883025..08fdb77852a 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -67,7 +67,7 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct inode *inode = file_inode(vma->vm_file);
 	struct nilfs_transaction_info ti;
 	int ret = 0;
 
@@ -126,7 +126,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	nilfs_transaction_commit(inode->i_sb);
 
  mapped:
-	wait_on_page_writeback(page);
+	wait_for_stable_page(page);
  out:
 	sb_end_pagefault(inode->i_sb);
 	return block_page_mkwrite_return(ret);
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index d8e65bde083..6548c7851b4 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -160,6 +160,28 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
 }
 
 /**
+ * nilfs_ifile_count_free_inodes - calculate free inodes count
+ * @ifile: ifile inode
+ * @nmaxinodes: current maximum of available inodes count [out]
+ * @nfreeinodes: free inodes count [out]
+ */
+int nilfs_ifile_count_free_inodes(struct inode *ifile,
+				    u64 *nmaxinodes, u64 *nfreeinodes)
+{
+	u64 nused;
+	int err;
+
+	*nmaxinodes = 0;
+	*nfreeinodes = 0;
+
+	nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count);
+	err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes);
+	if (likely(!err))
+		*nfreeinodes = *nmaxinodes - nused;
+	return err;
+}
+
+/**
  * nilfs_ifile_read - read or get ifile inode
  * @sb: super block instance
  * @root: root object
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 59b6f2b51df..679674d1337 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
 int nilfs_ifile_delete_inode(struct inode *, ino_t);
 int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
 
+int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *);
+
 int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
 		     size_t inode_size, struct nilfs_inode *raw_inode,
 		     struct inode **inodep);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6b49f14eac8..b1a5277cfd1 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -25,7 +25,7 @@
 #include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/writeback.h>
-#include <linux/uio.h>
+#include <linux/aio.h>
 #include "nilfs.h"
 #include "btnode.h"
 #include "segment.h"
@@ -54,7 +54,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n)
 
 	inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
 	if (root)
-		atomic_add(n, &root->blocks_count);
+		atomic64_add(n, &root->blocks_count);
 }
 
 void nilfs_inode_sub_blocks(struct inode *inode, int n)
@@ -63,7 +63,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n)
 
 	inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
 	if (root)
-		atomic_sub(n, &root->blocks_count);
+		atomic64_sub(n, &root->blocks_count);
 }
 
 /**
@@ -175,6 +175,11 @@ static int nilfs_writepages(struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	int err = 0;
 
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		nilfs_clear_dirty_pages(mapping, false);
+		return -EROFS;
+	}
+
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		err = nilfs_construct_dsync_segment(inode->i_sb, inode,
 						    wbc->range_start,
@@ -187,6 +192,18 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
 	struct inode *inode = page->mapping->host;
 	int err;
 
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/*
+		 * It means that filesystem was remounted in read-only
+		 * mode because of error or metadata corruption. But we
+		 * have dirty pages that try to be flushed in background.
+		 * So, here we simply discard this dirty page.
+		 */
+		nilfs_clear_dirty_page(page, false);
+		unlock_page(page);
+		return -EROFS;
+	}
+
 	redirty_page_for_writepage(wbc, page);
 	unlock_page(page);
 
@@ -202,13 +219,32 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
 
 static int nilfs_set_page_dirty(struct page *page)
 {
-	int ret = __set_page_dirty_buffers(page);
+	int ret = __set_page_dirty_nobuffers(page);
 
-	if (ret) {
+	if (page_has_buffers(page)) {
 		struct inode *inode = page->mapping->host;
-		unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
+		unsigned nr_dirty = 0;
+		struct buffer_head *bh, *head;
 
-		nilfs_set_file_dirty(inode, nr_dirty);
+		/*
+		 * This page is locked by callers, and no other thread
+		 * concurrently marks its buffers dirty since they are
+		 * only dirtied through routines in fs/buffer.c in
+		 * which call sites of mark_buffer_dirty are protected
+		 * by page lock.
+		 */
+		bh = head = page_buffers(page);
+		do {
+			/* Do not mark hole blocks dirty */
+			if (buffer_dirty(bh) || !buffer_mapped(bh))
+				continue;
+
+			set_buffer_dirty(bh);
+			nr_dirty++;
+		} while (bh = bh->b_this_page, bh != head);
+
+		if (nr_dirty)
+			nilfs_set_file_dirty(inode, nr_dirty);
 	}
 	return ret;
 }
@@ -333,7 +369,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
 		goto failed_ifile_create_inode;
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
 
-	atomic_inc(&root->inodes_count);
+	atomic64_inc(&root->inodes_count);
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = ino;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -765,7 +801,7 @@ void nilfs_evict_inode(struct inode *inode)
 
 	ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
 	if (!ret)
-		atomic_dec(&ii->i_root->inodes_count);
+		atomic64_dec(&ii->i_root->inodes_count);
 
 	nilfs_clear_inode(inode);
 
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f3859354e41..b44bdb291b8 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -796,7 +796,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
 
 long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	void __user *argp = (void __user *)arg;
 
 	switch (cmd) {
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index f9897d09c69..c4dcd1db57e 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -375,14 +375,25 @@ int nilfs_mdt_fetch_dirty(struct inode *inode)
 static int
 nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 {
-	struct inode *inode;
+	struct inode *inode = page->mapping->host;
 	struct super_block *sb;
 	int err = 0;
 
+	if (inode && (inode->i_sb->s_flags & MS_RDONLY)) {
+		/*
+		 * It means that filesystem was remounted in read-only
+		 * mode because of error or metadata corruption. But we
+		 * have dirty pages that try to be flushed in background.
+		 * So, here we simply discard this dirty page.
+		 */
+		nilfs_clear_dirty_page(page, false);
+		unlock_page(page);
+		return -EROFS;
+	}
+
 	redirty_page_for_writepage(wbc, page);
 	unlock_page(page);
 
-	inode = page->mapping->host;
 	if (!inode)
 		return 0;
 
@@ -561,10 +572,10 @@ void nilfs_mdt_restore_from_shadow_map(struct inode *inode)
 	if (mi->mi_palloc_cache)
 		nilfs_palloc_clear_cache(inode);
 
-	nilfs_clear_dirty_pages(inode->i_mapping);
+	nilfs_clear_dirty_pages(inode->i_mapping, true);
 	nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
 
-	nilfs_clear_dirty_pages(&ii->i_btnode_cache);
+	nilfs_clear_dirty_pages(&ii->i_btnode_cache, true);
 	nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
 
 	nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 1d0c0b84c5a..9de78f08989 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -517,11 +517,11 @@ static int nilfs_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
 
 	if (parent && *lenp < NILFS_FID_SIZE_CONNECTABLE) {
 		*lenp = NILFS_FID_SIZE_CONNECTABLE;
-		return 255;
+		return FILEID_INVALID;
 	}
 	if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE) {
 		*lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
-		return 255;
+		return FILEID_INVALID;
 	}
 
 	fid->cno = root->cno;
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 07f76db04ec..0ba679866e5 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -370,7 +370,12 @@ repeat:
 	goto repeat;
 }
 
-void nilfs_clear_dirty_pages(struct address_space *mapping)
+/**
+ * nilfs_clear_dirty_pages - discard dirty pages in address space
+ * @mapping: address space with dirty pages for discarding
+ * @silent: suppress [true] or print [false] warning messages
+ */
+void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
 {
 	struct pagevec pvec;
 	unsigned int i;
@@ -382,25 +387,9 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
 				  PAGEVEC_SIZE)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
-			struct buffer_head *bh, *head;
 
 			lock_page(page);
-			ClearPageUptodate(page);
-			ClearPageMappedToDisk(page);
-			bh = head = page_buffers(page);
-			do {
-				lock_buffer(bh);
-				clear_buffer_dirty(bh);
-				clear_buffer_nilfs_volatile(bh);
-				clear_buffer_nilfs_checked(bh);
-				clear_buffer_nilfs_redirected(bh);
-				clear_buffer_uptodate(bh);
-				clear_buffer_mapped(bh);
-				unlock_buffer(bh);
-				bh = bh->b_this_page;
-			} while (bh != head);
-
-			__nilfs_clear_page_dirty(page);
+			nilfs_clear_dirty_page(page, silent);
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
@@ -408,6 +397,51 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
 	}
 }
 
+/**
+ * nilfs_clear_dirty_page - discard dirty page
+ * @page: dirty page that will be discarded
+ * @silent: suppress [true] or print [false] warning messages
+ */
+void nilfs_clear_dirty_page(struct page *page, bool silent)
+{
+	struct inode *inode = page->mapping->host;
+	struct super_block *sb = inode->i_sb;
+
+	BUG_ON(!PageLocked(page));
+
+	if (!silent) {
+		nilfs_warning(sb, __func__,
+				"discard page: offset %lld, ino %lu",
+				page_offset(page), inode->i_ino);
+	}
+
+	ClearPageUptodate(page);
+	ClearPageMappedToDisk(page);
+
+	if (page_has_buffers(page)) {
+		struct buffer_head *bh, *head;
+
+		bh = head = page_buffers(page);
+		do {
+			lock_buffer(bh);
+			if (!silent) {
+				nilfs_warning(sb, __func__,
+					"discard block %llu, size %zu",
+					(u64)bh->b_blocknr, bh->b_size);
+			}
+			clear_buffer_dirty(bh);
+			clear_buffer_nilfs_volatile(bh);
+			clear_buffer_nilfs_checked(bh);
+			clear_buffer_nilfs_redirected(bh);
+			clear_buffer_uptodate(bh);
+			clear_buffer_mapped(bh);
+			unlock_buffer(bh);
+		} while (bh = bh->b_this_page, bh != head);
+	}
+
+	__nilfs_clear_page_dirty(page);
+}
+
 unsigned nilfs_page_count_clean_buffers(struct page *page,
 					unsigned from, unsigned to)
 {
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index fb7de71605a..ef30c5c2426 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -55,7 +55,8 @@ void nilfs_page_bug(struct page *);
 
 int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
-void nilfs_clear_dirty_pages(struct address_space *);
+void nilfs_clear_dirty_page(struct page *, bool);
+void nilfs_clear_dirty_pages(struct address_space *, bool);
 void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
 			struct backing_dev_info *bdi);
 unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a5752a58993..bd88a746106 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -835,9 +835,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 	raw_cp->cp_snapshot_list.ssl_next = 0;
 	raw_cp->cp_snapshot_list.ssl_prev = 0;
 	raw_cp->cp_inodes_count =
-		cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
+		cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count));
 	raw_cp->cp_blocks_count =
-		cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
+		cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count));
 	raw_cp->cp_nblk_inc =
 		cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
 	raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 3c991dc84f2..af3ba0478cd 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -554,8 +554,10 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
 	if (err)
 		goto failed_bh;
 
-	atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
-	atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
+	atomic64_set(&root->inodes_count,
+			le64_to_cpu(raw_cp->cp_inodes_count));
+	atomic64_set(&root->blocks_count,
+			le64_to_cpu(raw_cp->cp_blocks_count));
 
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
 
@@ -609,6 +611,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	unsigned long overhead;
 	unsigned long nrsvblocks;
 	sector_t nfreeblocks;
+	u64 nmaxinodes, nfreeinodes;
 	int err;
 
 	/*
@@ -633,14 +636,34 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if (unlikely(err))
 		return err;
 
+	err = nilfs_ifile_count_free_inodes(root->ifile,
+					    &nmaxinodes, &nfreeinodes);
+	if (unlikely(err)) {
+		printk(KERN_WARNING
+			"NILFS warning: fail to count free inodes: err %d.\n",
+			err);
+		if (err == -ERANGE) {
+			/*
+			 * If nilfs_palloc_count_max_entries() returns
+			 * -ERANGE error code then we simply treat
+			 * curent inodes count as maximum possible and
+			 * zero as free inodes value.
+			 */
+			nmaxinodes = atomic64_read(&root->inodes_count);
+			nfreeinodes = 0;
+			err = 0;
+		} else
+			return err;
+	}
+
 	buf->f_type = NILFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = blocks - overhead;
 	buf->f_bfree = nfreeblocks;
 	buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
 		(buf->f_bfree - nrsvblocks) : 0;
-	buf->f_files = atomic_read(&root->inodes_count);
-	buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
+	buf->f_files = nmaxinodes;
+	buf->f_ffree = nfreeinodes;
 	buf->f_namelen = NILFS_NAME_LEN;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
@@ -973,7 +996,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
 
 static int nilfs_tree_was_touched(struct dentry *root_dentry)
 {
-	return root_dentry->d_count > 1;
+	return d_count(root_dentry) > 1;
 }
 
 /**
@@ -1361,6 +1384,7 @@ struct file_system_type nilfs_fs_type = {
 	.kill_sb  = kill_block_super,
 	.fs_flags = FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("nilfs2");
 
 static void nilfs_inode_init_once(void *obj)
 {
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 41e6a04a561..94c451ce6d2 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -764,8 +764,8 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
 	new->ifile = NULL;
 	new->nilfs = nilfs;
 	atomic_set(&new->count, 1);
-	atomic_set(&new->inodes_count, 0);
-	atomic_set(&new->blocks_count, 0);
+	atomic64_set(&new->inodes_count, 0);
+	atomic64_set(&new->blocks_count, 0);
 
 	rb_link_node(&new->rb_node, parent, p);
 	rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index be1267a34ce..de8cc53b4a5 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -241,8 +241,8 @@ struct nilfs_root {
 	struct the_nilfs *nilfs;
 	struct inode *ifile;
 
-	atomic_t inodes_count;
-	atomic_t blocks_count;
+	atomic64_t inodes_count;
+	atomic64_t blocks_count;
 };
 
 /* Special checkpoint number */
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 08b886f119c..1fedd5f7ccc 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -31,7 +31,6 @@ int dir_notify_enable __read_mostly = 1;
 static struct kmem_cache *dnotify_struct_cache __read_mostly;
 static struct kmem_cache *dnotify_mark_cache __read_mostly;
 static struct fsnotify_group *dnotify_group __read_mostly;
-static DEFINE_MUTEX(dnotify_mark_mutex);
 
 /*
  * dnotify will attach one of these to each inode (i_fsnotify_marks) which
@@ -174,7 +173,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 	struct dnotify_struct **prev;
 	struct inode *inode;
 
-	inode = filp->f_path.dentry->d_inode;
+	inode = file_inode(filp);
 	if (!S_ISDIR(inode->i_mode))
 		return;
 
@@ -183,7 +182,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 		return;
 	dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
 
-	mutex_lock(&dnotify_mark_mutex);
+	mutex_lock(&dnotify_group->mark_mutex);
 
 	spin_lock(&fsn_mark->lock);
 	prev = &dn_mark->dn;
@@ -199,11 +198,12 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 
 	spin_unlock(&fsn_mark->lock);
 
-	/* nothing else could have found us thanks to the dnotify_mark_mutex */
+	/* nothing else could have found us thanks to the dnotify_groups
+	   mark_mutex */
 	if (dn_mark->dn == NULL)
-		fsnotify_destroy_mark(fsn_mark, dnotify_group);
+		fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
 
-	mutex_unlock(&dnotify_mark_mutex);
+	mutex_unlock(&dnotify_group->mark_mutex);
 
 	fsnotify_put_mark(fsn_mark);
 }
@@ -296,7 +296,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	}
 
 	/* dnotify only works on directories */
-	inode = filp->f_path.dentry->d_inode;
+	inode = file_inode(filp);
 	if (!S_ISDIR(inode->i_mode)) {
 		error = -ENOTDIR;
 		goto out_err;
@@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	new_dn_mark->dn = NULL;
 
 	/* this is needed to prevent the fcntl/close race described below */
-	mutex_lock(&dnotify_mark_mutex);
+	mutex_lock(&dnotify_group->mark_mutex);
 
 	/* add the new_fsn_mark or find an old one. */
 	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
@@ -334,7 +334,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 		dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
 		spin_lock(&fsn_mark->lock);
 	} else {
-		fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0);
+		fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode,
+					 NULL, 0);
 		spin_lock(&new_fsn_mark->lock);
 		fsn_mark = new_fsn_mark;
 		dn_mark = new_dn_mark;
@@ -348,9 +349,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 
 	/* if (f != filp) means that we lost a race and another task/thread
 	 * actually closed the fd we are still playing with before we grabbed
-	 * the dnotify_mark_mutex and fsn_mark->lock.  Since closing the fd is the
-	 * only time we clean up the marks we need to get our mark off
-	 * the list. */
+	 * the dnotify_groups mark_mutex and fsn_mark->lock.  Since closing the
+	 * fd is the only time we clean up the marks we need to get our mark
+	 * off the list. */
 	if (f != filp) {
 		/* if we added ourselves, shoot ourselves, it's possible that
 		 * the flush actually did shoot this fsn_mark.  That's fine too
@@ -385,9 +386,9 @@ out:
 	spin_unlock(&fsn_mark->lock);
 
 	if (destroy)
-		fsnotify_destroy_mark(fsn_mark, dnotify_group);
+		fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
 
-	mutex_unlock(&dnotify_mark_mutex);
+	mutex_unlock(&dnotify_group->mark_mutex);
 	fsnotify_put_mark(fsn_mark);
 out_err:
 	if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9ff4a5ee6e2..e44cb6427df 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/uaccess.h>
+#include <linux/compat.h>
 
 #include <asm/ioctls.h>
 
@@ -121,6 +122,7 @@ static int fill_event_metadata(struct fsnotify_group *group,
 	metadata->event_len = FAN_EVENT_METADATA_LEN;
 	metadata->metadata_len = FAN_EVENT_METADATA_LEN;
 	metadata->vers = FANOTIFY_METADATA_VERSION;
+	metadata->reserved = 0;
 	metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
 	metadata->pid = pid_vnr(event->tgid);
 	if (unlikely(event->mask & FAN_Q_OVERFLOW))
@@ -398,9 +400,6 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 	wake_up(&group->fanotify_data.access_waitq);
 #endif
 
-	if (file->f_flags & FASYNC)
-		fsnotify_fasync(-1, file, 0);
-
 	/* matches the fanotify_init->fsnotify_alloc_group */
 	fsnotify_destroy_group(group);
 
@@ -466,7 +465,7 @@ static int fanotify_find_path(int dfd, const char __user *filename,
 
 		ret = -ENOTDIR;
 		if ((flags & FAN_MARK_ONLYDIR) &&
-		    !(S_ISDIR(f.file->f_path.dentry->d_inode->i_mode))) {
+		    !(S_ISDIR(file_inode(f.file)->i_mode))) {
 			fdput(f);
 			goto out;
 		}
@@ -525,14 +524,18 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 	__u32 removed;
 	int destroy_mark;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
-	if (!fsn_mark)
+	if (!fsn_mark) {
+		mutex_unlock(&group->mark_mutex);
 		return -ENOENT;
+	}
 
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 						 &destroy_mark);
 	if (destroy_mark)
-		fsnotify_destroy_mark(fsn_mark, group);
+		fsnotify_destroy_mark_locked(fsn_mark, group);
+	mutex_unlock(&group->mark_mutex);
 
 	fsnotify_put_mark(fsn_mark);
 	if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -549,14 +552,19 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 	__u32 removed;
 	int destroy_mark;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_inode_mark(group, inode);
-	if (!fsn_mark)
+	if (!fsn_mark) {
+		mutex_unlock(&group->mark_mutex);
 		return -ENOENT;
+	}
 
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 						 &destroy_mark);
 	if (destroy_mark)
-		fsnotify_destroy_mark(fsn_mark, group);
+		fsnotify_destroy_mark_locked(fsn_mark, group);
+	mutex_unlock(&group->mark_mutex);
+
 	/* matches the fsnotify_find_inode_mark() */
 	fsnotify_put_mark(fsn_mark);
 	if (removed & inode->i_fsnotify_mask)
@@ -592,35 +600,55 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 	return mask & ~oldmask;
 }
 
+static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
+						   struct inode *inode,
+						   struct vfsmount *mnt)
+{
+	struct fsnotify_mark *mark;
+	int ret;
+
+	if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+		return ERR_PTR(-ENOSPC);
+
+	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+	if (!mark)
+		return ERR_PTR(-ENOMEM);
+
+	fsnotify_init_mark(mark, fanotify_free_mark);
+	ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0);
+	if (ret) {
+		fsnotify_put_mark(mark);
+		return ERR_PTR(ret);
+	}
+
+	return mark;
+}
+
+
 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 				      struct vfsmount *mnt, __u32 mask,
 				      unsigned int flags)
 {
 	struct fsnotify_mark *fsn_mark;
 	__u32 added;
-	int ret = 0;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
 	if (!fsn_mark) {
-		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
-			return -ENOSPC;
-
-		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
-		if (!fsn_mark)
-			return -ENOMEM;
-
-		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
-		ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
-		if (ret)
-			goto err;
+		fsn_mark = fanotify_add_new_mark(group, NULL, mnt);
+		if (IS_ERR(fsn_mark)) {
+			mutex_unlock(&group->mark_mutex);
+			return PTR_ERR(fsn_mark);
+		}
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+	mutex_unlock(&group->mark_mutex);
 
 	if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
 		fsnotify_recalc_vfsmount_mask(mnt);
-err:
+
 	fsnotify_put_mark(fsn_mark);
-	return ret;
+	return 0;
 }
 
 static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -629,7 +657,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 {
 	struct fsnotify_mark *fsn_mark;
 	__u32 added;
-	int ret = 0;
 
 	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
 
@@ -643,27 +670,23 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 	    (atomic_read(&inode->i_writecount) > 0))
 		return 0;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark) {
-		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
-			return -ENOSPC;
-
-		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
-		if (!fsn_mark)
-			return -ENOMEM;
-
-		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
-		ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
-		if (ret)
-			goto err;
+		fsn_mark = fanotify_add_new_mark(group, inode, NULL);
+		if (IS_ERR(fsn_mark)) {
+			mutex_unlock(&group->mark_mutex);
+			return PTR_ERR(fsn_mark);
+		}
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+	mutex_unlock(&group->mark_mutex);
 
 	if (added & ~inode->i_fsnotify_mask)
 		fsnotify_recalc_inode_mask(inode);
-err:
+
 	fsnotify_put_mark(fsn_mark);
-	return ret;
+	return 0;
 }
 
 /* fanotify syscalls */
@@ -755,9 +778,9 @@ out_destroy_group:
 	return fd;
 }
 
-SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
-			      __u64 mask, int dfd,
-			      const char  __user * pathname)
+SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
+			      __u64, mask, int, dfd,
+			      const char  __user *, pathname)
 {
 	struct inode *inode = NULL;
 	struct vfsmount *mnt = NULL;
@@ -857,15 +880,20 @@ fput_and_out:
 	return ret;
 }
 
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_fanotify_mark(long fanotify_fd, long flags, __u64 mask,
-				  long dfd, long pathname)
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE6(fanotify_mark,
+				int, fanotify_fd, unsigned int, flags,
+				__u32, mask0, __u32, mask1, int, dfd,
+				const char  __user *, pathname)
 {
-	return SYSC_fanotify_mark((int) fanotify_fd, (unsigned int) flags,
-				  mask, (int) dfd,
-				  (const char  __user *) pathname);
+	return sys_fanotify_mark(fanotify_fd, flags,
+#ifdef __BIG_ENDIAN
+				((__u64)mask1 << 32) | mask0,
+#else
+				((__u64)mask0 << 32) | mask1,
+#endif
+				 dfd, pathname);
 }
-SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
 #endif
 
 /*
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 6baadb5a843..4bb21d67d9b 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -52,7 +52,6 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
 void __fsnotify_update_child_dentry_flags(struct inode *inode)
 {
 	struct dentry *alias;
-	struct hlist_node *p;
 	int watched;
 
 	if (!S_ISDIR(inode->i_mode))
@@ -64,7 +63,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 	spin_lock(&inode->i_lock);
 	/* run all of the dentries associated with this inode.  Since this is a
 	 * directory, there damn well better only be one item on this list */
-	hlist_for_each_entry(alias, p, &inode->i_dentry, d_alias) {
+	hlist_for_each_entry(alias, &inode->i_dentry, d_alias) {
 		struct dentry *child;
 
 		/* run all of the children of the original inode and fix their
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index f31e90fc050..74825be65b7 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -36,12 +36,11 @@
 static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
 {
 	struct fsnotify_mark *mark;
-	struct hlist_node *pos;
 	__u32 new_mask = 0;
 
 	assert_spin_locked(&inode->i_lock);
 
-	hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list)
+	hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list)
 		new_mask |= mark->mask;
 	inode->i_fsnotify_mask = new_mask;
 }
@@ -87,11 +86,11 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
 void fsnotify_clear_marks_by_inode(struct inode *inode)
 {
 	struct fsnotify_mark *mark, *lmark;
-	struct hlist_node *pos, *n;
+	struct hlist_node *n;
 	LIST_HEAD(free_list);
 
 	spin_lock(&inode->i_lock);
-	hlist_for_each_entry_safe(mark, pos, n, &inode->i_fsnotify_marks, i.i_list) {
+	hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, i.i_list) {
 		list_add(&mark->i.free_i_list, &free_list);
 		hlist_del_init_rcu(&mark->i.i_list);
 		fsnotify_get_mark(mark);
@@ -129,11 +128,10 @@ static struct fsnotify_mark *fsnotify_find_inode_mark_locked(
 		struct inode *inode)
 {
 	struct fsnotify_mark *mark;
-	struct hlist_node *pos;
 
 	assert_spin_locked(&inode->i_lock);
 
-	hlist_for_each_entry(mark, pos, &inode->i_fsnotify_marks, i.i_list) {
+	hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) {
 		if (mark->group == group) {
 			fsnotify_get_mark(mark);
 			return mark;
@@ -194,8 +192,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 			    struct fsnotify_group *group, struct inode *inode,
 			    int allow_dups)
 {
-	struct fsnotify_mark *lmark;
-	struct hlist_node *node, *last = NULL;
+	struct fsnotify_mark *lmark, *last = NULL;
 	int ret = 0;
 
 	mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
@@ -214,8 +211,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 	}
 
 	/* should mark be in the middle of the current list? */
-	hlist_for_each_entry(lmark, node, &inode->i_fsnotify_marks, i.i_list) {
-		last = node;
+	hlist_for_each_entry(lmark, &inode->i_fsnotify_marks, i.i_list) {
+		last = lmark;
 
 		if ((lmark->group == group) && !allow_dups) {
 			ret = -EEXIST;
@@ -235,7 +232,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 
 	BUG_ON(last == NULL);
 	/* mark should be the last entry.  last is the current last entry */
-	hlist_add_after_rcu(last, &mark->i.i_list);
+	hlist_add_after_rcu(&last->i.i_list, &mark->i.i_list);
 out:
 	fsnotify_recalc_inode_mask_locked(inode);
 	spin_unlock(&inode->i_lock);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 871569c7d60..4216308b81b 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -197,7 +197,6 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
 {
 	/* ideally the idr is empty and we won't hit the BUG in the callback */
 	idr_for_each(&group->inotify_data.idr, idr_callback, group);
-	idr_remove_all(&group->inotify_data.idr);
 	idr_destroy(&group->inotify_data.idr);
 	atomic_dec(&group->inotify_data.user->inotify_devs);
 	free_uid(group->inotify_data.user);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 228a2c2ad8d..60f954a891a 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -287,9 +287,6 @@ static int inotify_release(struct inode *ignored, struct file *file)
 
 	pr_debug("%s: group=%p\n", __func__, group);
 
-	if (file->f_flags & FASYNC)
-		fsnotify_fasync(-1, file, 0);
-
 	/* free this group, matching get was inotify_init->fsnotify_obtain_group */
 	fsnotify_destroy_group(group);
 
@@ -359,27 +356,23 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
 }
 
 static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock,
-			      int *last_wd,
 			      struct inotify_inode_mark *i_mark)
 {
 	int ret;
 
-	do {
-		if (unlikely(!idr_pre_get(idr, GFP_KERNEL)))
-			return -ENOMEM;
+	idr_preload(GFP_KERNEL);
+	spin_lock(idr_lock);
 
-		spin_lock(idr_lock);
-		ret = idr_get_new_above(idr, i_mark, *last_wd + 1,
-					&i_mark->wd);
+	ret = idr_alloc_cyclic(idr, i_mark, 1, 0, GFP_NOWAIT);
+	if (ret >= 0) {
 		/* we added the mark to the idr, take a reference */
-		if (!ret) {
-			*last_wd = i_mark->wd;
-			fsnotify_get_mark(&i_mark->fsn_mark);
-		}
-		spin_unlock(idr_lock);
-	} while (ret == -EAGAIN);
+		i_mark->wd = ret;
+		fsnotify_get_mark(&i_mark->fsn_mark);
+	}
 
-	return ret;
+	spin_unlock(idr_lock);
+	idr_preload_end();
+	return ret < 0 ? ret : 0;
 }
 
 static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group,
@@ -574,10 +567,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 	int add = (arg & IN_MASK_ADD);
 	int ret;
 
-	/* don't allow invalid bits: we don't want flags set */
 	mask = inotify_arg_to_mask(arg);
-	if (unlikely(!(mask & IN_ALL_EVENTS)))
-		return -EINVAL;
 
 	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark)
@@ -627,10 +617,7 @@ static int inotify_new_watch(struct fsnotify_group *group,
 	struct idr *idr = &group->inotify_data.idr;
 	spinlock_t *idr_lock = &group->inotify_data.idr_lock;
 
-	/* don't allow invalid bits: we don't want flags set */
 	mask = inotify_arg_to_mask(arg);
-	if (unlikely(!(mask & IN_ALL_EVENTS)))
-		return -EINVAL;
 
 	tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
 	if (unlikely(!tmp_i_mark))
@@ -644,13 +631,13 @@ static int inotify_new_watch(struct fsnotify_group *group,
 	if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
 		goto out_err;
 
-	ret = inotify_add_to_idr(idr, idr_lock, &group->inotify_data.last_wd,
-				 tmp_i_mark);
+	ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
 	if (ret)
 		goto out_err;
 
 	/* we are on the idr, now get on the inode */
-	ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0);
+	ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
+				       NULL, 0);
 	if (ret) {
 		/* we failed to get on the inode, get off the idr */
 		inotify_remove_from_idr(group, tmp_i_mark);
@@ -674,19 +661,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
 {
 	int ret = 0;
 
-retry:
+	mutex_lock(&group->mark_mutex);
 	/* try to update and existing watch with the new arg */
 	ret = inotify_update_existing_watch(group, inode, arg);
 	/* no mark present, try to add a new one */
 	if (ret == -ENOENT)
 		ret = inotify_new_watch(group, inode, arg);
-	/*
-	 * inotify_new_watch could race with another thread which did an
-	 * inotify_new_watch between the update_existing and the add watch
-	 * here, go back and try to update an existing mark again.
-	 */
-	if (ret == -EEXIST)
-		goto retry;
+	mutex_unlock(&group->mark_mutex);
 
 	return ret;
 }
@@ -703,7 +684,6 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
 
 	spin_lock_init(&group->inotify_data.idr_lock);
 	idr_init(&group->inotify_data.idr);
-	group->inotify_data.last_wd = 0;
 	group->inotify_data.user = get_current_user();
 
 	if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >
@@ -757,6 +737,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 	int ret;
 	unsigned flags = 0;
 
+	/* don't allow invalid bits: we don't want flags set */
+	if (unlikely(!(mask & ALL_INOTIFY_BITS)))
+		return -EINVAL;
+
 	f = fdget(fd);
 	if (unlikely(!f.file))
 		return -EBADF;
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc6b49bf736..923fe4a5f50 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -20,28 +20,29 @@
  * fsnotify inode mark locking/lifetime/and refcnting
  *
  * REFCNT:
- * The mark->refcnt tells how many "things" in the kernel currently are
- * referencing this object.  The object typically will live inside the kernel
- * with a refcnt of 2, one for each list it is on (i_list, g_list).  Any task
- * which can find this object holding the appropriete locks, can take a reference
- * and the object itself is guaranteed to survive until the reference is dropped.
+ * The group->recnt and mark->refcnt tell how many "things" in the kernel
+ * currently are referencing the objects. Both kind of objects typically will
+ * live inside the kernel with a refcnt of 2, one for its creation and one for
+ * the reference a group and a mark hold to each other.
+ * If you are holding the appropriate locks, you can take a reference and the
+ * object itself is guaranteed to survive until the reference is dropped.
  *
  * LOCKING:
- * There are 3 spinlocks involved with fsnotify inode marks and they MUST
- * be taken in order as follows:
+ * There are 3 locks involved with fsnotify inode marks and they MUST be taken
+ * in order as follows:
  *
+ * group->mark_mutex
  * mark->lock
- * group->mark_lock
  * inode->i_lock
  *
- * mark->lock protects 2 things, mark->group and mark->inode.  You must hold
- * that lock to dereference either of these things (they could be NULL even with
- * the lock)
- *
- * group->mark_lock protects the marks_list anchored inside a given group
- * and each mark is hooked via the g_list.  It also sorta protects the
- * free_g_list, which when used is anchored by a private list on the stack of the
- * task which held the group->mark_lock.
+ * group->mark_mutex protects the marks_list anchored inside a given group and
+ * each mark is hooked via the g_list.  It also protects the groups private
+ * data (i.e group limits).
+
+ * mark->lock protects the marks attributes like its masks and flags.
+ * Furthermore it protects the access to a reference of the group that the mark
+ * is assigned to as well as the access to a reference of the inode/vfsmount
+ * that is being watched by the mark.
  *
  * inode->i_lock protects the i_fsnotify_marks list anchored inside a
  * given inode and each mark is hooked via the i_list. (and sorta the
@@ -64,18 +65,11 @@
  * inode.  We take i_lock and walk the i_fsnotify_marks safely.  For each
  * mark on the list we take a reference (so the mark can't disappear under us).
  * We remove that mark form the inode's list of marks and we add this mark to a
- * private list anchored on the stack using i_free_list;  At this point we no
- * longer fear anything finding the mark using the inode's list of marks.
- *
- * We can safely and locklessly run the private list on the stack of everything
- * we just unattached from the original inode.  For each mark on the private list
- * we grab the mark-> and can thus dereference mark->group and mark->inode.  If
- * we see the group and inode are not NULL we take those locks.  Now holding all
- * 3 locks we can completely remove the mark from other tasks finding it in the
- * future.  Remember, 10 things might already be referencing this mark, but they
- * better be holding a ref.  We drop our reference we took before we unhooked it
- * from the inode.  When the ref hits 0 we can free the mark.
- *
+ * private list anchored on the stack using i_free_list; we walk i_free_list
+ * and before we destroy the mark we make sure that we dont race with a
+ * concurrent destroy_group by getting a ref to the marks group and taking the
+ * groups mutex.
+
  * Very similarly for freeing by group, except we use free_g_list.
  *
  * This has the very interesting property of being able to run concurrently with
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 4df58b8ea64..68ca5a8704b 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -33,12 +33,12 @@
 void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
 {
 	struct fsnotify_mark *mark, *lmark;
-	struct hlist_node *pos, *n;
+	struct hlist_node *n;
 	struct mount *m = real_mount(mnt);
 	LIST_HEAD(free_list);
 
 	spin_lock(&mnt->mnt_root->d_lock);
-	hlist_for_each_entry_safe(mark, pos, n, &m->mnt_fsnotify_marks, m.m_list) {
+	hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, m.m_list) {
 		list_add(&mark->m.free_m_list, &free_list);
 		hlist_del_init_rcu(&mark->m.m_list);
 		fsnotify_get_mark(mark);
@@ -71,12 +71,11 @@ static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt)
 {
 	struct mount *m = real_mount(mnt);
 	struct fsnotify_mark *mark;
-	struct hlist_node *pos;
 	__u32 new_mask = 0;
 
 	assert_spin_locked(&mnt->mnt_root->d_lock);
 
-	hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list)
+	hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list)
 		new_mask |= mark->mask;
 	m->mnt_fsnotify_mask = new_mask;
 }
@@ -114,11 +113,10 @@ static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_
 {
 	struct mount *m = real_mount(mnt);
 	struct fsnotify_mark *mark;
-	struct hlist_node *pos;
 
 	assert_spin_locked(&mnt->mnt_root->d_lock);
 
-	hlist_for_each_entry(mark, pos, &m->mnt_fsnotify_marks, m.m_list) {
+	hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) {
 		if (mark->group == group) {
 			fsnotify_get_mark(mark);
 			return mark;
@@ -153,8 +151,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 			       int allow_dups)
 {
 	struct mount *m = real_mount(mnt);
-	struct fsnotify_mark *lmark;
-	struct hlist_node *node, *last = NULL;
+	struct fsnotify_mark *lmark, *last = NULL;
 	int ret = 0;
 
 	mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
@@ -173,8 +170,8 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 	}
 
 	/* should mark be in the middle of the current list? */
-	hlist_for_each_entry(lmark, node, &m->mnt_fsnotify_marks, m.m_list) {
-		last = node;
+	hlist_for_each_entry(lmark, &m->mnt_fsnotify_marks, m.m_list) {
+		last = lmark;
 
 		if ((lmark->group == group) && !allow_dups) {
 			ret = -EEXIST;
@@ -194,7 +191,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 
 	BUG_ON(last == NULL);
 	/* mark should be the last entry.  last is the current last entry */
-	hlist_add_after_rcu(last, &mark->m.m_list);
+	hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list);
 out:
 	fsnotify_recalc_vfsmount_mask_locked(mnt);
 	spin_unlock(&mnt->mnt_root->d_lock);
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fa9c05f97af..d267ea6aa1a 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1372,7 +1372,7 @@ retry_writepage:
 		 * The page may have dirty, unmapped buffers.  Make them
 		 * freeable here, so the page does not leak.
 		 */
-		block_invalidatepage(page, 0);
+		block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		unlock_page(page);
 		ntfs_debug("Write outside i_size - truncated?");
 		return 0;
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 99e36107ff6..9e38dafa3bc 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1004,13 +1004,11 @@ dir_err_out:
 /**
  * ntfs_filldir - ntfs specific filldir method
  * @vol:	current ntfs volume
- * @fpos:	position in the directory
  * @ndir:	ntfs inode of current directory
  * @ia_page:	page in which the index allocation buffer @ie is in resides
  * @ie:		current index entry
  * @name:	buffer to use for the converted name
- * @dirent:	vfs filldir callback context
- * @filldir:	vfs filldir callback
+ * @actor:	what to feed the entries to
  *
  * Convert the Unicode @name to the loaded NLS and pass it to the @filldir
  * callback.
@@ -1024,12 +1022,12 @@ dir_err_out:
  * retake the lock if we are returning a non-zero value as ntfs_readdir()
  * would need to drop the lock immediately anyway.
  */
-static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
+static inline int ntfs_filldir(ntfs_volume *vol,
 		ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
-		u8 *name, void *dirent, filldir_t filldir)
+		u8 *name, struct dir_context *actor)
 {
 	unsigned long mref;
-	int name_len, rc;
+	int name_len;
 	unsigned dt_type;
 	FILE_NAME_TYPE_FLAGS name_type;
 
@@ -1068,13 +1066,14 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
 	if (ia_page)
 		unlock_page(ia_page);
 	ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
-			"0x%lx, DT_%s.", name, name_len, fpos, mref,
+			"0x%lx, DT_%s.", name, name_len, actor->pos, mref,
 			dt_type == DT_DIR ? "DIR" : "REG");
-	rc = filldir(dirent, name, name_len, fpos, mref, dt_type);
+	if (!dir_emit(actor, name, name_len, mref, dt_type))
+		return 1;
 	/* Relock the page but not if we are aborting ->readdir. */
-	if (!rc && ia_page)
+	if (ia_page)
 		lock_page(ia_page);
-	return rc;
+	return 0;
 }
 
 /*
@@ -1097,11 +1096,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
  *	       removes them again after the write is complete after which it 
  *	       unlocks the page.
  */
-static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int ntfs_readdir(struct file *file, struct dir_context *actor)
 {
 	s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
-	loff_t fpos, i_size;
-	struct inode *bmp_vi, *vdir = filp->f_path.dentry->d_inode;
+	loff_t i_size;
+	struct inode *bmp_vi, *vdir = file_inode(file);
 	struct super_block *sb = vdir->i_sb;
 	ntfs_inode *ndir = NTFS_I(vdir);
 	ntfs_volume *vol = NTFS_SB(sb);
@@ -1116,33 +1115,16 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	u8 *kaddr, *bmp, *index_end;
 	ntfs_attr_search_ctx *ctx;
 
-	fpos = filp->f_pos;
 	ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
-			vdir->i_ino, fpos);
+			vdir->i_ino, actor->pos);
 	rc = err = 0;
 	/* Are we at end of dir yet? */
 	i_size = i_size_read(vdir);
-	if (fpos >= i_size + vol->mft_record_size)
-		goto done;
+	if (actor->pos >= i_size + vol->mft_record_size)
+		return 0;
 	/* Emulate . and .. for all directories. */
-	if (!fpos) {
-		ntfs_debug("Calling filldir for . with len 1, fpos 0x0, "
-				"inode 0x%lx, DT_DIR.", vdir->i_ino);
-		rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR);
-		if (rc)
-			goto done;
-		fpos++;
-	}
-	if (fpos == 1) {
-		ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, "
-				"inode 0x%lx, DT_DIR.",
-				(unsigned long)parent_ino(filp->f_path.dentry));
-		rc = filldir(dirent, "..", 2, fpos,
-				parent_ino(filp->f_path.dentry), DT_DIR);
-		if (rc)
-			goto done;
-		fpos++;
-	}
+	if (!dir_emit_dots(file, actor))
+		return 0;
 	m = NULL;
 	ctx = NULL;
 	/*
@@ -1155,7 +1137,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		goto err_out;
 	}
 	/* Are we jumping straight into the index allocation attribute? */
-	if (fpos >= vol->mft_record_size)
+	if (actor->pos >= vol->mft_record_size)
 		goto skip_index_root;
 	/* Get hold of the mft record for the directory. */
 	m = map_mft_record(ndir);
@@ -1170,7 +1152,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		goto err_out;
 	}
 	/* Get the offset into the index root attribute. */
-	ir_pos = (s64)fpos;
+	ir_pos = (s64)actor->pos;
 	/* Find the index root attribute in the mft record. */
 	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
 			0, ctx);
@@ -1226,10 +1208,9 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (ir_pos > (u8*)ie - (u8*)ir)
 			continue;
 		/* Advance the position even if going to skip the entry. */
-		fpos = (u8*)ie - (u8*)ir;
+		actor->pos = (u8*)ie - (u8*)ir;
 		/* Submit the name to the filldir callback. */
-		rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent,
-				filldir);
+		rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor);
 		if (rc) {
 			kfree(ir);
 			goto abort;
@@ -1242,12 +1223,12 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (!NInoIndexAllocPresent(ndir))
 		goto EOD;
 	/* Advance fpos to the beginning of the index allocation. */
-	fpos = vol->mft_record_size;
+	actor->pos = vol->mft_record_size;
 skip_index_root:
 	kaddr = NULL;
 	prev_ia_pos = -1LL;
 	/* Get the offset into the index allocation attribute. */
-	ia_pos = (s64)fpos - vol->mft_record_size;
+	ia_pos = (s64)actor->pos - vol->mft_record_size;
 	ia_mapping = vdir->i_mapping;
 	ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino);
 	bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
@@ -1409,7 +1390,7 @@ find_next_index_buffer:
 		if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
 			continue;
 		/* Advance the position even if going to skip the entry. */
-		fpos = (u8*)ie - (u8*)ia +
+		actor->pos = (u8*)ie - (u8*)ia +
 				(sle64_to_cpu(ia->index_block_vcn) <<
 				ndir->itype.index.vcn_size_bits) +
 				vol->mft_record_size;
@@ -1419,8 +1400,7 @@ find_next_index_buffer:
 		 * before returning, unless a non-zero value is returned in
 		 * which case the page is left unlocked.
 		 */
-		rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent,
-				filldir);
+		rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor);
 		if (rc) {
 			/* @ia_page is already unlocked in this case. */
 			ntfs_unmap_page(ia_page);
@@ -1439,18 +1419,9 @@ unm_EOD:
 	iput(bmp_vi);
 EOD:
 	/* We are finished, set fpos to EOD. */
-	fpos = i_size + vol->mft_record_size;
+	actor->pos = i_size + vol->mft_record_size;
 abort:
 	kfree(name);
-done:
-#ifdef DEBUG
-	if (!rc)
-		ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos);
-	else
-		ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.",
-				rc, fpos);
-#endif
-	filp->f_pos = fpos;
 	return 0;
 err_out:
 	if (bmp_page) {
@@ -1471,7 +1442,6 @@ iput_err_out:
 	if (!err)
 		err = -EIO;
 	ntfs_debug("Failed. Returning error code %i.", -err);
-	filp->f_pos = fpos;
 	return err;
 }
 
@@ -1571,7 +1541,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
 const struct file_operations ntfs_dir_ops = {
 	.llseek		= generic_file_llseek,	/* Seek inside directory. */
 	.read		= generic_read_dir,	/* Return -EISDIR. */
-	.readdir	= ntfs_readdir,		/* Read directory contents. */
+	.iterate	= ntfs_readdir,		/* Read directory contents. */
 #ifdef NTFS_RW
 	.fsync		= ntfs_dir_fsync,	/* Sync a directory to disk. */
 	/*.aio_fsync	= ,*/			/* Sync all outstanding async
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 5b2d4f0853a..c5670b8d198 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -27,6 +27,7 @@
 #include <linux/swap.h>
 #include <linux/uio.h>
 #include <linux/writeback.h>
+#include <linux/aio.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -2129,7 +2130,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 	BUG_ON(iocb->ki_pos != pos);
 
-	sb_start_write(inode->i_sb);
 	mutex_lock(&inode->i_mutex);
 	ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
@@ -2138,7 +2138,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0)
 			ret = err;
 	}
-	sb_end_write(inode->i_sb);
 	return ret;
 }
 
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index d3e118cc6ff..2778b0255dc 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -28,6 +28,7 @@
 #include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/log2.h>
+#include <linux/aio.h>
 
 #include "aops.h"
 #include "attrib.h"
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 4a8289f8b16..82650d52d91 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3079,6 +3079,7 @@ static struct file_system_type ntfs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("ntfs");
 
 /* Stable names for the slab caches. */
 static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache";
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 260b16281fc..8a404576fb2 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -65,7 +65,20 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
 
 		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
 		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
-		acl->a_entries[n].e_id   = le32_to_cpu(entry->e_id);
+		switch(acl->a_entries[n].e_tag) {
+		case ACL_USER:
+			acl->a_entries[n].e_uid =
+				make_kuid(&init_user_ns,
+					  le32_to_cpu(entry->e_id));
+			break;
+		case ACL_GROUP:
+			acl->a_entries[n].e_gid =
+				make_kgid(&init_user_ns,
+					  le32_to_cpu(entry->e_id));
+			break;
+		default:
+			break;
+		}
 		value += sizeof(struct posix_acl_entry);
 
 	}
@@ -91,7 +104,21 @@ static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
 	for (n = 0; n < acl->a_count; n++, entry++) {
 		entry->e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
 		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
-		entry->e_id   = cpu_to_le32(acl->a_entries[n].e_id);
+		switch(acl->a_entries[n].e_tag) {
+		case ACL_USER:
+			entry->e_id = cpu_to_le32(
+				from_kuid(&init_user_ns,
+					  acl->a_entries[n].e_uid));
+			break;
+		case ACL_GROUP:
+			entry->e_id = cpu_to_le32(
+				from_kgid(&init_user_ns,
+					  acl->a_entries[n].e_gid));
+			break;
+		default:
+			entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+			break;
+		}
 	}
 	return ocfs2_acl;
 }
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 31b9463fba1..17e6bdde96c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5655,7 +5655,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 					       &ref_tree, NULL);
 		if (ret) {
 			mlog_errno(ret);
-			goto out;
+			goto bail;
 		}
 
 		ret = ocfs2_prepare_refcount_change_for_del(inode,
@@ -5666,7 +5666,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 							    &extra_blocks);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out;
+			goto bail;
 		}
 	}
 
@@ -5674,7 +5674,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 						 extra_blocks);
 	if (ret) {
 		mlog_errno(ret);
-		return ret;
+		goto bail;
 	}
 
 	mutex_lock(&tl_inode->i_mutex);
@@ -5734,7 +5734,7 @@ out_commit:
 	ocfs2_commit_trans(osb, handle);
 out:
 	mutex_unlock(&tl_inode->i_mutex);
-
+bail:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
@@ -6751,8 +6751,7 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
 		mlog_errno(ret);
 
 out:
-	if (pages)
-		kfree(pages);
+	kfree(pages);
 
 	return ret;
 }
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 657743254eb..79736a28d84 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -569,7 +569,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 			     int ret,
 			     bool is_async)
 {
-	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(iocb->ki_filp);
 	int level;
 	wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
 
@@ -593,9 +593,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 	level = ocfs2_iocb_rw_locked_level(iocb);
 	ocfs2_rw_unlock(inode, level);
 
+	inode_dio_done(inode);
 	if (is_async)
 		aio_complete(iocb, ret, 0);
-	inode_dio_done(inode);
 }
 
 /*
@@ -603,11 +603,12 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
  * from ext3.  PageChecked() bits have been removed as OCFS2 does not
  * do journalled data.
  */
-static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
+static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
 
-	jbd2_journal_invalidatepage(journal, page, offset);
+	jbd2_journal_invalidatepage(journal, page, offset, length);
 }
 
 static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -626,7 +627,7 @@ static ssize_t ocfs2_direct_IO(int rw,
 			       unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
+	struct inode *inode = file_inode(file)->i_mapping->host;
 
 	/*
 	 * Fallback to buffered I/O if we see an inode without
@@ -1194,6 +1195,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
 				goto out;
 			}
 		}
+		wait_for_stable_page(wc->w_pages[i]);
 
 		if (index == target_index)
 			wc->w_target_page = wc->w_pages[i];
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index ffb2da370a9..f671e49beb3 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,6 +22,8 @@
 #ifndef OCFS2_AOPS_H
 #define OCFS2_AOPS_H
 
+#include <linux/aio.h>
+
 handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 							 struct page *page,
 							 unsigned from,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f7c648d7d6b..5c1c864e81c 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -176,7 +176,7 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
 	}
 }
 
-static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode)
+static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode)
 {
 	int ret = -1;
 
@@ -500,7 +500,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
 	}
 
 	atomic_inc(&write_wc->wc_num_reqs);
-	submit_bio(WRITE, bio);
+	submit_bio(WRITE_SYNC, bio);
 
 	status = 0;
 bail:
@@ -1471,8 +1471,7 @@ static void o2hb_region_release(struct config_item *item)
 
 	mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
 
-	if (reg->hr_tmp_block)
-		kfree(reg->hr_tmp_block);
+	kfree(reg->hr_tmp_block);
 
 	if (reg->hr_slot_data) {
 		for (i = 0; i < reg->hr_num_pages; i++) {
@@ -1486,8 +1485,7 @@ static void o2hb_region_release(struct config_item *item)
 	if (reg->hr_bdev)
 		blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
 
-	if (reg->hr_slots)
-		kfree(reg->hr_slots);
+	kfree(reg->hr_slots);
 
 	kfree(reg->hr_db_regnum);
 	kfree(reg->hr_db_livenodes);
@@ -2273,7 +2271,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
 		if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
 			continue;
 
-		ret = o2hb_global_hearbeat_mode_set(i);
+		ret = o2hb_global_heartbeat_mode_set(i);
 		if (!ret)
 			printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
 			       o2hb_heartbeat_mode_desc[i]);
@@ -2306,7 +2304,7 @@ static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
 	NULL,
 };
 
-static struct configfs_item_operations o2hb_hearbeat_group_item_ops = {
+static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
 	.show_attribute		= o2hb_heartbeat_group_show,
 	.store_attribute	= o2hb_heartbeat_group_store,
 };
@@ -2318,7 +2316,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
 
 static struct config_item_type o2hb_heartbeat_group_type = {
 	.ct_group_ops	= &o2hb_heartbeat_group_group_ops,
-	.ct_item_ops	= &o2hb_hearbeat_group_item_ops,
+	.ct_item_ops	= &o2hb_heartbeat_group_item_ops,
 	.ct_attrs	= o2hb_heartbeat_group_attrs,
 	.ct_owner	= THIS_MODULE,
 };
@@ -2391,6 +2389,9 @@ static int o2hb_region_pin(const char *region_uuid)
 	assert_spin_locked(&o2hb_live_lock);
 
 	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		if (reg->hr_item_dropped)
+			continue;
+
 		uuid = config_item_name(&reg->hr_item);
 
 		/* local heartbeat */
@@ -2441,6 +2442,9 @@ static void o2hb_region_unpin(const char *region_uuid)
 	assert_spin_locked(&o2hb_live_lock);
 
 	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		if (reg->hr_item_dropped)
+			continue;
+
 		uuid = config_item_name(&reg->hr_item);
 		if (region_uuid) {
 			if (strcmp(region_uuid, uuid))
@@ -2656,6 +2660,9 @@ int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
 
 	p = region_uuids;
 	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		if (reg->hr_item_dropped)
+			continue;
+
 		mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
 		if (numregs < max_regions) {
 			memcpy(p, config_item_name(&reg->hr_item),
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index c19897d0fe1..1ec141e758d 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -264,7 +264,7 @@ void o2quo_hb_still_up(u8 node)
 /* This is analogous to hb_up.  as a node's connection comes up we delay the
  * quorum decision until we see it heartbeating.  the hold will be droped in
  * hb_up or hb_down.  it might be perpetuated by con_err until hb_down.  if
- * it's already heartbeating we we might be dropping a hold that conn_up got.
+ * it's already heartbeating we might be dropping a hold that conn_up got.
  * */
 void o2quo_conn_up(u8 node)
 {
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1bfe8802cc1..d644dc61142 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -304,28 +304,22 @@ static u8 o2net_num_from_nn(struct o2net_node *nn)
 
 static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw)
 {
-	int ret = 0;
-
-	do {
-		if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) {
-			ret = -EAGAIN;
-			break;
-		}
-		spin_lock(&nn->nn_lock);
-		ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id);
-		if (ret == 0)
-			list_add_tail(&nsw->ns_node_item,
-				      &nn->nn_status_list);
-		spin_unlock(&nn->nn_lock);
-	} while (ret == -EAGAIN);
+	int ret;
 
-	if (ret == 0)  {
-		init_waitqueue_head(&nsw->ns_wq);
-		nsw->ns_sys_status = O2NET_ERR_NONE;
-		nsw->ns_status = 0;
+	spin_lock(&nn->nn_lock);
+	ret = idr_alloc(&nn->nn_status_idr, nsw, 0, 0, GFP_ATOMIC);
+	if (ret >= 0) {
+		nsw->ns_id = ret;
+		list_add_tail(&nsw->ns_node_item, &nn->nn_status_list);
 	}
+	spin_unlock(&nn->nn_lock);
+	if (ret < 0)
+		return ret;
 
-	return ret;
+	init_waitqueue_head(&nsw->ns_wq);
+	nsw->ns_sys_status = O2NET_ERR_NONE;
+	nsw->ns_status = 0;
+	return 0;
 }
 
 static void o2net_complete_nsw_locked(struct o2net_node *nn,
@@ -412,6 +406,9 @@ static void sc_kref_release(struct kref *kref)
 	sc->sc_node = NULL;
 
 	o2net_debug_del_sc(sc);
+
+	if (sc->sc_page)
+		__free_page(sc->sc_page);
 	kfree(sc);
 }
 
@@ -636,19 +633,19 @@ static void o2net_state_change(struct sock *sk)
 	state_change = sc->sc_state_change;
 
 	switch(sk->sk_state) {
-		/* ignore connecting sockets as they make progress */
-		case TCP_SYN_SENT:
-		case TCP_SYN_RECV:
-			break;
-		case TCP_ESTABLISHED:
-			o2net_sc_queue_work(sc, &sc->sc_connect_work);
-			break;
-		default:
-			printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
-			      " shutdown, state %d\n",
-			      SC_NODEF_ARGS(sc), sk->sk_state);
-			o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
-			break;
+	/* ignore connecting sockets as they make progress */
+	case TCP_SYN_SENT:
+	case TCP_SYN_RECV:
+		break;
+	case TCP_ESTABLISHED:
+		o2net_sc_queue_work(sc, &sc->sc_connect_work);
+		break;
+	default:
+		printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
+			" shutdown, state %d\n",
+			SC_NODEF_ARGS(sc), sk->sk_state);
+		o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+		break;
 	}
 out:
 	read_unlock(&sk->sk_callback_lock);
@@ -870,7 +867,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
 		/* we've had some trouble with handlers seemingly vanishing. */
 		mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p,
 							  &parent) == NULL,
-			        "couldn't find handler we *just* registerd "
+			        "couldn't find handler we *just* registered "
 				"for type %u key %08x\n", msg_type, key);
 	}
 	write_unlock(&o2net_handler_lock);
@@ -1165,10 +1162,8 @@ out:
 	o2net_debug_del_nst(&nst); /* must be before dropping sc and node */
 	if (sc)
 		sc_put(sc);
-	if (vec)
-		kfree(vec);
-	if (msg)
-		kfree(msg);
+	kfree(vec);
+	kfree(msg);
 	o2net_complete_nsw(nn, &nsw, 0, 0, 0);
 	return ret;
 }
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 8db4b58b2e4..ef999729e27 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -169,11 +169,10 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
 				      u64 parent_blkno,
 				      int skip_unhashed)
 {
-	struct hlist_node *p;
 	struct dentry *dentry;
 
 	spin_lock(&inode->i_lock);
-	hlist_for_each_entry(dentry, p, &inode->i_dentry, d_alias) {
+	hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
 		spin_lock(&dentry->d_lock);
 		if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
 			trace_ocfs2_find_local_alias(dentry->d_name.len,
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8fe4e2892ab..eb760d8acd5 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -67,7 +67,6 @@
 #define NAMEI_RA_CHUNKS  2
 #define NAMEI_RA_BLOCKS  4
 #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
 
 static unsigned char ocfs2_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
@@ -1762,11 +1761,10 @@ bail:
 
 static int ocfs2_dir_foreach_blk_id(struct inode *inode,
 				    u64 *f_version,
-				    loff_t *f_pos, void *priv,
-				    filldir_t filldir, int *filldir_err)
+				    struct dir_context *ctx)
 {
-	int ret, i, filldir_ret;
-	unsigned long offset = *f_pos;
+	int ret, i;
+	unsigned long offset = ctx->pos;
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
@@ -1782,8 +1780,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
 	di = (struct ocfs2_dinode *)di_bh->b_data;
 	data = &di->id2.i_data;
 
-	while (*f_pos < i_size_read(inode)) {
-revalidate:
+	while (ctx->pos < i_size_read(inode)) {
 		/* If the dir block has changed since the last call to
 		 * readdir(2), then we might be pointing to an invalid
 		 * dirent right now.  Scan from the start of the block
@@ -1803,50 +1800,31 @@ revalidate:
 					break;
 				i += le16_to_cpu(de->rec_len);
 			}
-			*f_pos = offset = i;
+			ctx->pos = offset = i;
 			*f_version = inode->i_version;
 		}
 
-		de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos);
-		if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) {
+		de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos);
+		if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) {
 			/* On error, skip the f_pos to the end. */
-			*f_pos = i_size_read(inode);
-			goto out;
+			ctx->pos = i_size_read(inode);
+			break;
 		}
 		offset += le16_to_cpu(de->rec_len);
 		if (le64_to_cpu(de->inode)) {
-			/* We might block in the next section
-			 * if the data destination is
-			 * currently swapped out.  So, use a
-			 * version stamp to detect whether or
-			 * not the directory has been modified
-			 * during the copy operation.
-			 */
-			u64 version = *f_version;
 			unsigned char d_type = DT_UNKNOWN;
 
 			if (de->file_type < OCFS2_FT_MAX)
 				d_type = ocfs2_filetype_table[de->file_type];
 
-			filldir_ret = filldir(priv, de->name,
-					      de->name_len,
-					      *f_pos,
-					      le64_to_cpu(de->inode),
-					      d_type);
-			if (filldir_ret) {
-				if (filldir_err)
-					*filldir_err = filldir_ret;
-				break;
-			}
-			if (version != *f_version)
-				goto revalidate;
+			if (!dir_emit(ctx, de->name, de->name_len,
+				      le64_to_cpu(de->inode), d_type))
+				goto out;
 		}
-		*f_pos += le16_to_cpu(de->rec_len);
+		ctx->pos += le16_to_cpu(de->rec_len);
 	}
-
 out:
 	brelse(di_bh);
-
 	return 0;
 }
 
@@ -1856,27 +1834,26 @@ out:
  */
 static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 				    u64 *f_version,
-				    loff_t *f_pos, void *priv,
-				    filldir_t filldir, int *filldir_err)
+				    struct dir_context *ctx,
+				    bool persist)
 {
-	int error = 0;
 	unsigned long offset, blk, last_ra_blk = 0;
-	int i, stored;
+	int i;
 	struct buffer_head * bh, * tmp;
 	struct ocfs2_dir_entry * de;
 	struct super_block * sb = inode->i_sb;
 	unsigned int ra_sectors = 16;
+	int stored = 0;
 
-	stored = 0;
 	bh = NULL;
 
-	offset = (*f_pos) & (sb->s_blocksize - 1);
+	offset = ctx->pos & (sb->s_blocksize - 1);
 
-	while (!error && !stored && *f_pos < i_size_read(inode)) {
-		blk = (*f_pos) >> sb->s_blocksize_bits;
+	while (ctx->pos < i_size_read(inode)) {
+		blk = ctx->pos >> sb->s_blocksize_bits;
 		if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
 			/* Skip the corrupt dirblock and keep trying */
-			*f_pos += sb->s_blocksize - offset;
+			ctx->pos += sb->s_blocksize - offset;
 			continue;
 		}
 
@@ -1898,7 +1875,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 			ra_sectors = 8;
 		}
 
-revalidate:
 		/* If the dir block has changed since the last call to
 		 * readdir(2), then we might be pointing to an invalid
 		 * dirent right now.  Scan from the start of the block
@@ -1918,93 +1894,64 @@ revalidate:
 				i += le16_to_cpu(de->rec_len);
 			}
 			offset = i;
-			*f_pos = ((*f_pos) & ~(sb->s_blocksize - 1))
+			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
 				| offset;
 			*f_version = inode->i_version;
 		}
 
-		while (!error && *f_pos < i_size_read(inode)
+		while (ctx->pos < i_size_read(inode)
 		       && offset < sb->s_blocksize) {
 			de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
 			if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
 				/* On error, skip the f_pos to the
 				   next block. */
-				*f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1;
+				ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
 				brelse(bh);
-				goto out;
+				continue;
 			}
-			offset += le16_to_cpu(de->rec_len);
 			if (le64_to_cpu(de->inode)) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation.
-				 */
-				unsigned long version = *f_version;
 				unsigned char d_type = DT_UNKNOWN;
 
 				if (de->file_type < OCFS2_FT_MAX)
 					d_type = ocfs2_filetype_table[de->file_type];
-				error = filldir(priv, de->name,
+				if (!dir_emit(ctx, de->name,
 						de->name_len,
-						*f_pos,
 						le64_to_cpu(de->inode),
-						d_type);
-				if (error) {
-					if (filldir_err)
-						*filldir_err = error;
-					break;
+						d_type)) {
+					brelse(bh);
+					return 0;
 				}
-				if (version != *f_version)
-					goto revalidate;
-				stored ++;
+				stored++;
 			}
-			*f_pos += le16_to_cpu(de->rec_len);
+			offset += le16_to_cpu(de->rec_len);
+			ctx->pos += le16_to_cpu(de->rec_len);
 		}
 		offset = 0;
 		brelse(bh);
 		bh = NULL;
+		if (!persist && stored)
+			break;
 	}
-
-	stored = 0;
-out:
-	return stored;
+	return 0;
 }
 
 static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
-				 loff_t *f_pos, void *priv, filldir_t filldir,
-				 int *filldir_err)
+				 struct dir_context *ctx,
+				 bool persist)
 {
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-		return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv,
-						filldir, filldir_err);
-
-	return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir,
-					filldir_err);
+		return ocfs2_dir_foreach_blk_id(inode, f_version, ctx);
+	return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist);
 }
 
 /*
  * This is intended to be called from inside other kernel functions,
  * so we fake some arguments.
  */
-int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
-		      filldir_t filldir)
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
 {
-	int ret = 0, filldir_err = 0;
 	u64 version = inode->i_version;
-
-	while (*f_pos < i_size_read(inode)) {
-		ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv,
-					    filldir, &filldir_err);
-		if (ret || filldir_err)
-			break;
-	}
-
-	if (ret > 0)
-		ret = -EIO;
-
+	ocfs2_dir_foreach_blk(inode, &version, ctx, true);
 	return 0;
 }
 
@@ -2012,15 +1959,15 @@ int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
  * ocfs2_readdir()
  *
  */
-int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int ocfs2_readdir(struct file *file, struct dir_context *ctx)
 {
 	int error = 0;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	int lock_level = 0;
 
 	trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+	error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
 	if (lock_level && error >= 0) {
 		/* We release EX lock which used to update atime
 		 * and get PR lock again to reduce contention
@@ -2036,8 +1983,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
 		goto bail_nolock;
 	}
 
-	error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
-				      dirent, filldir, NULL);
+	error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
 
 	ocfs2_inode_unlock(inode, lock_level);
 	if (error)
@@ -2121,6 +2067,7 @@ bail:
 }
 
 struct ocfs2_empty_dir_priv {
+	struct dir_context ctx;
 	unsigned seen_dot;
 	unsigned seen_dot_dot;
 	unsigned seen_other;
@@ -2205,8 +2152,9 @@ out:
 int ocfs2_empty_dir(struct inode *inode)
 {
 	int ret;
-	loff_t start = 0;
-	struct ocfs2_empty_dir_priv priv;
+	struct ocfs2_empty_dir_priv priv = {
+		.ctx.actor = ocfs2_empty_dir_filldir
+	};
 
 	memset(&priv, 0, sizeof(priv));
 
@@ -2220,7 +2168,7 @@ int ocfs2_empty_dir(struct inode *inode)
 		 */
 	}
 
-	ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
+	ret = ocfs2_dir_foreach(inode, &priv.ctx);
 	if (ret)
 		mlog_errno(ret);
 
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index e683f3deb64..f0344b75b14 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -92,9 +92,8 @@ int ocfs2_find_files_on_disk(const char *name,
 			     struct ocfs2_dir_lookup_result *res);
 int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
 			       int namelen, u64 *blkno);
-int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
-int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
-		      filldir_t filldir);
+int ocfs2_readdir(struct file *file, struct dir_context *ctx);
+int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx);
 int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 struct inode *dir,
 				 struct buffer_head *parent_fe_bh,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 9e89d70df33..dbb17c07656 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -319,9 +319,7 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 	if (dlm->master_hash)
 		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
 
-	if (dlm->name)
-		kfree(dlm->name);
-
+	kfree(dlm->name);
 	kfree(dlm);
 }
 
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 975810b9849..47e67c2d228 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -178,6 +178,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
 				     lock->ml.node);
 			}
 		} else {
+			status = DLM_NORMAL;
 			dlm_lock_get(lock);
 			list_add_tail(&lock->list, &res->blocked);
 			kick_thread = 1;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 005261c333b..33ecbe0e673 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2020,7 +2020,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
 			       int ignore_higher, u8 request_from, u32 flags)
 {
 	struct dlm_work_item *item;
-	item = kzalloc(sizeof(*item), GFP_NOFS);
+	item = kzalloc(sizeof(*item), GFP_ATOMIC);
 	if (!item)
 		return -ENOMEM;
 
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 01ebfd0bdad..773bd32bfd8 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -55,9 +55,6 @@
 static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
 
 static int dlm_recovery_thread(void *data);
-void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
-int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
-void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
 static int dlm_do_recovery(struct dlm_ctxt *dlm);
 
 static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
@@ -789,7 +786,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
 				 u8 dead_node)
 {
 	struct dlm_lock_request lr;
-	enum dlm_status ret;
+	int ret;
 
 	mlog(0, "\n");
 
@@ -802,7 +799,6 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
 	lr.dead_node = dead_node;
 
 	// send message
-	ret = DLM_NOLOCKMGR;
 	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
 				 &lr, sizeof(lr), request_from, NULL);
 
@@ -1408,6 +1404,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
 				     mres->lockname_len, mres->lockname);
 				ret = -EFAULT;
 				spin_unlock(&res->spinlock);
+				dlm_lockres_put(res);
 				goto leave;
 			}
 			res->state |= DLM_LOCK_RES_MIGRATING;
@@ -1498,10 +1495,8 @@ leave:
 
 	dlm_put(dlm);
 	if (ret < 0) {
-		if (buf)
-			kfree(buf);
-		if (item)
-			kfree(item);
+		kfree(buf);
+		kfree(item);
 		mlog_errno(ret);
 	}
 
@@ -2083,7 +2078,6 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 					      u8 dead_node, u8 new_master)
 {
 	int i;
-	struct hlist_node *hash_iter;
 	struct hlist_head *bucket;
 	struct dlm_lock_resource *res, *next;
 
@@ -2114,7 +2108,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 	 * if necessary */
 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
 		bucket = dlm_lockres_hash(dlm, i);
-		hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
+		hlist_for_each_entry(res, bucket, hash_node) {
 			if (!(res->state & DLM_LOCK_RES_RECOVERING))
 				continue;
 
@@ -2273,7 +2267,6 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 
 static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 {
-	struct hlist_node *iter;
 	struct dlm_lock_resource *res;
 	int i;
 	struct hlist_head *bucket;
@@ -2299,7 +2292,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 	 */
 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
 		bucket = dlm_lockres_hash(dlm, i);
-		hlist_for_each_entry(res, iter, bucket, hash_node) {
+		hlist_for_each_entry(res, bucket, hash_node) {
  			/* always prune any $RECOVERY entries for dead nodes,
  			 * otherwise hangs can occur during later recovery */
 			if (dlm_is_recovery_lock(res->lockname.name,
@@ -2699,6 +2692,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
 		     dlm->name, br->node_idx, br->dead_node,
 		     dlm->reco.dead_node, dlm->reco.new_master);
 		spin_unlock(&dlm->spinlock);
+		dlm_put(dlm);
 		return -EAGAIN;
 	}
 	spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 16b712d260d..12bafb7265c 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -224,7 +224,7 @@ static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
 static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
 {
 	int event = 0;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct dlmfs_inode_private *ip = DLMFS_I(inode);
 
 	poll_wait(file, &ip->ip_lockres.l_event, wait);
@@ -245,7 +245,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
 	int bytes_left;
 	ssize_t readlen, got;
 	char *lvb_buf;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 
 	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
 		inode->i_ino, count, *ppos);
@@ -293,7 +293,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
 	int bytes_left;
 	ssize_t writelen;
 	char *lvb_buf;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 
 	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
 		inode->i_ino, count, *ppos);
@@ -640,6 +640,7 @@ static struct file_system_type dlmfs_fs_type = {
 	.mount		= dlmfs_mount,
 	.kill_sb	= kill_litter_super,
 };
+MODULE_ALIAS_FS("ocfs2_dlmfs");
 
 static int __init init_dlmfs_fs(void)
 {
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 4f7795fb5fc..3a44a648dae 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2045,8 +2045,8 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
 	lvb->lvb_version   = OCFS2_LVB_VERSION;
 	lvb->lvb_isize	   = cpu_to_be64(i_size_read(inode));
 	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
-	lvb->lvb_iuid      = cpu_to_be32(inode->i_uid);
-	lvb->lvb_igid      = cpu_to_be32(inode->i_gid);
+	lvb->lvb_iuid      = cpu_to_be32(i_uid_read(inode));
+	lvb->lvb_igid      = cpu_to_be32(i_gid_read(inode));
 	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);
 	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);
 	lvb->lvb_iatime_packed  =
@@ -2095,8 +2095,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 	else
 		inode->i_blocks = ocfs2_inode_sector_count(inode);
 
-	inode->i_uid     = be32_to_cpu(lvb->lvb_iuid);
-	inode->i_gid     = be32_to_cpu(lvb->lvb_igid);
+	i_uid_write(inode, be32_to_cpu(lvb->lvb_iuid));
+	i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
 	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);
 	set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
 	ocfs2_unpack_timespec(&inode->i_atime,
@@ -2322,7 +2322,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
 	status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
 				      arg_flags, subclass, _RET_IP_);
 	if (status < 0) {
-		if (status != -EAGAIN && status != -EIOCBRETRY)
+		if (status != -EAGAIN)
 			mlog_errno(status);
 		goto bail;
 	}
@@ -2545,6 +2545,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
 	 * everything is up to the caller :) */
 	status = ocfs2_should_refresh_lock_res(lockres);
 	if (status < 0) {
+		ocfs2_cluster_unlock(osb, lockres, level);
 		mlog_errno(status);
 		goto bail;
 	}
@@ -2553,8 +2554,10 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
 
 		ocfs2_complete_lock_res_refresh(lockres, status);
 
-		if (status < 0)
+		if (status < 0) {
+			ocfs2_cluster_unlock(osb, lockres, level);
 			mlog_errno(status);
+		}
 		ocfs2_track_lock_refresh(lockres);
 	}
 bail:
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 322216a5f0d..29651167190 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -195,11 +195,11 @@ static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len,
 
 	if (parent && (len < 6)) {
 		*max_len = 6;
-		type = 255;
+		type = FILEID_INVALID;
 		goto bail;
 	} else if (len < 3) {
 		*max_len = 3;
-		type = 255;
+		type = FILEID_INVALID;
 		goto bail;
 	}
 
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index f487aa34344..2487116d0d3 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -282,8 +282,7 @@ search:
 	spin_unlock(&oi->ip_lock);
 
 out:
-	if (new_emi)
-		kfree(new_emi);
+	kfree(new_emi);
 }
 
 static int ocfs2_last_eb_is_empty(struct inode *inode,
@@ -791,7 +790,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 						 &hole_size, &rec, &is_last);
 		if (ret) {
 			mlog_errno(ret);
-			goto out;
+			goto out_unlock;
 		}
 
 		if (rec.e_blkno == 0ULL) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 37d313ede15..41000f223ca 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1116,7 +1116,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
 			    dentry->d_name.len, dentry->d_name.name,
 			    attr->ia_valid, attr->ia_mode,
-			    attr->ia_uid, attr->ia_gid);
+			    from_kuid(&init_user_ns, attr->ia_uid),
+			    from_kgid(&init_user_ns, attr->ia_gid));
 
 	/* ensuring we don't even attempt to truncate a symlink */
 	if (S_ISLNK(inode->i_mode))
@@ -1174,14 +1175,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 	}
 
-	if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
-	    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+	if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+	    (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
 		/*
 		 * Gather pointers to quota structures so that allocation /
 		 * freeing of quota structures happens here and not inside
 		 * dquot_transfer() where we have problems with lock ordering
 		 */
-		if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
+		if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
 		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
 		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
 			transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
@@ -1190,7 +1191,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 				goto bail_unlock;
 			}
 		}
-		if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
+		if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
 		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
 		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
 			transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
@@ -1949,7 +1950,7 @@ out:
 int ocfs2_change_file_space(struct file *file, unsigned int cmd,
 			    struct ocfs2_space_resv *sr)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	int ret;
 
@@ -1977,7 +1978,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
 static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
 			    loff_t len)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_space_resv sr;
 	int change_size = 1;
@@ -2232,7 +2233,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	loff_t old_size, *ppos = &iocb->ki_pos;
 	u32 old_clusters;
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	int full_coherency = !(osb->s_mount_opt &
 			       OCFS2_MOUNT_COHERENCY_BUFFERED);
@@ -2247,8 +2248,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	if (iocb->ki_left == 0)
 		return 0;
 
-	sb_start_write(inode->i_sb);
-
 	appending = file->f_flags & O_APPEND ? 1 : 0;
 	direct_io = file->f_flags & O_DIRECT ? 1 : 0;
 
@@ -2289,7 +2288,7 @@ relock:
 		ret = ocfs2_inode_lock(inode, NULL, 1);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out_sems;
+			goto out;
 		}
 
 		ocfs2_inode_unlock(inode, 1);
@@ -2422,7 +2421,6 @@ out_sems:
 		ocfs2_iocb_clear_sem_locked(iocb);
 
 	mutex_unlock(&inode->i_mutex);
-	sb_end_write(inode->i_sb);
 
 	if (written)
 		ret = written;
@@ -2467,8 +2465,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 			out->f_path.dentry->d_name.len,
 			out->f_path.dentry->d_name.name, len);
 
-	if (pipe->inode)
-		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
+	pipe_lock(pipe);
 
 	splice_from_pipe_begin(&sd);
 	do {
@@ -2488,8 +2485,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 	} while (ret > 0);
 	splice_from_pipe_end(pipe, &sd);
 
-	if (pipe->inode)
-		mutex_unlock(&pipe->inode->i_mutex);
+	pipe_unlock(pipe);
 
 	if (sd.num_spliced)
 		ret = sd.num_spliced;
@@ -2516,7 +2512,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
 				      unsigned int flags)
 {
 	int ret = 0, lock_level = 0;
-	struct inode *inode = in->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(in);
 
 	trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
 			(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2526,7 +2522,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
 	/*
 	 * See the comment in ocfs2_file_aio_read()
 	 */
-	ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
+	ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto bail;
@@ -2546,7 +2542,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 {
 	int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
 	struct file *filp = iocb->ki_filp;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 
 	trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
 			(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2589,7 +2585,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 	 * like i_size. This allows the checks down below
 	 * generic_file_aio_read() a chance of actually working.
 	 */
-	ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+	ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto bail;
@@ -2650,17 +2646,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
 		goto out;
 	}
 
-	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-		ret = -EINVAL;
-	if (!ret && offset > inode->i_sb->s_maxbytes)
-		ret = -EINVAL;
-	if (ret)
-		goto out;
-
-	if (offset != file->f_pos) {
-		file->f_pos = offset;
-		file->f_version = 0;
-	}
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out:
 	mutex_unlock(&inode->i_mutex);
@@ -2716,7 +2702,7 @@ const struct file_operations ocfs2_fops = {
 const struct file_operations ocfs2_dops = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ocfs2_readdir,
+	.iterate	= ocfs2_readdir,
 	.fsync		= ocfs2_sync_file,
 	.release	= ocfs2_dir_release,
 	.open		= ocfs2_dir_open,
@@ -2763,7 +2749,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
 const struct file_operations ocfs2_dops_no_plocks = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ocfs2_readdir,
+	.iterate	= ocfs2_readdir,
 	.fsync		= ocfs2_sync_file,
 	.release	= ocfs2_dir_release,
 	.open		= ocfs2_dir_open,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index d89e08a81ed..f87f9bd1edf 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -269,8 +269,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	inode->i_generation = le32_to_cpu(fe->i_generation);
 	inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
 	inode->i_mode = le16_to_cpu(fe->i_mode);
-	inode->i_uid = le32_to_cpu(fe->i_uid);
-	inode->i_gid = le32_to_cpu(fe->i_gid);
+	i_uid_write(inode, le32_to_cpu(fe->i_uid));
+	i_gid_write(inode, le32_to_cpu(fe->i_gid));
 
 	/* Fast symlinks will have i_size but no allocated clusters. */
 	if (S_ISLNK(inode->i_mode) && !fe->i_clusters) {
@@ -1259,8 +1259,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 
 	fe->i_size = cpu_to_le64(i_size_read(inode));
 	ocfs2_set_links_count(fe, inode->i_nlink);
-	fe->i_uid = cpu_to_le32(inode->i_uid);
-	fe->i_gid = cpu_to_le32(inode->i_gid);
+	fe->i_uid = cpu_to_le32(i_uid_read(inode));
+	fe->i_gid = cpu_to_le32(i_gid_read(inode));
 	fe->i_mode = cpu_to_le16(inode->i_mode);
 	fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
 	fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
@@ -1290,8 +1290,8 @@ void ocfs2_refresh_inode(struct inode *inode,
 	ocfs2_set_inode_flags(inode);
 	i_size_write(inode, le64_to_cpu(fe->i_size));
 	set_nlink(inode, ocfs2_read_links_count(fe));
-	inode->i_uid = le32_to_cpu(fe->i_uid);
-	inode->i_gid = le32_to_cpu(fe->i_gid);
+	i_uid_write(inode, le32_to_cpu(fe->i_uid));
+	i_gid_write(inode, le32_to_cpu(fe->i_gid));
 	inode->i_mode = le16_to_cpu(fe->i_mode);
 	if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
 		inode->i_blocks = 0;
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 88924a3133f..621fc73bf23 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -147,8 +147,6 @@ void ocfs2_refresh_inode(struct inode *inode,
 int ocfs2_mark_inode_dirty(handle_t *handle,
 			   struct inode *inode,
 			   struct buffer_head *bh);
-int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
-int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
 struct buffer_head *ocfs2_bread(struct inode *inode,
 				int block, int *err, int reada);
 
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index f20edcbfe70..0c60ef2d805 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -101,13 +101,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
 	if (!S_ISDIR(inode->i_mode))
 		flags &= ~OCFS2_DIRSYNC_FL;
 
-	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		mlog_errno(status);
-		goto bail_unlock;
-	}
-
 	oldflags = ocfs2_inode->ip_attr;
 	flags = flags & mask;
 	flags |= oldflags & ~mask;
@@ -120,7 +113,14 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
 	if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
 		(OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
 		if (!capable(CAP_LINUX_IMMUTABLE))
-			goto bail_commit;
+			goto bail_unlock;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto bail_unlock;
 	}
 
 	ocfs2_inode->ip_attr = flags;
@@ -130,8 +130,8 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
 	if (status < 0)
 		mlog_errno(status);
 
-bail_commit:
 	ocfs2_commit_trans(osb, handle);
+
 bail_unlock:
 	ocfs2_inode_unlock(inode, 1);
 bail:
@@ -706,8 +706,10 @@ int ocfs2_info_handle_freefrag(struct inode *inode,
 
 	o2info_set_request_filled(&oiff->iff_req);
 
-	if (o2info_to_user(*oiff, req))
+	if (o2info_to_user(*oiff, req)) {
+		status = -EFAULT;
 		goto bail;
+	}
 
 	status = 0;
 bail:
@@ -881,7 +883,7 @@ bail:
 
 long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	unsigned int flags;
 	int new_clusters;
 	int status;
@@ -994,7 +996,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
 	bool preserve;
 	struct reflink_arguments args;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ocfs2_info info;
 	void __user *argp = (void __user *)arg;
 
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 2dd36af79e2..242170d8397 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1234,11 +1234,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 		/* Though we wish to avoid it, we are in fact safe in
 		 * skipping local alloc cleanup as fsck.ocfs2 is more
 		 * than capable of reclaiming unused space. */
-		if (la_dinode)
-			kfree(la_dinode);
-
-		if (tl_dinode)
-			kfree(tl_dinode);
+		kfree(la_dinode);
+		kfree(tl_dinode);
 
 		if (qrec)
 			ocfs2_free_quota_recovery(qrec);
@@ -1408,8 +1405,7 @@ bail:
 
 	mutex_unlock(&osb->recovery_lock);
 
-	if (rm_quota)
-		kfree(rm_quota);
+	kfree(rm_quota);
 
 	/* no one is callint kthread_stop() for us so the kthread() api
 	 * requires that we call do_exit().  And it isn't exported, but
@@ -1945,6 +1941,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
 }
 
 struct ocfs2_orphan_filldir_priv {
+	struct dir_context	ctx;
 	struct inode		*head;
 	struct ocfs2_super	*osb;
 };
@@ -1981,11 +1978,11 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 {
 	int status;
 	struct inode *orphan_dir_inode = NULL;
-	struct ocfs2_orphan_filldir_priv priv;
-	loff_t pos = 0;
-
-	priv.osb = osb;
-	priv.head = *head;
+	struct ocfs2_orphan_filldir_priv priv = {
+		.ctx.actor = ocfs2_orphan_filldir,
+		.osb = osb,
+		.head = *head
+	};
 
 	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
 						       ORPHAN_DIR_SYSTEM_INODE,
@@ -2003,8 +2000,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 		goto out;
 	}
 
-	status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv,
-				   ocfs2_orphan_filldir);
+	status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx);
 	if (status) {
 		mlog_errno(status);
 		goto out_cluster;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index a3385b63ff5..96f9ac237e8 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -200,7 +200,6 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
 
 static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
 {
-	atomic_set(&osb->needs_checkpoint, 1);
 	wake_up(&osb->checkpoint_event);
 }
 
@@ -538,7 +537,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
 	extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
 
 	return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
-	       ocfs2_quota_trans_credits(sb);
+	       ocfs2_quota_trans_credits(sb) + bits_wanted;
 }
 
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index a9f78c74d68..aebeacd807c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -476,8 +476,7 @@ out:
 	if (local_alloc_inode)
 		iput(local_alloc_inode);
 
-	if (alloc_copy)
-		kfree(alloc_copy);
+	kfree(alloc_copy);
 }
 
 /*
@@ -534,7 +533,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 		mlog_errno(status);
 
 bail:
-	if ((status < 0) && (*alloc_copy)) {
+	if (status < 0) {
 		kfree(*alloc_copy);
 		*alloc_copy = NULL;
 	}
@@ -1290,8 +1289,7 @@ bail:
 	if (main_bm_inode)
 		iput(main_bm_inode);
 
-	if (alloc_copy)
-		kfree(alloc_copy);
+	kfree(alloc_copy);
 
 	if (ac)
 		ocfs2_free_alloc_context(ac);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 47a87dda54c..10d66c75cec 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -62,7 +62,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
 				struct page *page)
 {
 	int ret = VM_FAULT_NOPAGE;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct address_space *mapping = inode->i_mapping;
 	loff_t pos = page_offset(page);
 	unsigned int len = PAGE_CACHE_SIZE;
@@ -131,7 +131,7 @@ out:
 static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(vma->vm_file);
 	struct buffer_head *di_bh = NULL;
 	sigset_t oldset;
 	int ret;
@@ -180,13 +180,13 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	int ret = 0, lock_level = 0;
 
-	ret = ocfs2_inode_lock_atime(file->f_dentry->d_inode,
-				    file->f_vfsmnt, &lock_level);
+	ret = ocfs2_inode_lock_atime(file_inode(file),
+				    file->f_path.mnt, &lock_level);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}
-	ocfs2_inode_unlock(file->f_dentry->d_inode, lock_level);
+	ocfs2_inode_unlock(file_inode(file), lock_level);
 out:
 	vma->vm_ops = &ocfs2_file_vm_ops;
 	return 0;
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 6083432f667..f1fc172175b 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -471,7 +471,7 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
 	int ret, goal_bit = 0;
 
 	struct buffer_head *gd_bh = NULL;
-	struct ocfs2_group_desc *bg = NULL;
+	struct ocfs2_group_desc *bg;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	int c_to_b = 1 << (osb->s_clustersize_bits -
 					inode->i_sb->s_blocksize_bits);
@@ -482,13 +482,6 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
 	range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
 						      range->me_goal);
 	/*
-	 * moving goal is not allowd to start with a group desc blok(#0 blk)
-	 * let's compromise to the latter cluster.
-	 */
-	if (range->me_goal == le64_to_cpu(bg->bg_blkno))
-		range->me_goal += c_to_b;
-
-	/*
 	 * validate goal sits within global_bitmap, and return the victim
 	 * group desc
 	 */
@@ -502,6 +495,13 @@ static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
 	bg = (struct ocfs2_group_desc *)gd_bh->b_data;
 
 	/*
+	 * moving goal is not allowd to start with a group desc blok(#0 blk)
+	 * let's compromise to the latter cluster.
+	 */
+	if (range->me_goal == le64_to_cpu(bg->bg_blkno))
+		range->me_goal += c_to_b;
+
+	/*
 	 * movement is not gonna cross two groups.
 	 */
 	if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
@@ -1055,44 +1055,42 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
 {
 	int status;
 
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct ocfs2_move_extents range;
-	struct ocfs2_move_extents_context *context = NULL;
+	struct ocfs2_move_extents_context *context;
+
+	if (!argp)
+		return -EINVAL;
 
 	status = mnt_want_write_file(filp);
 	if (status)
 		return status;
 
 	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
-		goto out;
+		goto out_drop;
 
 	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
 		status = -EPERM;
-		goto out;
+		goto out_drop;
 	}
 
 	context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
 	if (!context) {
 		status = -ENOMEM;
 		mlog_errno(status);
-		goto out;
+		goto out_drop;
 	}
 
 	context->inode = inode;
 	context->file = filp;
 
-	if (argp) {
-		if (copy_from_user(&range, argp, sizeof(range))) {
-			status = -EFAULT;
-			goto out;
-		}
-	} else {
-		status = -EINVAL;
-		goto out;
+	if (copy_from_user(&range, argp, sizeof(range))) {
+		status = -EFAULT;
+		goto out_free;
 	}
 
 	if (range.me_start > i_size_read(inode))
-		goto out;
+		goto out_free;
 
 	if (range.me_start + range.me_len > i_size_read(inode))
 			range.me_len = i_size_read(inode) - range.me_start;
@@ -1124,25 +1122,24 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
 
 		status = ocfs2_validate_and_adjust_move_goal(inode, &range);
 		if (status)
-			goto out;
+			goto out_copy;
 	}
 
 	status = ocfs2_move_extents(context);
 	if (status)
 		mlog_errno(status);
-out:
+out_copy:
 	/*
 	 * movement/defragmentation may end up being partially completed,
 	 * that's the reason why we need to return userspace the finished
 	 * length and new_offset even if failure happens somewhere.
 	 */
-	if (argp) {
-		if (copy_to_user(argp, &range, sizeof(range)))
-			status = -EFAULT;
-	}
+	if (copy_to_user(argp, &range, sizeof(range)))
+		status = -EFAULT;
 
+out_free:
 	kfree(context);
-
+out_drop:
 	mnt_drop_write_file(filp);
 
 	return status;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f1fd0741162..be3f8676a43 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -512,8 +512,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,
 	fe->i_suballoc_loc = cpu_to_le64(suballoc_loc);
 	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
 	fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
-	fe->i_uid = cpu_to_le32(inode->i_uid);
-	fe->i_gid = cpu_to_le32(inode->i_gid);
+	fe->i_uid = cpu_to_le32(i_uid_read(inode));
+	fe->i_gid = cpu_to_le32(i_gid_read(inode));
 	fe->i_mode = cpu_to_le16(inode->i_mode);
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
@@ -522,7 +522,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
 
 	fe->i_last_eb_blk = 0;
 	strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
-	le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
+	fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
 	fe->i_atime = fe->i_ctime = fe->i_mtime =
 		cpu_to_le64(CURRENT_TIME.tv_sec);
 	fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
@@ -773,7 +773,7 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry)
 	return ret;
 }
 
-static inline int inode_is_unlinkable(struct inode *inode)
+static inline int ocfs2_inode_is_unlinkable(struct inode *inode)
 {
 	if (S_ISDIR(inode->i_mode)) {
 		if (inode->i_nlink == 2)
@@ -791,6 +791,7 @@ static int ocfs2_unlink(struct inode *dir,
 {
 	int status;
 	int child_locked = 0;
+	bool is_unlinkable = false;
 	struct inode *inode = dentry->d_inode;
 	struct inode *orphan_dir = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -865,7 +866,7 @@ static int ocfs2_unlink(struct inode *dir,
 		goto leave;
 	}
 
-	if (inode_is_unlinkable(inode)) {
+	if (ocfs2_inode_is_unlinkable(inode)) {
 		status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
 						  OCFS2_I(inode)->ip_blkno,
 						  orphan_name, &orphan_insert);
@@ -873,6 +874,7 @@ static int ocfs2_unlink(struct inode *dir,
 			mlog_errno(status);
 			goto leave;
 		}
+		is_unlinkable = true;
 	}
 
 	handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
@@ -892,15 +894,6 @@ static int ocfs2_unlink(struct inode *dir,
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
-	if (inode_is_unlinkable(inode)) {
-		status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
-					  &orphan_insert, orphan_dir);
-		if (status < 0) {
-			mlog_errno(status);
-			goto leave;
-		}
-	}
-
 	/* delete the name from the parent dir */
 	status = ocfs2_delete_entry(handle, dir, &lookup);
 	if (status < 0) {
@@ -923,6 +916,14 @@ static int ocfs2_unlink(struct inode *dir,
 		mlog_errno(status);
 		if (S_ISDIR(inode->i_mode))
 			inc_nlink(dir);
+		goto leave;
+	}
+
+	if (is_unlinkable) {
+		status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
+				orphan_name, &orphan_insert, orphan_dir);
+		if (status < 0)
+			mlog_errno(status);
 	}
 
 leave:
@@ -947,7 +948,7 @@ leave:
 	ocfs2_free_dir_lookup_result(&orphan_insert);
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	if (status)
+	if (status && (status != -ENOTEMPTY))
 		mlog_errno(status);
 
 	return status;
@@ -2012,6 +2013,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 		goto leave;
 	}
 
+	/*
+	 * We're going to journal the change of i_flags and i_orphaned_slot.
+	 * It's safe anyway, though some callers may duplicate the journaling.
+	 * Journaling within the func just make the logic look more
+	 * straightforward.
+	 */
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(inode),
+					 fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
 	/* we're a cluster, and nlink can change on disk from
 	 * underneath us... */
 	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
@@ -2026,25 +2042,10 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 				   orphan_dir_bh, lookup);
 	if (status < 0) {
 		mlog_errno(status);
-		goto leave;
+		goto rollback;
 	}
 
-	/*
-	 * We're going to journal the change of i_flags and i_orphaned_slot.
-	 * It's safe anyway, though some callers may duplicate the journaling.
-	 * Journaling within the func just make the logic look more
-	 * straightforward.
-	 */
-	status = ocfs2_journal_access_di(handle,
-					 INODE_CACHE(inode),
-					 fe_bh,
-					 OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
+	fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
 	OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
 
 	/* Record which orphan dir our inode now resides
@@ -2057,11 +2058,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
 				   osb->slot_num);
 
+rollback:
+	if (status < 0) {
+		if (S_ISDIR(inode->i_mode))
+			ocfs2_add_links_count(orphan_fe, -1);
+		set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
+	}
+
 leave:
 	brelse(orphan_dir_bh);
 
-	if (status)
-		mlog_errno(status);
 	return status;
 }
 
@@ -2216,7 +2222,7 @@ out:
 
 	brelse(orphan_dir_bh);
 
-	return 0;
+	return ret;
 }
 
 int ocfs2_create_inode_in_orphan(struct inode *dir,
@@ -2434,7 +2440,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 	}
 
 	di = (struct ocfs2_dinode *)di_bh->b_data;
-	le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
+	di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL);
 	di->i_orphaned_slot = 0;
 	set_nlink(inode, 1);
 	ocfs2_set_links_count(di, inode->i_nlink);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d355e6e36b3..3a903470c79 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -347,7 +347,6 @@ struct ocfs2_super
 	struct task_struct *recovery_thread_task;
 	int disable_recovery;
 	wait_queue_head_t checkpoint_event;
-	atomic_t needs_checkpoint;
 	struct ocfs2_journal *journal;
 	unsigned long osb_commit_interval;
 
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 30a055049e1..998b17eda09 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2927,7 +2927,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 				     u32 new_cluster, u32 new_len)
 {
 	int ret = 0, partial;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct ocfs2_caching_info *ci = INODE_CACHE(inode);
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
@@ -3020,7 +3020,7 @@ int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
 				    u32 new_cluster, u32 new_len)
 {
 	int ret = 0;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	struct ocfs2_caching_info *ci = INODE_CACHE(inode);
 	int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
@@ -4407,7 +4407,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
 	 * rights to do so.
 	 */
 	if (preserve) {
-		if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN))
+		if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN))
 			return -EPERM;
 		if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
 			return -EPERM;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 94368017edb..bf1f8930456 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -376,7 +376,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
 	dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
 
 out_free:
-	if (rc && conn->cc_private)
+	if (rc)
 		kfree(conn->cc_private);
 
 out:
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index f169da4624f..5397c07ce60 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -642,7 +642,7 @@ ocfs2_block_group_alloc_discontig(handle_t *handle,
 	 * cluster groups will be staying in cache for the duration of
 	 * this operation.
 	 */
-	ac->ac_allow_chain_relink = 0;
+	ac->ac_disable_chain_relink = 1;
 
 	/* Claim the first region */
 	status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
@@ -1422,7 +1422,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	int status;
 	/* there is a really tiny chance the journal calls could fail,
 	 * but we wouldn't want inconsistent blocks in *any* case. */
-	u64 fe_ptr, bg_ptr, prev_bg_ptr;
+	u64 bg_ptr, prev_bg_ptr;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
@@ -1437,51 +1437,44 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		(unsigned long long)le64_to_cpu(bg->bg_blkno),
 		(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
 
-	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
 
 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
 					 prev_bg_bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_rollback;
-	}
+	if (status < 0)
+		goto out;
 
 	prev_bg->bg_next_group = bg->bg_next_group;
 	ocfs2_journal_dirty(handle, prev_bg_bh);
 
 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
 					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_rollback;
-	}
+	if (status < 0)
+		goto out_rollback_prev_bg;
 
 	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
 	ocfs2_journal_dirty(handle, bg_bh);
 
 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
 					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_rollback;
-	}
+	if (status < 0)
+		goto out_rollback_bg;
 
 	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
 	ocfs2_journal_dirty(handle, fe_bh);
 
-out_rollback:
-	if (status < 0) {
-		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
-		bg->bg_next_group = cpu_to_le64(bg_ptr);
-		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
-	}
-
-	if (status)
+out:
+	if (status < 0)
 		mlog_errno(status);
 	return status;
+
+out_rollback_bg:
+	bg->bg_next_group = cpu_to_le64(bg_ptr);
+out_rollback_prev_bg:
+	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
+	goto out;
 }
 
 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
@@ -1823,7 +1816,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	 * Do this *after* figuring out how many bits we're taking out
 	 * of our target group.
 	 */
-	if (ac->ac_allow_chain_relink &&
+	if (!ac->ac_disable_chain_relink &&
 	    (prev_group_bh) &&
 	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
 		status = ocfs2_relink_block_group(handle, alloc_inode,
@@ -1928,7 +1921,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 
 	victim = ocfs2_find_victim_chain(cl);
 	ac->ac_chain = victim;
-	ac->ac_allow_chain_relink = 1;
 
 	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
 				    res, &bits_left);
@@ -1947,7 +1939,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 	 * searching each chain in order. Don't allow chain relinking
 	 * because we only calculate enough journal credits for one
 	 * relink per alloc. */
-	ac->ac_allow_chain_relink = 0;
+	ac->ac_disable_chain_relink = 1;
 	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
 		if (i == victim)
 			continue;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index b8afabfeede..a36d0aa5091 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -49,7 +49,7 @@ struct ocfs2_alloc_context {
 
 	/* these are used by the chain search */
 	u16    ac_chain;
-	int    ac_allow_chain_relink;
+	int    ac_disable_chain_relink;
 	group_search_t *ac_group_search;
 
 	u64    ac_last_group;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 0e91ec22a94..854d80955bf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -286,10 +286,9 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
 	spin_unlock(&osb->osb_lock);
 
 	out += snprintf(buf + out, len - out,
-			"%10s => Pid: %d  Interval: %lu  Needs: %d\n", "Commit",
+			"%10s => Pid: %d  Interval: %lu\n", "Commit",
 			(osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
-			osb->osb_commit_interval,
-			atomic_read(&osb->needs_checkpoint));
+			osb->osb_commit_interval);
 
 	out += snprintf(buf + out, len - out,
 			"%10s => State: %d  TxnId: %lu  NumTxns: %d\n",
@@ -1266,6 +1265,7 @@ static struct file_system_type ocfs2_fs_type = {
 	.fs_flags       = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
 	.next           = NULL
 };
+MODULE_ALIAS_FS("ocfs2");
 
 static int ocfs2_check_set_options(struct super_block *sb,
 				   struct mount_options *options)
@@ -2153,7 +2153,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 
 	init_waitqueue_head(&osb->checkpoint_event);
-	atomic_set(&osb->needs_checkpoint, 0);
 
 	osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
 
@@ -2525,8 +2524,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
 		mlog_errno(status);
 
 finally:
-	if (local_alloc)
-		kfree(local_alloc);
+	kfree(local_alloc);
 
 	if (status)
 		mlog_errno(status);
@@ -2553,8 +2551,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 	 * we free it here.
 	 */
 	kfree(osb->journal);
-	if (osb->local_alloc_copy)
-		kfree(osb->local_alloc_copy);
+	kfree(osb->local_alloc_copy);
 	kfree(osb->uuid_str);
 	ocfs2_put_dlm_debug(osb->osb_dlm_debug);
 	memset(osb, 0, sizeof(struct ocfs2_super));
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index f1fbb4b552a..66edce7ecfd 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -57,7 +57,7 @@
 static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
-	struct buffer_head *bh;
+	struct buffer_head *bh = NULL;
 	int status = ocfs2_read_inode_block(inode, &bh);
 	struct ocfs2_dinode *fe;
 	const char *link;
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 3d635f4bbb2..f053688d22a 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -91,8 +91,7 @@ static struct inode **get_local_system_inode(struct ocfs2_super *osb,
 		} else
 			osb->local_system_inodes = local_system_inodes;
 		spin_unlock(&osb->osb_lock);
-		if (unlikely(free))
-			kfree(free);
+		kfree(free);
 	}
 
 	index = (slot * NUM_LOCAL_SYSTEM_INODES) +
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 0ba9ea1e796..317ef0abccb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2751,7 +2751,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
 {
 	int ret;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
 	struct ocfs2_xa_loc loc;
 
 	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
@@ -2759,13 +2758,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
 
 	down_write(&oi->ip_alloc_sem);
 	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
-		if (!ocfs2_xattr_has_space_inline(inode, di)) {
-			ret = -ENOSPC;
-			goto out;
-		}
-	}
-
-	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
 		ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
 		if (ret) {
 			if (ret != -ENOSPC)
@@ -6499,6 +6491,16 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
 	}
 
 	new_oi = OCFS2_I(args->new_inode);
+	/*
+	 * Adjust extent record count to reserve space for extended attribute.
+	 * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
+	 */
+	if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
+	    !(ocfs2_inode_is_fast_symlink(args->new_inode))) {
+		struct ocfs2_extent_list *el = &new_di->id2.i_list;
+		le16_add_cpu(&el->l_count, -(inline_size /
+					sizeof(struct ocfs2_extent_rec)));
+	}
 	spin_lock(&new_oi->ip_lock);
 	new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
 	new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
@@ -7189,7 +7191,7 @@ int ocfs2_init_security_and_acl(struct inode *dir,
 	struct buffer_head *dir_bh = NULL;
 
 	ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
-	if (!ret) {
+	if (ret) {
 		mlog_errno(ret);
 		goto leave;
 	}
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index fb5b3ff79dc..1b8e9e8405b 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -327,26 +327,23 @@ int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
 	return is_bad;
 }
 
-static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
+static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx,
 		u64 fsblock, int hindex)
 {
-	struct inode *dir = filp->f_dentry->d_inode;
-	struct buffer_head *bh;
-	struct omfs_inode *oi;
-	u64 self;
-	int res = 0;
-	unsigned char d_type;
-
 	/* follow chain in this bucket */
 	while (fsblock != ~0) {
-		bh = omfs_bread(dir->i_sb, fsblock);
+		struct buffer_head *bh = omfs_bread(dir->i_sb, fsblock);
+		struct omfs_inode *oi;
+		u64 self;
+		unsigned char d_type;
+
 		if (!bh)
-			goto out;
+			return true;
 
 		oi = (struct omfs_inode *) bh->b_data;
 		if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) {
 			brelse(bh);
-			goto out;
+			return true;
 		}
 
 		self = fsblock;
@@ -361,15 +358,16 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
 
 		d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG;
 
-		res = filldir(dirent, oi->i_name, strnlen(oi->i_name,
-			OMFS_NAMELEN), filp->f_pos, self, d_type);
+		if (!dir_emit(ctx, oi->i_name,
+			      strnlen(oi->i_name, OMFS_NAMELEN),
+			      self, d_type)) {
+			brelse(bh);
+			return false;
+		}
 		brelse(bh);
-		if (res < 0)
-			break;
-		filp->f_pos++;
+		ctx->pos++;
 	}
-out:
-	return res;
+	return true;
 }
 
 static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -403,60 +401,44 @@ out:
 	return err;
 }
 
-static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int omfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *dir = filp->f_dentry->d_inode;
+	struct inode *dir = file_inode(file);
 	struct buffer_head *bh;
-	loff_t offset, res;
+	__be64 *p;
 	unsigned int hchain, hindex;
 	int nbuckets;
-	u64 fsblock;
-	int ret = -EINVAL;
-
-	if (filp->f_pos >> 32)
-		goto success;
-
-	switch ((unsigned long) filp->f_pos) {
-	case 0:
-		if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
-			goto success;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
-		if (filldir(dirent, "..", 2, 1,
-		    parent_ino(filp->f_dentry), DT_DIR) < 0)
-			goto success;
-		filp->f_pos = 1 << 20;
-		/* fall through */
+
+	if (ctx->pos >> 32)
+		return -EINVAL;
+
+	if (ctx->pos < 1 << 20) {
+		if (!dir_emit_dots(file, ctx))
+			return 0;
+		ctx->pos = 1 << 20;
 	}
 
 	nbuckets = (dir->i_size - OMFS_DIR_START) / 8;
 
 	/* high 12 bits store bucket + 1 and low 20 bits store hash index */
-	hchain = (filp->f_pos >> 20) - 1;
-	hindex = filp->f_pos & 0xfffff;
+	hchain = (ctx->pos >> 20) - 1;
+	hindex = ctx->pos & 0xfffff;
 
 	bh = omfs_bread(dir->i_sb, dir->i_ino);
 	if (!bh)
-		goto out;
+		return -EINVAL;
 
-	offset = OMFS_DIR_START + hchain * 8;
+	p = (__be64 *)(bh->b_data + OMFS_DIR_START) + hchain;
 
-	for (; hchain < nbuckets; hchain++, offset += 8) {
-		fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset]));
-
-		res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex);
-		hindex = 0;
-		if (res < 0)
+	for (; hchain < nbuckets; hchain++) {
+		__u64 fsblock = be64_to_cpu(*p++);
+		if (!omfs_fill_chain(dir, ctx, fsblock, hindex))
 			break;
-
-		filp->f_pos = (hchain+2) << 20;
+		hindex = 0;
+		ctx->pos = (hchain+2) << 20;
 	}
 	brelse(bh);
-success:
-	ret = 0;
-out:
-	return ret;
+	return 0;
 }
 
 const struct inode_operations omfs_dir_inops = {
@@ -470,6 +452,6 @@ const struct inode_operations omfs_dir_inops = {
 
 const struct file_operations omfs_dir_operations = {
 	.read = generic_read_dir,
-	.readdir = omfs_readdir,
+	.iterate = omfs_readdir,
 	.llseek = generic_file_llseek,
 };
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 25d715c7c87..d8b0afde217 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -572,6 +572,7 @@ static struct file_system_type omfs_fs_type = {
 	.kill_sb = kill_block_super,
 	.fs_flags = FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("omfs");
 
 static int __init init_omfs_fs(void)
 {
diff --git a/fs/open.c b/fs/open.c
index 9b33c0cbfac..fca72c4d3f1 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -30,6 +30,7 @@
 #include <linux/fs_struct.h>
 #include <linux/ima.h>
 #include <linux/dnotify.h>
+#include <linux/compat.h>
 
 #include "internal.h"
 
@@ -140,6 +141,13 @@ SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
 	return do_sys_truncate(path, length);
 }
 
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
+{
+	return do_sys_truncate(path, length);
+}
+#endif
+
 static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 {
 	struct inode *inode;
@@ -189,46 +197,33 @@ out:
 
 SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
 {
-	long ret = do_sys_ftruncate(fd, length, 1);
-	/* avoid REGPARM breakage on x86: */
-	asmlinkage_protect(2, ret, fd, length);
-	return ret;
+	return do_sys_ftruncate(fd, length, 1);
 }
 
-/* LFS versions of truncate are only needed on 32 bit machines */
-#if BITS_PER_LONG == 32
-SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length)
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
 {
-	return do_sys_truncate(path, length);
+	return do_sys_ftruncate(fd, length, 1);
 }
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_truncate64(long path, loff_t length)
-{
-	return SYSC_truncate64((const char __user *) path, length);
-}
-SYSCALL_ALIAS(sys_truncate64, SyS_truncate64);
 #endif
 
-SYSCALL_DEFINE(ftruncate64)(unsigned int fd, loff_t length)
+/* LFS versions of truncate are only needed on 32 bit machines */
+#if BITS_PER_LONG == 32
+SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
 {
-	long ret = do_sys_ftruncate(fd, length, 0);
-	/* avoid REGPARM breakage on x86: */
-	asmlinkage_protect(2, ret, fd, length);
-	return ret;
+	return do_sys_truncate(path, length);
 }
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_ftruncate64(long fd, loff_t length)
+
+SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
 {
-	return SYSC_ftruncate64((unsigned int) fd, length);
+	return do_sys_ftruncate(fd, length, 0);
 }
-SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64);
-#endif
 #endif /* BITS_PER_LONG == 32 */
 
 
 int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	long ret;
 
 	if (offset < 0 || len <= 0)
@@ -284,7 +279,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	return ret;
 }
 
-SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
+SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
 {
 	struct fd f = fdget(fd);
 	int error = -EBADF;
@@ -296,14 +291,6 @@ SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
 	return error;
 }
 
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len)
-{
-	return SYSC_fallocate((int)fd, (int)mode, offset, len);
-}
-SYSCALL_ALIAS(sys_fallocate, SyS_fallocate);
-#endif
-
 /*
  * access() needs to use the real uid/gid, not the effective uid/gid.
  * We do this by temporarily clearing all FS-related capabilities and
@@ -426,7 +413,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 	if (!f.file)
 		goto out;
 
-	inode = f.file->f_path.dentry->d_inode;
+	inode = file_inode(f.file);
 
 	error = -ENOTDIR;
 	if (!S_ISDIR(inode->i_mode))
@@ -689,7 +676,7 @@ static int do_dentry_open(struct file *f,
 		f->f_mode = FMODE_PATH;
 
 	path_get(&f->f_path);
-	inode = f->f_path.dentry->d_inode;
+	inode = f->f_inode = f->f_path.dentry->d_inode;
 	if (f->f_mode & FMODE_WRITE) {
 		error = __get_file_write_access(inode, f->f_path.mnt);
 		if (error)
@@ -699,7 +686,6 @@ static int do_dentry_open(struct file *f,
 	}
 
 	f->f_mapping = inode->i_mapping;
-	f->f_pos = 0;
 	file_sb_list_add(f, inode->i_sb);
 
 	if (unlikely(f->f_mode & FMODE_PATH)) {
@@ -753,6 +739,7 @@ cleanup_file:
 	path_put(&f->f_path);
 	f->f_path.mnt = NULL;
 	f->f_path.dentry = NULL;
+	f->f_inode = NULL;
 	return error;
 }
 
@@ -810,23 +797,22 @@ struct file *dentry_open(const struct path *path, int flags,
 	/* We must always pass in a valid mount pointer. */
 	BUG_ON(!path->mnt);
 
-	error = -ENFILE;
 	f = get_empty_filp();
-	if (f == NULL)
-		return ERR_PTR(error);
-
-	f->f_flags = flags;
-	f->f_path = *path;
-	error = do_dentry_open(f, NULL, cred);
-	if (!error) {
-		error = open_check_o_direct(f);
-		if (error) {
-			fput(f);
+	if (!IS_ERR(f)) {
+		f->f_flags = flags;
+		f->f_path = *path;
+		error = do_dentry_open(f, NULL, cred);
+		if (!error) {
+			/* from now on we need fput() to dispose of f */
+			error = open_check_o_direct(f);
+			if (error) {
+				fput(f);
+				f = ERR_PTR(error);
+			}
+		} else { 
+			put_filp(f);
 			f = ERR_PTR(error);
 		}
-	} else { 
-		put_filp(f);
-		f = ERR_PTR(error);
 	}
 	return f;
 }
@@ -854,11 +840,15 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 	if (flags & __O_SYNC)
 		flags |= O_DSYNC;
 
-	/*
-	 * If we have O_PATH in the open flag. Then we
-	 * cannot have anything other than the below set of flags
-	 */
-	if (flags & O_PATH) {
+	if (flags & O_TMPFILE) {
+		if (!(flags & O_CREAT))
+			return -EINVAL;
+		acc_mode = MAY_OPEN | ACC_MODE(flags);
+	} else if (flags & O_PATH) {
+		/*
+		 * If we have O_PATH in the open flag. Then we
+		 * cannot have anything other than the below set of flags
+		 */
 		flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
 		acc_mode = 0;
 	} else {
@@ -890,7 +880,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 		lookup_flags |= LOOKUP_DIRECTORY;
 	if (!(flags & O_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
-	return lookup_flags;
+	op->lookup_flags = lookup_flags;
+	return 0;
 }
 
 /**
@@ -907,8 +898,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 struct file *file_open_name(struct filename *name, int flags, umode_t mode)
 {
 	struct open_flags op;
-	int lookup = build_open_flags(flags, mode, &op);
-	return do_filp_open(AT_FDCWD, name, &op, lookup);
+	int err = build_open_flags(flags, mode, &op);
+	return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op);
 }
 
 /**
@@ -933,65 +924,61 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 			    const char *filename, int flags)
 {
 	struct open_flags op;
-	int lookup = build_open_flags(flags, 0, &op);
+	int err = build_open_flags(flags, 0, &op);
+	if (err)
+		return ERR_PTR(err);
 	if (flags & O_CREAT)
 		return ERR_PTR(-EINVAL);
 	if (!filename && (flags & O_DIRECTORY))
 		if (!dentry->d_inode->i_op->lookup)
 			return ERR_PTR(-ENOTDIR);
-	return do_file_open_root(dentry, mnt, filename, &op, lookup);
+	return do_file_open_root(dentry, mnt, filename, &op);
 }
 EXPORT_SYMBOL(file_open_root);
 
 long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
 {
 	struct open_flags op;
-	int lookup = build_open_flags(flags, mode, &op);
-	struct filename *tmp = getname(filename);
-	int fd = PTR_ERR(tmp);
-
-	if (!IS_ERR(tmp)) {
-		fd = get_unused_fd_flags(flags);
-		if (fd >= 0) {
-			struct file *f = do_filp_open(dfd, tmp, &op, lookup);
-			if (IS_ERR(f)) {
-				put_unused_fd(fd);
-				fd = PTR_ERR(f);
-			} else {
-				fsnotify_open(f);
-				fd_install(fd, f);
-			}
+	int fd = build_open_flags(flags, mode, &op);
+	struct filename *tmp;
+
+	if (fd)
+		return fd;
+
+	tmp = getname(filename);
+	if (IS_ERR(tmp))
+		return PTR_ERR(tmp);
+
+	fd = get_unused_fd_flags(flags);
+	if (fd >= 0) {
+		struct file *f = do_filp_open(dfd, tmp, &op);
+		if (IS_ERR(f)) {
+			put_unused_fd(fd);
+			fd = PTR_ERR(f);
+		} else {
+			fsnotify_open(f);
+			fd_install(fd, f);
 		}
-		putname(tmp);
 	}
+	putname(tmp);
 	return fd;
 }
 
 SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
 {
-	long ret;
-
 	if (force_o_largefile())
 		flags |= O_LARGEFILE;
 
-	ret = do_sys_open(AT_FDCWD, filename, flags, mode);
-	/* avoid REGPARM breakage on x86: */
-	asmlinkage_protect(3, ret, filename, flags, mode);
-	return ret;
+	return do_sys_open(AT_FDCWD, filename, flags, mode);
 }
 
 SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
 		umode_t, mode)
 {
-	long ret;
-
 	if (force_o_largefile())
 		flags |= O_LARGEFILE;
 
-	ret = do_sys_open(dfd, filename, flags, mode);
-	/* avoid REGPARM breakage on x86: */
-	asmlinkage_protect(4, ret, dfd, filename, flags, mode);
-	return ret;
+	return do_sys_open(dfd, filename, flags, mode);
 }
 
 #ifndef __alpha__
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 2ad080faca3..8c0ceb8dd1f 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -162,11 +162,11 @@ static const struct file_operations openpromfs_prop_ops = {
 	.release	= seq_release,
 };
 
-static int openpromfs_readdir(struct file *, void *, filldir_t);
+static int openpromfs_readdir(struct file *, struct dir_context *);
 
 static const struct file_operations openprom_operations = {
 	.read		= generic_read_dir,
-	.readdir	= openpromfs_readdir,
+	.iterate	= openpromfs_readdir,
 	.llseek		= generic_file_llseek,
 };
 
@@ -260,71 +260,64 @@ found:
 	return NULL;
 }
 
-static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int openpromfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct op_inode_info *oi = OP_I(inode);
 	struct device_node *dp = oi->u.node;
 	struct device_node *child;
 	struct property *prop;
-	unsigned int ino;
 	int i;
 
 	mutex_lock(&op_mutex);
 	
-	ino = inode->i_ino;
-	i = filp->f_pos;
-	switch (i) {
-	case 0:
-		if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+	if (ctx->pos == 0) {
+		if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
 			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall thru */
-	case 1:
-		if (filldir(dirent, "..", 2, i,
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit(ctx, "..", 2,
 			    (dp->parent == NULL ?
 			     OPENPROM_ROOT_INO :
-			     dp->parent->unique_id), DT_DIR) < 0) 
+			     dp->parent->unique_id), DT_DIR))
 			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall thru */
-	default:
-		i -= 2;
-
-		/* First, the children nodes as directories.  */
-		child = dp->child;
-		while (i && child) {
-			child = child->sibling;
-			i--;
-		}
-		while (child) {
-			if (filldir(dirent,
-				    child->path_component_name,
-				    strlen(child->path_component_name),
-				    filp->f_pos, child->unique_id, DT_DIR) < 0)
-				goto out;
-
-			filp->f_pos++;
-			child = child->sibling;
-		}
+		ctx->pos = 2;
+	}
+	i = ctx->pos - 2;
 
-		/* Next, the properties as files.  */
-		prop = dp->properties;
-		while (i && prop) {
-			prop = prop->next;
-			i--;
-		}
-		while (prop) {
-			if (filldir(dirent, prop->name, strlen(prop->name),
-				    filp->f_pos, prop->unique_id, DT_REG) < 0)
-				goto out;
+	/* First, the children nodes as directories.  */
+	child = dp->child;
+	while (i && child) {
+		child = child->sibling;
+		i--;
+	}
+	while (child) {
+		if (!dir_emit(ctx,
+			    child->path_component_name,
+			    strlen(child->path_component_name),
+			    child->unique_id, DT_DIR))
+			goto out;
 
-			filp->f_pos++;
-			prop = prop->next;
-		}
+		ctx->pos++;
+		child = child->sibling;
+	}
+
+	/* Next, the properties as files.  */
+	prop = dp->properties;
+	while (i && prop) {
+		prop = prop->next;
+		i--;
 	}
+	while (prop) {
+		if (!dir_emit(ctx, prop->name, strlen(prop->name),
+			    prop->unique_id, DT_REG))
+			goto out;
+
+		ctx->pos++;
+		prop = prop->next;
+	}
+
 out:
 	mutex_unlock(&op_mutex);
 	return 0;
@@ -432,6 +425,7 @@ static struct file_system_type openprom_fs_type = {
 	.mount		= openprom_mount,
 	.kill_sb	= kill_anon_super,
 };
+MODULE_ALIAS_FS("openpromfs");
 
 static void op_inode_init_once(void *data)
 {
diff --git a/fs/pipe.c b/fs/pipe.c
index bd3479db4b6..d2c45e14e6d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,10 +21,13 @@
 #include <linux/audit.h>
 #include <linux/syscalls.h>
 #include <linux/fcntl.h>
+#include <linux/aio.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 
+#include "internal.h"
+
 /*
  * The max size that a non-root user is allowed to grow the pipe. Can
  * be set by root in /proc/sys/fs/pipe-max-size
@@ -53,8 +56,8 @@ unsigned int pipe_min_size = PAGE_SIZE;
 
 static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
 {
-	if (pipe->inode)
-		mutex_lock_nested(&pipe->inode->i_mutex, subclass);
+	if (pipe->files)
+		mutex_lock_nested(&pipe->mutex, subclass);
 }
 
 void pipe_lock(struct pipe_inode_info *pipe)
@@ -68,11 +71,21 @@ EXPORT_SYMBOL(pipe_lock);
 
 void pipe_unlock(struct pipe_inode_info *pipe)
 {
-	if (pipe->inode)
-		mutex_unlock(&pipe->inode->i_mutex);
+	if (pipe->files)
+		mutex_unlock(&pipe->mutex);
 }
 EXPORT_SYMBOL(pipe_unlock);
 
+static inline void __pipe_lock(struct pipe_inode_info *pipe)
+{
+	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
+}
+
+static inline void __pipe_unlock(struct pipe_inode_info *pipe)
+{
+	mutex_unlock(&pipe->mutex);
+}
+
 void pipe_double_lock(struct pipe_inode_info *pipe1,
 		      struct pipe_inode_info *pipe2)
 {
@@ -361,8 +374,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
 	   unsigned long nr_segs, loff_t pos)
 {
 	struct file *filp = iocb->ki_filp;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct pipe_inode_info *pipe;
+	struct pipe_inode_info *pipe = filp->private_data;
 	int do_wakeup;
 	ssize_t ret;
 	struct iovec *iov = (struct iovec *)_iov;
@@ -375,8 +387,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
 
 	do_wakeup = 0;
 	ret = 0;
-	mutex_lock(&inode->i_mutex);
-	pipe = inode->i_pipe;
+	__pipe_lock(pipe);
 	for (;;) {
 		int bufs = pipe->nrbufs;
 		if (bufs) {
@@ -464,7 +475,7 @@ redo:
 		}
 		pipe_wait(pipe);
 	}
-	mutex_unlock(&inode->i_mutex);
+	__pipe_unlock(pipe);
 
 	/* Signal writers asynchronously that there is more room. */
 	if (do_wakeup) {
@@ -486,8 +497,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
 	    unsigned long nr_segs, loff_t ppos)
 {
 	struct file *filp = iocb->ki_filp;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct pipe_inode_info *pipe;
+	struct pipe_inode_info *pipe = filp->private_data;
 	ssize_t ret;
 	int do_wakeup;
 	struct iovec *iov = (struct iovec *)_iov;
@@ -501,8 +511,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
 
 	do_wakeup = 0;
 	ret = 0;
-	mutex_lock(&inode->i_mutex);
-	pipe = inode->i_pipe;
+	__pipe_lock(pipe);
 
 	if (!pipe->readers) {
 		send_sig(SIGPIPE, current, 0);
@@ -649,7 +658,7 @@ redo2:
 		pipe->waiting_writers--;
 	}
 out:
-	mutex_unlock(&inode->i_mutex);
+	__pipe_unlock(pipe);
 	if (do_wakeup) {
 		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
@@ -662,29 +671,14 @@ out:
 	return ret;
 }
 
-static ssize_t
-bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
-{
-	return -EBADF;
-}
-
-static ssize_t
-bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
-	   loff_t *ppos)
-{
-	return -EBADF;
-}
-
 static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct pipe_inode_info *pipe;
+	struct pipe_inode_info *pipe = filp->private_data;
 	int count, buf, nrbufs;
 
 	switch (cmd) {
 		case FIONREAD:
-			mutex_lock(&inode->i_mutex);
-			pipe = inode->i_pipe;
+			__pipe_lock(pipe);
 			count = 0;
 			buf = pipe->curbuf;
 			nrbufs = pipe->nrbufs;
@@ -692,7 +686,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 				count += pipe->bufs[buf].len;
 				buf = (buf+1) & (pipe->buffers - 1);
 			}
-			mutex_unlock(&inode->i_mutex);
+			__pipe_unlock(pipe);
 
 			return put_user(count, (int __user *)arg);
 		default:
@@ -705,8 +699,7 @@ static unsigned int
 pipe_poll(struct file *filp, poll_table *wait)
 {
 	unsigned int mask;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct pipe_inode_info *pipe = inode->i_pipe;
+	struct pipe_inode_info *pipe = filp->private_data;
 	int nrbufs;
 
 	poll_wait(filp, &pipe->wait, wait);
@@ -734,194 +727,56 @@ pipe_poll(struct file *filp, poll_table *wait)
 }
 
 static int
-pipe_release(struct inode *inode, int decr, int decw)
+pipe_release(struct inode *inode, struct file *file)
 {
-	struct pipe_inode_info *pipe;
+	struct pipe_inode_info *pipe = inode->i_pipe;
+	int kill = 0;
 
-	mutex_lock(&inode->i_mutex);
-	pipe = inode->i_pipe;
-	pipe->readers -= decr;
-	pipe->writers -= decw;
+	__pipe_lock(pipe);
+	if (file->f_mode & FMODE_READ)
+		pipe->readers--;
+	if (file->f_mode & FMODE_WRITE)
+		pipe->writers--;
 
-	if (!pipe->readers && !pipe->writers) {
-		free_pipe_info(inode);
-	} else {
+	if (pipe->readers || pipe->writers) {
 		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
-	mutex_unlock(&inode->i_mutex);
-
-	return 0;
-}
-
-static int
-pipe_read_fasync(int fd, struct file *filp, int on)
-{
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	int retval;
-
-	mutex_lock(&inode->i_mutex);
-	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
-	mutex_unlock(&inode->i_mutex);
-
-	return retval;
-}
-
-
-static int
-pipe_write_fasync(int fd, struct file *filp, int on)
-{
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	int retval;
+	spin_lock(&inode->i_lock);
+	if (!--pipe->files) {
+		inode->i_pipe = NULL;
+		kill = 1;
+	}
+	spin_unlock(&inode->i_lock);
+	__pipe_unlock(pipe);
 
-	mutex_lock(&inode->i_mutex);
-	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
-	mutex_unlock(&inode->i_mutex);
+	if (kill)
+		free_pipe_info(pipe);
 
-	return retval;
+	return 0;
 }
 
-
 static int
-pipe_rdwr_fasync(int fd, struct file *filp, int on)
+pipe_fasync(int fd, struct file *filp, int on)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct pipe_inode_info *pipe = inode->i_pipe;
-	int retval;
+	struct pipe_inode_info *pipe = filp->private_data;
+	int retval = 0;
 
-	mutex_lock(&inode->i_mutex);
-	retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
-	if (retval >= 0) {
+	__pipe_lock(pipe);
+	if (filp->f_mode & FMODE_READ)
+		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
+	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
 		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
-		if (retval < 0) /* this can happen only if on == T */
+		if (retval < 0 && (filp->f_mode & FMODE_READ))
+			/* this can happen only if on == T */
 			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
 	}
-	mutex_unlock(&inode->i_mutex);
+	__pipe_unlock(pipe);
 	return retval;
 }
 
-
-static int
-pipe_read_release(struct inode *inode, struct file *filp)
-{
-	return pipe_release(inode, 1, 0);
-}
-
-static int
-pipe_write_release(struct inode *inode, struct file *filp)
-{
-	return pipe_release(inode, 0, 1);
-}
-
-static int
-pipe_rdwr_release(struct inode *inode, struct file *filp)
-{
-	int decr, decw;
-
-	decr = (filp->f_mode & FMODE_READ) != 0;
-	decw = (filp->f_mode & FMODE_WRITE) != 0;
-	return pipe_release(inode, decr, decw);
-}
-
-static int
-pipe_read_open(struct inode *inode, struct file *filp)
-{
-	int ret = -ENOENT;
-
-	mutex_lock(&inode->i_mutex);
-
-	if (inode->i_pipe) {
-		ret = 0;
-		inode->i_pipe->readers++;
-	}
-
-	mutex_unlock(&inode->i_mutex);
-
-	return ret;
-}
-
-static int
-pipe_write_open(struct inode *inode, struct file *filp)
-{
-	int ret = -ENOENT;
-
-	mutex_lock(&inode->i_mutex);
-
-	if (inode->i_pipe) {
-		ret = 0;
-		inode->i_pipe->writers++;
-	}
-
-	mutex_unlock(&inode->i_mutex);
-
-	return ret;
-}
-
-static int
-pipe_rdwr_open(struct inode *inode, struct file *filp)
-{
-	int ret = -ENOENT;
-
-	mutex_lock(&inode->i_mutex);
-
-	if (inode->i_pipe) {
-		ret = 0;
-		if (filp->f_mode & FMODE_READ)
-			inode->i_pipe->readers++;
-		if (filp->f_mode & FMODE_WRITE)
-			inode->i_pipe->writers++;
-	}
-
-	mutex_unlock(&inode->i_mutex);
-
-	return ret;
-}
-
-/*
- * The file_operations structs are not static because they
- * are also used in linux/fs/fifo.c to do operations on FIFOs.
- *
- * Pipes reuse fifos' file_operations structs.
- */
-const struct file_operations read_pipefifo_fops = {
-	.llseek		= no_llseek,
-	.read		= do_sync_read,
-	.aio_read	= pipe_read,
-	.write		= bad_pipe_w,
-	.poll		= pipe_poll,
-	.unlocked_ioctl	= pipe_ioctl,
-	.open		= pipe_read_open,
-	.release	= pipe_read_release,
-	.fasync		= pipe_read_fasync,
-};
-
-const struct file_operations write_pipefifo_fops = {
-	.llseek		= no_llseek,
-	.read		= bad_pipe_r,
-	.write		= do_sync_write,
-	.aio_write	= pipe_write,
-	.poll		= pipe_poll,
-	.unlocked_ioctl	= pipe_ioctl,
-	.open		= pipe_write_open,
-	.release	= pipe_write_release,
-	.fasync		= pipe_write_fasync,
-};
-
-const struct file_operations rdwr_pipefifo_fops = {
-	.llseek		= no_llseek,
-	.read		= do_sync_read,
-	.aio_read	= pipe_read,
-	.write		= do_sync_write,
-	.aio_write	= pipe_write,
-	.poll		= pipe_poll,
-	.unlocked_ioctl	= pipe_ioctl,
-	.open		= pipe_rdwr_open,
-	.release	= pipe_rdwr_release,
-	.fasync		= pipe_rdwr_fasync,
-};
-
-struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
+struct pipe_inode_info *alloc_pipe_info(void)
 {
 	struct pipe_inode_info *pipe;
 
@@ -931,8 +786,8 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
 		if (pipe->bufs) {
 			init_waitqueue_head(&pipe->wait);
 			pipe->r_counter = pipe->w_counter = 1;
-			pipe->inode = inode;
 			pipe->buffers = PIPE_DEF_BUFFERS;
+			mutex_init(&pipe->mutex);
 			return pipe;
 		}
 		kfree(pipe);
@@ -941,7 +796,7 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
 	return NULL;
 }
 
-void __free_pipe_info(struct pipe_inode_info *pipe)
+void free_pipe_info(struct pipe_inode_info *pipe)
 {
 	int i;
 
@@ -956,12 +811,6 @@ void __free_pipe_info(struct pipe_inode_info *pipe)
 	kfree(pipe);
 }
 
-void free_pipe_info(struct inode *inode)
-{
-	__free_pipe_info(inode->i_pipe);
-	inode->i_pipe = NULL;
-}
-
 static struct vfsmount *pipe_mnt __read_mostly;
 
 /*
@@ -987,13 +836,14 @@ static struct inode * get_pipe_inode(void)
 
 	inode->i_ino = get_next_ino();
 
-	pipe = alloc_pipe_info(inode);
+	pipe = alloc_pipe_info();
 	if (!pipe)
 		goto fail_iput;
-	inode->i_pipe = pipe;
 
+	inode->i_pipe = pipe;
+	pipe->files = 2;
 	pipe->readers = pipe->writers = 1;
-	inode->i_fop = &rdwr_pipefifo_fops;
+	inode->i_fop = &pipefifo_fops;
 
 	/*
 	 * Mark the inode dirty from the very beginning,
@@ -1036,17 +886,19 @@ int create_pipe_files(struct file **res, int flags)
 	d_instantiate(path.dentry, inode);
 
 	err = -ENFILE;
-	f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
-	if (!f)
+	f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
+	if (IS_ERR(f))
 		goto err_dentry;
 
 	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
+	f->private_data = inode->i_pipe;
 
-	res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops);
-	if (!res[0])
+	res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
+	if (IS_ERR(res[0]))
 		goto err_file;
 
 	path_get(&path);
+	res[0]->private_data = inode->i_pipe;
 	res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
 	res[1] = f;
 	return 0;
@@ -1054,12 +906,12 @@ int create_pipe_files(struct file **res, int flags)
 err_file:
 	put_filp(f);
 err_dentry:
-	free_pipe_info(inode);
+	free_pipe_info(inode->i_pipe);
 	path_put(&path);
 	return err;
 
 err_inode:
-	free_pipe_info(inode);
+	free_pipe_info(inode->i_pipe);
 	iput(inode);
 	return err;
 }
@@ -1141,6 +993,168 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
 	return sys_pipe2(fildes, 0);
 }
 
+static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
+{
+	int cur = *cnt;	
+
+	while (cur == *cnt) {
+		pipe_wait(pipe);
+		if (signal_pending(current))
+			break;
+	}
+	return cur == *cnt ? -ERESTARTSYS : 0;
+}
+
+static void wake_up_partner(struct pipe_inode_info *pipe)
+{
+	wake_up_interruptible(&pipe->wait);
+}
+
+static int fifo_open(struct inode *inode, struct file *filp)
+{
+	struct pipe_inode_info *pipe;
+	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
+	int kill = 0;
+	int ret;
+
+	filp->f_version = 0;
+
+	spin_lock(&inode->i_lock);
+	if (inode->i_pipe) {
+		pipe = inode->i_pipe;
+		pipe->files++;
+		spin_unlock(&inode->i_lock);
+	} else {
+		spin_unlock(&inode->i_lock);
+		pipe = alloc_pipe_info();
+		if (!pipe)
+			return -ENOMEM;
+		pipe->files = 1;
+		spin_lock(&inode->i_lock);
+		if (unlikely(inode->i_pipe)) {
+			inode->i_pipe->files++;
+			spin_unlock(&inode->i_lock);
+			free_pipe_info(pipe);
+			pipe = inode->i_pipe;
+		} else {
+			inode->i_pipe = pipe;
+			spin_unlock(&inode->i_lock);
+		}
+	}
+	filp->private_data = pipe;
+	/* OK, we have a pipe and it's pinned down */
+
+	__pipe_lock(pipe);
+
+	/* We can only do regular read/write on fifos */
+	filp->f_mode &= (FMODE_READ | FMODE_WRITE);
+
+	switch (filp->f_mode) {
+	case FMODE_READ:
+	/*
+	 *  O_RDONLY
+	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
+	 *  opened, even when there is no process writing the FIFO.
+	 */
+		pipe->r_counter++;
+		if (pipe->readers++ == 0)
+			wake_up_partner(pipe);
+
+		if (!is_pipe && !pipe->writers) {
+			if ((filp->f_flags & O_NONBLOCK)) {
+				/* suppress POLLHUP until we have
+				 * seen a writer */
+				filp->f_version = pipe->w_counter;
+			} else {
+				if (wait_for_partner(pipe, &pipe->w_counter))
+					goto err_rd;
+			}
+		}
+		break;
+	
+	case FMODE_WRITE:
+	/*
+	 *  O_WRONLY
+	 *  POSIX.1 says that O_NONBLOCK means return -1 with
+	 *  errno=ENXIO when there is no process reading the FIFO.
+	 */
+		ret = -ENXIO;
+		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
+			goto err;
+
+		pipe->w_counter++;
+		if (!pipe->writers++)
+			wake_up_partner(pipe);
+
+		if (!is_pipe && !pipe->readers) {
+			if (wait_for_partner(pipe, &pipe->r_counter))
+				goto err_wr;
+		}
+		break;
+	
+	case FMODE_READ | FMODE_WRITE:
+	/*
+	 *  O_RDWR
+	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
+	 *  This implementation will NEVER block on a O_RDWR open, since
+	 *  the process can at least talk to itself.
+	 */
+
+		pipe->readers++;
+		pipe->writers++;
+		pipe->r_counter++;
+		pipe->w_counter++;
+		if (pipe->readers == 1 || pipe->writers == 1)
+			wake_up_partner(pipe);
+		break;
+
+	default:
+		ret = -EINVAL;
+		goto err;
+	}
+
+	/* Ok! */
+	__pipe_unlock(pipe);
+	return 0;
+
+err_rd:
+	if (!--pipe->readers)
+		wake_up_interruptible(&pipe->wait);
+	ret = -ERESTARTSYS;
+	goto err;
+
+err_wr:
+	if (!--pipe->writers)
+		wake_up_interruptible(&pipe->wait);
+	ret = -ERESTARTSYS;
+	goto err;
+
+err:
+	spin_lock(&inode->i_lock);
+	if (!--pipe->files) {
+		inode->i_pipe = NULL;
+		kill = 1;
+	}
+	spin_unlock(&inode->i_lock);
+	__pipe_unlock(pipe);
+	if (kill)
+		free_pipe_info(pipe);
+	return ret;
+}
+
+const struct file_operations pipefifo_fops = {
+	.open		= fifo_open,
+	.llseek		= no_llseek,
+	.read		= do_sync_read,
+	.aio_read	= pipe_read,
+	.write		= do_sync_write,
+	.aio_write	= pipe_write,
+	.poll		= pipe_poll,
+	.unlocked_ioctl	= pipe_ioctl,
+	.release	= pipe_release,
+	.fasync		= pipe_fasync,
+};
+
 /*
  * Allocate a new array of pipe buffers and copy the info over. Returns the
  * pipe size if successful, or return -ERROR on error.
@@ -1226,9 +1240,7 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
  */
 struct pipe_inode_info *get_pipe_info(struct file *file)
 {
-	struct inode *i = file->f_path.dentry->d_inode;
-
-	return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
+	return file->f_op == &pipefifo_fops ? file->private_data : NULL;
 }
 
 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -1240,7 +1252,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 	if (!pipe)
 		return -EBADF;
 
-	mutex_lock(&pipe->inode->i_mutex);
+	__pipe_lock(pipe);
 
 	switch (cmd) {
 	case F_SETPIPE_SZ: {
@@ -1269,7 +1281,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 	}
 
 out:
-	mutex_unlock(&pipe->inode->i_mutex);
+	__pipe_unlock(pipe);
 	return ret;
 }
 
diff --git a/fs/pnode.c b/fs/pnode.c
index 3e000a51ac0..9af0df15256 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -9,6 +9,7 @@
 #include <linux/mnt_namespace.h>
 #include <linux/mount.h>
 #include <linux/fs.h>
+#include <linux/nsproxy.h>
 #include "internal.h"
 #include "pnode.h"
 
@@ -82,7 +83,8 @@ static int do_make_slave(struct mount *mnt)
 		if (peer_mnt == mnt)
 			peer_mnt = NULL;
 	}
-	if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share))
+	if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) &&
+	    list_empty(&mnt->mnt_share))
 		mnt_release_group_id(mnt);
 
 	list_del_init(&mnt->mnt_share);
@@ -217,15 +219,15 @@ static struct mount *get_source(struct mount *dest,
  * @source_mnt: source mount.
  * @tree_list : list of heads of trees to be attached.
  */
-int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
+int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		    struct mount *source_mnt, struct list_head *tree_list)
 {
+	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
 	struct mount *m, *child;
 	int ret = 0;
 	struct mount *prev_dest_mnt = dest_mnt;
 	struct mount *prev_src_mnt  = source_mnt;
 	LIST_HEAD(tmp_list);
-	LIST_HEAD(umount_list);
 
 	for (m = propagation_next(dest_mnt, dest_mnt); m;
 			m = propagation_next(m, dest_mnt)) {
@@ -237,6 +239,10 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
 
 		source =  get_source(m, prev_dest_mnt, prev_src_mnt, &type);
 
+		/* Notice when we are propagating across user namespaces */
+		if (m->mnt_ns->user_ns != user_ns)
+			type |= CL_UNPRIVILEGED;
+
 		child = copy_tree(source, source->mnt.mnt_root, type);
 		if (IS_ERR(child)) {
 			ret = PTR_ERR(child);
@@ -244,8 +250,8 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
 			goto out;
 		}
 
-		if (is_subdir(dest_dentry, m->mnt.mnt_root)) {
-			mnt_set_mountpoint(m, dest_dentry, child);
+		if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) {
+			mnt_set_mountpoint(m, dest_mp, child);
 			list_add_tail(&child->mnt_hash, tree_list);
 		} else {
 			/*
@@ -261,10 +267,9 @@ out:
 	br_write_lock(&vfsmount_lock);
 	while (!list_empty(&tmp_list)) {
 		child = list_first_entry(&tmp_list, struct mount, mnt_hash);
-		umount_tree(child, 0, &umount_list);
+		umount_tree(child, 0);
 	}
 	br_write_unlock(&vfsmount_lock);
-	release_mounts(&umount_list);
 	return ret;
 }
 
diff --git a/fs/pnode.h b/fs/pnode.h
index 19b853a3445..b091445c1c4 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -23,6 +23,7 @@
 #define CL_MAKE_SHARED 		0x08
 #define CL_PRIVATE 		0x10
 #define CL_SHARED_TO_SLAVE	0x20
+#define CL_UNPRIVILEGED		0x40
 
 static inline void set_mnt_shared(struct mount *mnt)
 {
@@ -31,17 +32,16 @@ static inline void set_mnt_shared(struct mount *mnt)
 }
 
 void change_mnt_propagation(struct mount *, int);
-int propagate_mnt(struct mount *, struct dentry *, struct mount *,
+int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
 		struct list_head *);
 int propagate_umount(struct list_head *);
 int propagate_mount_busy(struct mount *, int);
 void mnt_release_group_id(struct mount *);
 int get_dominating_id(struct mount *mnt, const struct path *root);
 unsigned int mnt_get_count(struct mount *mnt);
-void mnt_set_mountpoint(struct mount *, struct dentry *,
+void mnt_set_mountpoint(struct mount *, struct mountpoint *,
 			struct mount *);
-void release_mounts(struct list_head *);
-void umount_tree(struct mount *, int, struct list_head *);
+void umount_tree(struct mount *, int);
 struct mount *copy_tree(struct mount *, struct dentry *, int);
 bool is_path_reachable(struct mount *, struct dentry *,
 			 const struct path *root);
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 981b0560193..ab30716584f 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -5,10 +5,11 @@
 obj-y   += proc.o
 
 proc-y			:= nommu.o task_nommu.o
-proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
+proc-$(CONFIG_MMU)	:= task_mmu.o
 
 proc-y       += inode.o root.o base.o generic.o array.o \
-		proc_tty.o fd.o
+		fd.o
+proc-$(CONFIG_TTY)      += proc_tty.o
 proc-y	+= cmdline.o
 proc-y	+= consoles.o
 proc-y	+= cpuinfo.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 6a91e6ffbcb..cbd0f1b324b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -143,6 +143,7 @@ static const char * const task_state_array[] = {
 	"x (dead)",		/*  64 */
 	"K (wakekill)",		/* 128 */
 	"W (waking)",		/* 256 */
+	"P (parked)",		/* 512 */
 };
 
 static inline const char *get_task_state(struct task_struct *tsk)
@@ -449,7 +450,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 			do {
 				min_flt += t->min_flt;
 				maj_flt += t->maj_flt;
-				gtime += t->gtime;
+				gtime += task_gtime(t);
 				t = next_thread(t);
 			} while (t != task);
 
@@ -472,7 +473,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
 		task_cputime_adjusted(task, &utime, &stime);
-		gtime = task->gtime;
+		gtime = task_gtime(task);
 	}
 
 	/* scale priority and nice values from timeslices to -20..20 */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9b43ff77a51..1485e38daaa 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -73,6 +73,7 @@
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
+#include <linux/printk.h>
 #include <linux/cgroup.h>
 #include <linux/cpuset.h>
 #include <linux/audit.h>
@@ -85,6 +86,7 @@
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
 #include <linux/flex_array.h>
+#include <linux/posix-timers.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
@@ -383,7 +385,7 @@ static int lstats_open(struct inode *inode, struct file *file)
 static ssize_t lstats_write(struct file *file, const char __user *buf,
 			    size_t count, loff_t *offs)
 {
-	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
+	struct task_struct *task = get_proc_task(file_inode(file));
 
 	if (!task)
 		return -ESRCH;
@@ -403,6 +405,37 @@ static const struct file_operations proc_lstats_operations = {
 
 #endif
 
+#ifdef CONFIG_CGROUPS
+static int cgroup_open(struct inode *inode, struct file *file)
+{
+	struct pid *pid = PROC_I(inode)->pid;
+	return single_open(file, proc_cgroup_show, pid);
+}
+
+static const struct file_operations proc_cgroup_operations = {
+	.open		= cgroup_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
+#ifdef CONFIG_PROC_PID_CPUSET
+
+static int cpuset_open(struct inode *inode, struct file *file)
+{
+	struct pid *pid = PROC_I(inode)->pid;
+	return single_open(file, proc_cpuset_show, pid);
+}
+
+static const struct file_operations proc_cpuset_operations = {
+	.open		= cpuset_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long totalpages = totalram_pages + total_swap_pages;
@@ -602,7 +635,7 @@ static const struct inode_operations proc_def_inode_operations = {
 static ssize_t proc_info_read(struct file * file, char __user * buf,
 			  size_t count, loff_t *ppos)
 {
-	struct inode * inode = file->f_path.dentry->d_inode;
+	struct inode * inode = file_inode(file);
 	unsigned long page;
 	ssize_t length;
 	struct task_struct *task = get_proc_task(inode);
@@ -668,7 +701,7 @@ static const struct file_operations proc_single_file_operations = {
 
 static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
 {
-	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	struct task_struct *task = get_proc_task(file_inode(file));
 	struct mm_struct *mm;
 
 	if (!task)
@@ -869,7 +902,7 @@ static const struct file_operations proc_environ_operations = {
 static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
 			    loff_t *ppos)
 {
-	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	struct task_struct *task = get_proc_task(file_inode(file));
 	char buffer[PROC_NUMBUF];
 	int oom_adj = OOM_ADJUST_MIN;
 	size_t len;
@@ -916,7 +949,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
 		goto out;
 	}
 
-	task = get_proc_task(file->f_path.dentry->d_inode);
+	task = get_proc_task(file_inode(file));
 	if (!task) {
 		err = -ESRCH;
 		goto out;
@@ -952,7 +985,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
 	 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
 	 * /proc/pid/oom_score_adj instead.
 	 */
-	printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+	pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
 		  current->comm, task_pid_nr(current), task_pid_nr(task),
 		  task_pid_nr(task));
 
@@ -976,7 +1009,7 @@ static const struct file_operations proc_oom_adj_operations = {
 static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 					size_t count, loff_t *ppos)
 {
-	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	struct task_struct *task = get_proc_task(file_inode(file));
 	char buffer[PROC_NUMBUF];
 	short oom_score_adj = OOM_SCORE_ADJ_MIN;
 	unsigned long flags;
@@ -1019,7 +1052,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 		goto out;
 	}
 
-	task = get_proc_task(file->f_path.dentry->d_inode);
+	task = get_proc_task(file_inode(file));
 	if (!task) {
 		err = -ESRCH;
 		goto out;
@@ -1067,7 +1100,7 @@ static const struct file_operations proc_oom_score_adj_operations = {
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
 				  size_t count, loff_t *ppos)
 {
-	struct inode * inode = file->f_path.dentry->d_inode;
+	struct inode * inode = file_inode(file);
 	struct task_struct *task = get_proc_task(inode);
 	ssize_t length;
 	char tmpbuf[TMPBUFLEN];
@@ -1084,7 +1117,7 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
 static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 				   size_t count, loff_t *ppos)
 {
-	struct inode * inode = file->f_path.dentry->d_inode;
+	struct inode * inode = file_inode(file);
 	char *page, *tmp;
 	ssize_t length;
 	uid_t loginuid;
@@ -1142,7 +1175,7 @@ static const struct file_operations proc_loginuid_operations = {
 static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
 				  size_t count, loff_t *ppos)
 {
-	struct inode * inode = file->f_path.dentry->d_inode;
+	struct inode * inode = file_inode(file);
 	struct task_struct *task = get_proc_task(inode);
 	ssize_t length;
 	char tmpbuf[TMPBUFLEN];
@@ -1165,7 +1198,7 @@ static const struct file_operations proc_sessionid_operations = {
 static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
 				      size_t count, loff_t *ppos)
 {
-	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
+	struct task_struct *task = get_proc_task(file_inode(file));
 	char buffer[PROC_NUMBUF];
 	size_t len;
 	int make_it_fail;
@@ -1197,7 +1230,7 @@ static ssize_t proc_fault_inject_write(struct file * file,
 	make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
 	if (*end)
 		return -EINVAL;
-	task = get_proc_task(file->f_dentry->d_inode);
+	task = get_proc_task(file_inode(file));
 	if (!task)
 		return -ESRCH;
 	task->make_it_fail = make_it_fail;
@@ -1237,7 +1270,7 @@ static ssize_t
 sched_write(struct file *file, const char __user *buf,
 	    size_t count, loff_t *offset)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct task_struct *p;
 
 	p = get_proc_task(inode);
@@ -1288,7 +1321,7 @@ static ssize_t
 sched_autogroup_write(struct file *file, const char __user *buf,
 	    size_t count, loff_t *offset)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct task_struct *p;
 	char buffer[PROC_NUMBUF];
 	int nice;
@@ -1343,14 +1376,13 @@ static const struct file_operations proc_pid_sched_autogroup_operations = {
 static ssize_t comm_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *offset)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct task_struct *p;
 	char buffer[TASK_COMM_LEN];
+	const size_t maxlen = sizeof(buffer) - 1;
 
 	memset(buffer, 0, sizeof(buffer));
-	if (count > sizeof(buffer) - 1)
-		count = sizeof(buffer) - 1;
-	if (copy_from_user(buffer, buf, count))
+	if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
 		return -EFAULT;
 
 	p = get_proc_task(inode);
@@ -1620,6 +1652,15 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
 	return 0;
 }
 
+int pid_delete_dentry(const struct dentry *dentry)
+{
+	/* Is the task we represent dead?
+	 * If so, then don't put the dentry on the lru list,
+	 * kill it immediately.
+	 */
+	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
+}
+
 const struct dentry_operations pid_dentry_operations =
 {
 	.d_revalidate	= pid_revalidate,
@@ -1640,46 +1681,34 @@ const struct dentry_operations pid_dentry_operations =
  * reported by readdir in sync with the inode numbers reported
  * by stat.
  */
-int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+bool proc_fill_cache(struct file *file, struct dir_context *ctx,
 	const char *name, int len,
 	instantiate_t instantiate, struct task_struct *task, const void *ptr)
 {
-	struct dentry *child, *dir = filp->f_path.dentry;
+	struct dentry *child, *dir = file->f_path.dentry;
+	struct qstr qname = QSTR_INIT(name, len);
 	struct inode *inode;
-	struct qstr qname;
-	ino_t ino = 0;
-	unsigned type = DT_UNKNOWN;
-
-	qname.name = name;
-	qname.len  = len;
-	qname.hash = full_name_hash(name, len);
+	unsigned type;
+	ino_t ino;
 
-	child = d_lookup(dir, &qname);
+	child = d_hash_and_lookup(dir, &qname);
 	if (!child) {
-		struct dentry *new;
-		new = d_alloc(dir, &qname);
-		if (new) {
-			child = instantiate(dir->d_inode, new, task, ptr);
-			if (child)
-				dput(new);
-			else
-				child = new;
+		child = d_alloc(dir, &qname);
+		if (!child)
+			goto end_instantiate;
+		if (instantiate(dir->d_inode, child, task, ptr) < 0) {
+			dput(child);
+			goto end_instantiate;
 		}
 	}
-	if (!child || IS_ERR(child) || !child->d_inode)
-		goto end_instantiate;
 	inode = child->d_inode;
-	if (inode) {
-		ino = inode->i_ino;
-		type = inode->i_mode >> 12;
-	}
+	ino = inode->i_ino;
+	type = inode->i_mode >> 12;
 	dput(child);
+	return dir_emit(ctx, name, len, ino, type);
+
 end_instantiate:
-	if (!ino)
-		ino = find_inode_number(dir, &qname);
-	if (!ino)
-		ino = 1;
-	return filldir(dirent, name, len, filp->f_pos, ino, type);
+	return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
 }
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
@@ -1711,7 +1740,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
 		return -ECHILD;
 
 	if (!capable(CAP_SYS_ADMIN)) {
-		status = -EACCES;
+		status = -EPERM;
 		goto out_notask;
 	}
 
@@ -1805,7 +1834,7 @@ struct map_files_info {
 	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
 };
 
-static struct dentry *
+static int
 proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
 			   struct task_struct *task, const void *ptr)
 {
@@ -1815,7 +1844,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
 
 	inode = proc_pid_make_inode(dir->i_sb, task);
 	if (!inode)
-		return ERR_PTR(-ENOENT);
+		return -ENOENT;
 
 	ei = PROC_I(inode);
 	ei->op.proc_get_link = proc_map_files_get_link;
@@ -1832,7 +1861,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
 	d_set_d_op(dentry, &tid_map_files_dentry_operations);
 	d_add(dentry, inode);
 
-	return NULL;
+	return 0;
 }
 
 static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -1841,23 +1870,23 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
 	unsigned long vm_start, vm_end;
 	struct vm_area_struct *vma;
 	struct task_struct *task;
-	struct dentry *result;
+	int result;
 	struct mm_struct *mm;
 
-	result = ERR_PTR(-EACCES);
+	result = -EPERM;
 	if (!capable(CAP_SYS_ADMIN))
 		goto out;
 
-	result = ERR_PTR(-ENOENT);
+	result = -ENOENT;
 	task = get_proc_task(dir);
 	if (!task)
 		goto out;
 
-	result = ERR_PTR(-EACCES);
+	result = -EACCES;
 	if (!ptrace_may_access(task, PTRACE_MODE_READ))
 		goto out_put_task;
 
-	result = ERR_PTR(-ENOENT);
+	result = -ENOENT;
 	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
 		goto out_put_task;
 
@@ -1880,7 +1909,7 @@ out_no_vma:
 out_put_task:
 	put_task_struct(task);
 out:
-	return result;
+	return ERR_PTR(result);
 }
 
 static const struct inode_operations proc_map_files_inode_operations = {
@@ -1890,22 +1919,23 @@ static const struct inode_operations proc_map_files_inode_operations = {
 };
 
 static int
-proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+proc_map_files_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
 	struct vm_area_struct *vma;
 	struct task_struct *task;
 	struct mm_struct *mm;
-	ino_t ino;
+	unsigned long nr_files, pos, i;
+	struct flex_array *fa = NULL;
+	struct map_files_info info;
+	struct map_files_info *p;
 	int ret;
 
-	ret = -EACCES;
+	ret = -EPERM;
 	if (!capable(CAP_SYS_ADMIN))
 		goto out;
 
 	ret = -ENOENT;
-	task = get_proc_task(inode);
+	task = get_proc_task(file_inode(file));
 	if (!task)
 		goto out;
 
@@ -1914,91 +1944,73 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		goto out_put_task;
 
 	ret = 0;
-	switch (filp->f_pos) {
-	case 0:
-		ino = inode->i_ino;
-		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
-			goto out_put_task;
-		filp->f_pos++;
-	case 1:
-		ino = parent_ino(dentry);
-		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
-			goto out_put_task;
-		filp->f_pos++;
-	default:
-	{
-		unsigned long nr_files, pos, i;
-		struct flex_array *fa = NULL;
-		struct map_files_info info;
-		struct map_files_info *p;
-
-		mm = get_task_mm(task);
-		if (!mm)
-			goto out_put_task;
-		down_read(&mm->mmap_sem);
+	if (!dir_emit_dots(file, ctx))
+		goto out_put_task;
 
-		nr_files = 0;
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_put_task;
+	down_read(&mm->mmap_sem);
 
-		/*
-		 * We need two passes here:
-		 *
-		 *  1) Collect vmas of mapped files with mmap_sem taken
-		 *  2) Release mmap_sem and instantiate entries
-		 *
-		 * otherwise we get lockdep complained, since filldir()
-		 * routine might require mmap_sem taken in might_fault().
-		 */
+	nr_files = 0;
 
-		for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
-			if (vma->vm_file && ++pos > filp->f_pos)
-				nr_files++;
-		}
+	/*
+	 * We need two passes here:
+	 *
+	 *  1) Collect vmas of mapped files with mmap_sem taken
+	 *  2) Release mmap_sem and instantiate entries
+	 *
+	 * otherwise we get lockdep complained, since filldir()
+	 * routine might require mmap_sem taken in might_fault().
+	 */
 
-		if (nr_files) {
-			fa = flex_array_alloc(sizeof(info), nr_files,
-						GFP_KERNEL);
-			if (!fa || flex_array_prealloc(fa, 0, nr_files,
-							GFP_KERNEL)) {
-				ret = -ENOMEM;
-				if (fa)
-					flex_array_free(fa);
-				up_read(&mm->mmap_sem);
-				mmput(mm);
-				goto out_put_task;
-			}
-			for (i = 0, vma = mm->mmap, pos = 2; vma;
-					vma = vma->vm_next) {
-				if (!vma->vm_file)
-					continue;
-				if (++pos <= filp->f_pos)
-					continue;
-
-				info.mode = vma->vm_file->f_mode;
-				info.len = snprintf(info.name,
-						sizeof(info.name), "%lx-%lx",
-						vma->vm_start, vma->vm_end);
-				if (flex_array_put(fa, i++, &info, GFP_KERNEL))
-					BUG();
-			}
+	for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+		if (vma->vm_file && ++pos > ctx->pos)
+			nr_files++;
+	}
+
+	if (nr_files) {
+		fa = flex_array_alloc(sizeof(info), nr_files,
+					GFP_KERNEL);
+		if (!fa || flex_array_prealloc(fa, 0, nr_files,
+						GFP_KERNEL)) {
+			ret = -ENOMEM;
+			if (fa)
+				flex_array_free(fa);
+			up_read(&mm->mmap_sem);
+			mmput(mm);
+			goto out_put_task;
 		}
-		up_read(&mm->mmap_sem);
-
-		for (i = 0; i < nr_files; i++) {
-			p = flex_array_get(fa, i);
-			ret = proc_fill_cache(filp, dirent, filldir,
-					      p->name, p->len,
-					      proc_map_files_instantiate,
-					      task,
-					      (void *)(unsigned long)p->mode);
-			if (ret)
-				break;
-			filp->f_pos++;
+		for (i = 0, vma = mm->mmap, pos = 2; vma;
+				vma = vma->vm_next) {
+			if (!vma->vm_file)
+				continue;
+			if (++pos <= ctx->pos)
+				continue;
+
+			info.mode = vma->vm_file->f_mode;
+			info.len = snprintf(info.name,
+					sizeof(info.name), "%lx-%lx",
+					vma->vm_start, vma->vm_end);
+			if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+				BUG();
 		}
-		if (fa)
-			flex_array_free(fa);
-		mmput(mm);
 	}
+	up_read(&mm->mmap_sem);
+
+	for (i = 0; i < nr_files; i++) {
+		p = flex_array_get(fa, i);
+		if (!proc_fill_cache(file, ctx,
+				      p->name, p->len,
+				      proc_map_files_instantiate,
+				      task,
+				      (void *)(unsigned long)p->mode))
+			break;
+		ctx->pos++;
 	}
+	if (fa)
+		flex_array_free(fa);
+	mmput(mm);
 
 out_put_task:
 	put_task_struct(task);
@@ -2008,19 +2020,115 @@ out:
 
 static const struct file_operations proc_map_files_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_map_files_readdir,
+	.iterate	= proc_map_files_readdir,
 	.llseek		= default_llseek,
 };
 
+struct timers_private {
+	struct pid *pid;
+	struct task_struct *task;
+	struct sighand_struct *sighand;
+	struct pid_namespace *ns;
+	unsigned long flags;
+};
+
+static void *timers_start(struct seq_file *m, loff_t *pos)
+{
+	struct timers_private *tp = m->private;
+
+	tp->task = get_pid_task(tp->pid, PIDTYPE_PID);
+	if (!tp->task)
+		return ERR_PTR(-ESRCH);
+
+	tp->sighand = lock_task_sighand(tp->task, &tp->flags);
+	if (!tp->sighand)
+		return ERR_PTR(-ESRCH);
+
+	return seq_list_start(&tp->task->signal->posix_timers, *pos);
+}
+
+static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct timers_private *tp = m->private;
+	return seq_list_next(v, &tp->task->signal->posix_timers, pos);
+}
+
+static void timers_stop(struct seq_file *m, void *v)
+{
+	struct timers_private *tp = m->private;
+
+	if (tp->sighand) {
+		unlock_task_sighand(tp->task, &tp->flags);
+		tp->sighand = NULL;
+	}
+
+	if (tp->task) {
+		put_task_struct(tp->task);
+		tp->task = NULL;
+	}
+}
+
+static int show_timer(struct seq_file *m, void *v)
+{
+	struct k_itimer *timer;
+	struct timers_private *tp = m->private;
+	int notify;
+	static char *nstr[] = {
+		[SIGEV_SIGNAL] = "signal",
+		[SIGEV_NONE] = "none",
+		[SIGEV_THREAD] = "thread",
+	};
+
+	timer = list_entry((struct list_head *)v, struct k_itimer, list);
+	notify = timer->it_sigev_notify;
+
+	seq_printf(m, "ID: %d\n", timer->it_id);
+	seq_printf(m, "signal: %d/%p\n", timer->sigq->info.si_signo,
+			timer->sigq->info.si_value.sival_ptr);
+	seq_printf(m, "notify: %s/%s.%d\n",
+		nstr[notify & ~SIGEV_THREAD_ID],
+		(notify & SIGEV_THREAD_ID) ? "tid" : "pid",
+		pid_nr_ns(timer->it_pid, tp->ns));
+	seq_printf(m, "ClockID: %d\n", timer->it_clock);
+
+	return 0;
+}
+
+static const struct seq_operations proc_timers_seq_ops = {
+	.start	= timers_start,
+	.next	= timers_next,
+	.stop	= timers_stop,
+	.show	= show_timer,
+};
+
+static int proc_timers_open(struct inode *inode, struct file *file)
+{
+	struct timers_private *tp;
+
+	tp = __seq_open_private(file, &proc_timers_seq_ops,
+			sizeof(struct timers_private));
+	if (!tp)
+		return -ENOMEM;
+
+	tp->pid = proc_pid(inode);
+	tp->ns = inode->i_sb->s_fs_info;
+	return 0;
+}
+
+static const struct file_operations proc_timers_operations = {
+	.open		= proc_timers_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+};
 #endif /* CONFIG_CHECKPOINT_RESTORE */
 
-static struct dentry *proc_pident_instantiate(struct inode *dir,
+static int proc_pident_instantiate(struct inode *dir,
 	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
 	const struct pid_entry *p = ptr;
 	struct inode *inode;
 	struct proc_inode *ei;
-	struct dentry *error = ERR_PTR(-ENOENT);
 
 	inode = proc_pid_make_inode(dir->i_sb, task);
 	if (!inode)
@@ -2039,9 +2147,9 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
 	if (pid_revalidate(dentry, 0))
-		error = NULL;
+		return 0;
 out:
-	return error;
+	return -ENOENT;
 }
 
 static struct dentry *proc_pident_lookup(struct inode *dir, 
@@ -2049,11 +2157,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 					 const struct pid_entry *ents,
 					 unsigned int nents)
 {
-	struct dentry *error;
+	int error;
 	struct task_struct *task = get_proc_task(dir);
 	const struct pid_entry *p, *last;
 
-	error = ERR_PTR(-ENOENT);
+	error = -ENOENT;
 
 	if (!task)
 		goto out_no_task;
@@ -2076,77 +2184,40 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 out:
 	put_task_struct(task);
 out_no_task:
-	return error;
-}
-
-static int proc_pident_fill_cache(struct file *filp, void *dirent,
-	filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
-{
-	return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
-				proc_pident_instantiate, task, p);
+	return ERR_PTR(error);
 }
 
-static int proc_pident_readdir(struct file *filp,
-		void *dirent, filldir_t filldir,
+static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
 		const struct pid_entry *ents, unsigned int nents)
 {
-	int i;
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = get_proc_task(inode);
-	const struct pid_entry *p, *last;
-	ino_t ino;
-	int ret;
+	struct task_struct *task = get_proc_task(file_inode(file));
+	const struct pid_entry *p;
 
-	ret = -ENOENT;
 	if (!task)
-		goto out_no_task;
+		return -ENOENT;
 
-	ret = 0;
-	i = filp->f_pos;
-	switch (i) {
-	case 0:
-		ino = inode->i_ino;
-		if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
-		ino = parent_ino(dentry);
-		if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
-			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall through */
-	default:
-		i -= 2;
-		if (i >= nents) {
-			ret = 1;
-			goto out;
-		}
-		p = ents + i;
-		last = &ents[nents - 1];
-		while (p <= last) {
-			if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
-				goto out;
-			filp->f_pos++;
-			p++;
-		}
-	}
+	if (!dir_emit_dots(file, ctx))
+		goto out;
 
-	ret = 1;
+	if (ctx->pos >= nents + 2)
+		goto out;
+
+	for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
+		if (!proc_fill_cache(file, ctx, p->name, p->len,
+				proc_pident_instantiate, task, p))
+			break;
+		ctx->pos++;
+	}
 out:
 	put_task_struct(task);
-out_no_task:
-	return ret;
+	return 0;
 }
 
 #ifdef CONFIG_SECURITY
 static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
 				  size_t count, loff_t *ppos)
 {
-	struct inode * inode = file->f_path.dentry->d_inode;
+	struct inode * inode = file_inode(file);
 	char *p = NULL;
 	ssize_t length;
 	struct task_struct *task = get_proc_task(inode);
@@ -2167,7 +2238,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
 static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 				   size_t count, loff_t *ppos)
 {
-	struct inode * inode = file->f_path.dentry->d_inode;
+	struct inode * inode = file_inode(file);
 	char *page;
 	ssize_t length;
 	struct task_struct *task = get_proc_task(inode);
@@ -2224,16 +2295,15 @@ static const struct pid_entry attr_dir_stuff[] = {
 	REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
 };
 
-static int proc_attr_dir_readdir(struct file * filp,
-			     void * dirent, filldir_t filldir)
+static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
 {
-	return proc_pident_readdir(filp,dirent,filldir,
-				   attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff));
+	return proc_pident_readdir(file, ctx, 
+				   attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
 }
 
 static const struct file_operations proc_attr_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_attr_dir_readdir,
+	.iterate	= proc_attr_dir_readdir,
 	.llseek		= default_llseek,
 };
 
@@ -2256,7 +2326,7 @@ static const struct inode_operations proc_attr_dir_inode_operations = {
 static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
 					 size_t count, loff_t *ppos)
 {
-	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
+	struct task_struct *task = get_proc_task(file_inode(file));
 	struct mm_struct *mm;
 	char buffer[PROC_NUMBUF];
 	size_t len;
@@ -2308,7 +2378,7 @@ static ssize_t proc_coredump_filter_write(struct file *file,
 		goto out_no_task;
 
 	ret = -ESRCH;
-	task = get_proc_task(file->f_dentry->d_inode);
+	task = get_proc_task(file_inode(file));
 	if (!task)
 		goto out_no_task;
 
@@ -2582,18 +2652,20 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
 	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
 #endif
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	REG("timers",	  S_IRUGO, proc_timers_operations),
+#endif
 };
 
-static int proc_tgid_base_readdir(struct file * filp,
-			     void * dirent, filldir_t filldir)
+static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
 {
-	return proc_pident_readdir(filp,dirent,filldir,
-				   tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff));
+	return proc_pident_readdir(file, ctx,
+				   tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
 }
 
 static const struct file_operations proc_tgid_base_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_tgid_base_readdir,
+	.iterate	= proc_tgid_base_readdir,
 	.llseek		= default_llseek,
 };
 
@@ -2618,6 +2690,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
 
 	name.name = buf;
 	name.len = snprintf(buf, sizeof(buf), "%d", pid);
+	/* no ->d_hash() rejects on procfs */
 	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
 	if (dentry) {
 		shrink_dcache_parent(dentry);
@@ -2694,11 +2767,10 @@ void proc_flush_task(struct task_struct *task)
 	}
 }
 
-static struct dentry *proc_pid_instantiate(struct inode *dir,
-					   struct dentry * dentry,
-					   struct task_struct *task, const void *ptr)
+static int proc_pid_instantiate(struct inode *dir,
+				   struct dentry * dentry,
+				   struct task_struct *task, const void *ptr)
 {
-	struct dentry *error = ERR_PTR(-ENOENT);
 	struct inode *inode;
 
 	inode = proc_pid_make_inode(dir->i_sb, task);
@@ -2718,14 +2790,14 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
 	if (pid_revalidate(dentry, 0))
-		error = NULL;
+		return 0;
 out:
-	return error;
+	return -ENOENT;
 }
 
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
 {
-	struct dentry *result = NULL;
+	int result = 0;
 	struct task_struct *task;
 	unsigned tgid;
 	struct pid_namespace *ns;
@@ -2746,7 +2818,7 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign
 	result = proc_pid_instantiate(dir, dentry, task, NULL);
 	put_task_struct(task);
 out:
-	return result;
+	return ERR_PTR(result);
 }
 
 /*
@@ -2792,52 +2864,44 @@ retry:
 	return iter;
 }
 
-#define TGID_OFFSET (FIRST_PROCESS_ENTRY)
-
-static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-	struct tgid_iter iter)
-{
-	char name[PROC_NUMBUF];
-	int len = snprintf(name, sizeof(name), "%d", iter.tgid);
-	return proc_fill_cache(filp, dirent, filldir, name, len,
-				proc_pid_instantiate, iter.task, NULL);
-}
-
-static int fake_filldir(void *buf, const char *name, int namelen,
-			loff_t offset, u64 ino, unsigned d_type)
-{
-	return 0;
-}
+#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1)
 
 /* for the /proc/ directory itself, after non-process stuff has been done */
-int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int proc_pid_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct tgid_iter iter;
-	struct pid_namespace *ns;
-	filldir_t __filldir;
+	struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info;
+	loff_t pos = ctx->pos;
 
-	if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
-		goto out;
+	if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
+		return 0;
 
-	ns = filp->f_dentry->d_sb->s_fs_info;
+	if (pos == TGID_OFFSET - 1) {
+		struct inode *inode = ns->proc_self->d_inode;
+		if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
+			return 0;
+		iter.tgid = 0;
+	} else {
+		iter.tgid = pos - TGID_OFFSET;
+	}
 	iter.task = NULL;
-	iter.tgid = filp->f_pos - TGID_OFFSET;
 	for (iter = next_tgid(ns, iter);
 	     iter.task;
 	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
-		if (has_pid_permissions(ns, iter.task, 2))
-			__filldir = filldir;
-		else
-			__filldir = fake_filldir;
+		char name[PROC_NUMBUF];
+		int len;
+		if (!has_pid_permissions(ns, iter.task, 2))
+			continue;
 
-		filp->f_pos = iter.tgid + TGID_OFFSET;
-		if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) {
+		len = snprintf(name, sizeof(name), "%d", iter.tgid);
+		ctx->pos = iter.tgid + TGID_OFFSET;
+		if (!proc_fill_cache(file, ctx, name, len,
+				     proc_pid_instantiate, iter.task, NULL)) {
 			put_task_struct(iter.task);
-			goto out;
+			return 0;
 		}
 	}
-	filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
-out:
+	ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
 	return 0;
 }
 
@@ -2925,11 +2989,10 @@ static const struct pid_entry tid_base_stuff[] = {
 #endif
 };
 
-static int proc_tid_base_readdir(struct file * filp,
-			     void * dirent, filldir_t filldir)
+static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
 {
-	return proc_pident_readdir(filp,dirent,filldir,
-				   tid_base_stuff,ARRAY_SIZE(tid_base_stuff));
+	return proc_pident_readdir(file, ctx,
+				   tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
 }
 
 static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -2940,7 +3003,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
 
 static const struct file_operations proc_tid_base_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_tid_base_readdir,
+	.iterate	= proc_tid_base_readdir,
 	.llseek		= default_llseek,
 };
 
@@ -2950,10 +3013,9 @@ static const struct inode_operations proc_tid_base_inode_operations = {
 	.setattr	= proc_setattr,
 };
 
-static struct dentry *proc_task_instantiate(struct inode *dir,
+static int proc_task_instantiate(struct inode *dir,
 	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
-	struct dentry *error = ERR_PTR(-ENOENT);
 	struct inode *inode;
 	inode = proc_pid_make_inode(dir->i_sb, task);
 
@@ -2972,14 +3034,14 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
 	if (pid_revalidate(dentry, 0))
-		error = NULL;
+		return 0;
 out:
-	return error;
+	return -ENOENT;
 }
 
 static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
 {
-	struct dentry *result = ERR_PTR(-ENOENT);
+	int result = -ENOENT;
 	struct task_struct *task;
 	struct task_struct *leader = get_proc_task(dir);
 	unsigned tid;
@@ -3009,7 +3071,7 @@ out_drop_task:
 out:
 	put_task_struct(leader);
 out_no_task:
-	return result;
+	return ERR_PTR(result);
 }
 
 /*
@@ -3081,30 +3143,16 @@ static struct task_struct *next_tid(struct task_struct *start)
 	return pos;
 }
 
-static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-	struct task_struct *task, int tid)
-{
-	char name[PROC_NUMBUF];
-	int len = snprintf(name, sizeof(name), "%d", tid);
-	return proc_fill_cache(filp, dirent, filldir, name, len,
-				proc_task_instantiate, task, NULL);
-}
-
 /* for the /proc/TGID/task/ directories */
-static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int proc_task_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
 	struct task_struct *leader = NULL;
-	struct task_struct *task;
-	int retval = -ENOENT;
-	ino_t ino;
-	int tid;
+	struct task_struct *task = get_proc_task(file_inode(file));
 	struct pid_namespace *ns;
+	int tid;
 
-	task = get_proc_task(inode);
 	if (!task)
-		goto out_no_task;
+		return -ENOENT;
 	rcu_read_lock();
 	if (pid_alive(task)) {
 		leader = task->group_leader;
@@ -3113,46 +3161,36 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 	rcu_read_unlock();
 	put_task_struct(task);
 	if (!leader)
-		goto out_no_task;
-	retval = 0;
+		return -ENOENT;
 
-	switch ((unsigned long)filp->f_pos) {
-	case 0:
-		ino = inode->i_ino;
-		if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
-			goto out;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
-		ino = parent_ino(dentry);
-		if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
-			goto out;
-		filp->f_pos++;
-		/* fall through */
-	}
+	if (!dir_emit_dots(file, ctx))
+		goto out;
 
 	/* f_version caches the tgid value that the last readdir call couldn't
 	 * return. lseek aka telldir automagically resets f_version to 0.
 	 */
-	ns = filp->f_dentry->d_sb->s_fs_info;
-	tid = (int)filp->f_version;
-	filp->f_version = 0;
-	for (task = first_tid(leader, tid, filp->f_pos - 2, ns);
+	ns = file->f_dentry->d_sb->s_fs_info;
+	tid = (int)file->f_version;
+	file->f_version = 0;
+	for (task = first_tid(leader, tid, ctx->pos - 2, ns);
 	     task;
-	     task = next_tid(task), filp->f_pos++) {
+	     task = next_tid(task), ctx->pos++) {
+		char name[PROC_NUMBUF];
+		int len;
 		tid = task_pid_nr_ns(task, ns);
-		if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
+		len = snprintf(name, sizeof(name), "%d", tid);
+		if (!proc_fill_cache(file, ctx, name, len,
+				proc_task_instantiate, task, NULL)) {
 			/* returning this tgid failed, save it as the first
 			 * pid for the next readir call */
-			filp->f_version = (u64)tid;
+			file->f_version = (u64)tid;
 			put_task_struct(task);
 			break;
 		}
 	}
 out:
 	put_task_struct(leader);
-out_no_task:
-	return retval;
+	return 0;
 }
 
 static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -3178,6 +3216,6 @@ static const struct inode_operations proc_task_inode_operations = {
 
 static const struct file_operations proc_task_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_task_readdir,
+	.iterate	= proc_task_readdir,
 	.llseek		= default_llseek,
 };
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index d7a4a28ef63..75f2890abbd 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -167,11 +167,10 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
 	return ret;
 }
 
-static struct dentry *
+static int
 proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
 		    struct task_struct *task, const void *ptr)
 {
-	struct dentry *error = ERR_PTR(-ENOENT);
 	unsigned fd = (unsigned long)ptr;
 	struct proc_inode *ei;
 	struct inode *inode;
@@ -194,9 +193,9 @@ proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
 
 	/* Close the race of the process dying before we return the dentry */
 	if (tid_fd_revalidate(dentry, 0))
-		error = NULL;
+		return 0;
  out:
-	return error;
+	return -ENOENT;
 }
 
 static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -204,7 +203,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
 					   instantiate_t instantiate)
 {
 	struct task_struct *task = get_proc_task(dir);
-	struct dentry *result = ERR_PTR(-ENOENT);
+	int result = -ENOENT;
 	unsigned fd = name_to_int(dentry);
 
 	if (!task)
@@ -216,77 +215,61 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
 out:
 	put_task_struct(task);
 out_no_task:
-	return result;
+	return ERR_PTR(result);
 }
 
-static int proc_readfd_common(struct file * filp, void * dirent,
-			      filldir_t filldir, instantiate_t instantiate)
+static int proc_readfd_common(struct file *file, struct dir_context *ctx,
+			      instantiate_t instantiate)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *p = get_proc_task(inode);
+	struct task_struct *p = get_proc_task(file_inode(file));
 	struct files_struct *files;
-	unsigned int fd, ino;
-	int retval;
+	unsigned int fd;
 
-	retval = -ENOENT;
 	if (!p)
-		goto out_no_task;
-	retval = 0;
-
-	fd = filp->f_pos;
-	switch (fd) {
-		case 0:
-			if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
-				goto out;
-			filp->f_pos++;
-		case 1:
-			ino = parent_ino(dentry);
-			if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
-				goto out;
-			filp->f_pos++;
-		default:
-			files = get_files_struct(p);
-			if (!files)
-				goto out;
-			rcu_read_lock();
-			for (fd = filp->f_pos - 2;
-			     fd < files_fdtable(files)->max_fds;
-			     fd++, filp->f_pos++) {
-				char name[PROC_NUMBUF];
-				int len;
-				int rv;
-
-				if (!fcheck_files(files, fd))
-					continue;
-				rcu_read_unlock();
+		return -ENOENT;
 
-				len = snprintf(name, sizeof(name), "%d", fd);
-				rv = proc_fill_cache(filp, dirent, filldir,
-						     name, len, instantiate, p,
-						     (void *)(unsigned long)fd);
-				if (rv < 0)
-					goto out_fd_loop;
-				rcu_read_lock();
-			}
-			rcu_read_unlock();
-out_fd_loop:
-			put_files_struct(files);
+	if (!dir_emit_dots(file, ctx))
+		goto out;
+	if (!dir_emit_dots(file, ctx))
+		goto out;
+	files = get_files_struct(p);
+	if (!files)
+		goto out;
+
+	rcu_read_lock();
+	for (fd = ctx->pos - 2;
+	     fd < files_fdtable(files)->max_fds;
+	     fd++, ctx->pos++) {
+		char name[PROC_NUMBUF];
+		int len;
+
+		if (!fcheck_files(files, fd))
+			continue;
+		rcu_read_unlock();
+
+		len = snprintf(name, sizeof(name), "%d", fd);
+		if (!proc_fill_cache(file, ctx,
+				     name, len, instantiate, p,
+				     (void *)(unsigned long)fd))
+			goto out_fd_loop;
+		rcu_read_lock();
 	}
+	rcu_read_unlock();
+out_fd_loop:
+	put_files_struct(files);
 out:
 	put_task_struct(p);
-out_no_task:
-	return retval;
+	return 0;
 }
 
-static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
+static int proc_readfd(struct file *file, struct dir_context *ctx)
 {
-	return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
+	return proc_readfd_common(file, ctx, proc_fd_instantiate);
 }
 
 const struct file_operations proc_fd_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_readfd,
+	.iterate	= proc_readfd,
 	.llseek		= default_llseek,
 };
 
@@ -316,11 +299,10 @@ const struct inode_operations proc_fd_inode_operations = {
 	.setattr	= proc_setattr,
 };
 
-static struct dentry *
+static int
 proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
 			struct task_struct *task, const void *ptr)
 {
-	struct dentry *error = ERR_PTR(-ENOENT);
 	unsigned fd = (unsigned long)ptr;
 	struct proc_inode *ei;
 	struct inode *inode;
@@ -340,9 +322,9 @@ proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
 
 	/* Close the race of the process dying before we return the dentry */
 	if (tid_fd_revalidate(dentry, 0))
-		error = NULL;
+		return 0;
  out:
-	return error;
+	return -ENOENT;
 }
 
 static struct dentry *
@@ -351,9 +333,9 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
 	return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
 }
 
-static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
+static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
 {
-	return proc_readfd_common(filp, dirent, filldir,
+	return proc_readfd_common(file, ctx,
 				  proc_fdinfo_instantiate);
 }
 
@@ -364,6 +346,6 @@ const struct inode_operations proc_fdinfo_inode_operations = {
 
 const struct file_operations proc_fdinfo_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_readfdinfo,
+	.iterate	= proc_readfdinfo,
 	.llseek		= default_llseek,
 };
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
index cbb1d47deda..7c047f256ae 100644
--- a/fs/proc/fd.h
+++ b/fs/proc/fd.h
@@ -11,4 +11,9 @@ extern const struct inode_operations proc_fdinfo_inode_operations;
 
 extern int proc_fd_permission(struct inode *inode, int mask);
 
+static inline int proc_fd(struct inode *inode)
+{
+	return PROC_I(inode)->fd;
+}
+
 #endif /* __PROCFS_FD_H__ */
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 76ddae83daa..94441a40733 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -15,6 +15,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/printk.h>
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/idr.h>
@@ -35,222 +36,6 @@ static int proc_match(unsigned int len, const char *name, struct proc_dir_entry
 	return !memcmp(name, de->name, len);
 }
 
-/* buffer size is one page but our output routines use some slack for overruns */
-#define PROC_BLOCK_SIZE	(PAGE_SIZE - 1024)
-
-static ssize_t
-__proc_file_read(struct file *file, char __user *buf, size_t nbytes,
-	       loff_t *ppos)
-{
-	struct inode * inode = file->f_path.dentry->d_inode;
-	char 	*page;
-	ssize_t	retval=0;
-	int	eof=0;
-	ssize_t	n, count;
-	char	*start;
-	struct proc_dir_entry * dp;
-	unsigned long long pos;
-
-	/*
-	 * Gaah, please just use "seq_file" instead. The legacy /proc
-	 * interfaces cut loff_t down to off_t for reads, and ignore
-	 * the offset entirely for writes..
-	 */
-	pos = *ppos;
-	if (pos > MAX_NON_LFS)
-		return 0;
-	if (nbytes > MAX_NON_LFS - pos)
-		nbytes = MAX_NON_LFS - pos;
-
-	dp = PDE(inode);
-	if (!(page = (char*) __get_free_page(GFP_TEMPORARY)))
-		return -ENOMEM;
-
-	while ((nbytes > 0) && !eof) {
-		count = min_t(size_t, PROC_BLOCK_SIZE, nbytes);
-
-		start = NULL;
-		if (dp->read_proc) {
-			/*
-			 * How to be a proc read function
-			 * ------------------------------
-			 * Prototype:
-			 *    int f(char *buffer, char **start, off_t offset,
-			 *          int count, int *peof, void *dat)
-			 *
-			 * Assume that the buffer is "count" bytes in size.
-			 *
-			 * If you know you have supplied all the data you
-			 * have, set *peof.
-			 *
-			 * You have three ways to return data:
-			 * 0) Leave *start = NULL.  (This is the default.)
-			 *    Put the data of the requested offset at that
-			 *    offset within the buffer.  Return the number (n)
-			 *    of bytes there are from the beginning of the
-			 *    buffer up to the last byte of data.  If the
-			 *    number of supplied bytes (= n - offset) is 
-			 *    greater than zero and you didn't signal eof
-			 *    and the reader is prepared to take more data
-			 *    you will be called again with the requested
-			 *    offset advanced by the number of bytes 
-			 *    absorbed.  This interface is useful for files
-			 *    no larger than the buffer.
-			 * 1) Set *start = an unsigned long value less than
-			 *    the buffer address but greater than zero.
-			 *    Put the data of the requested offset at the
-			 *    beginning of the buffer.  Return the number of
-			 *    bytes of data placed there.  If this number is
-			 *    greater than zero and you didn't signal eof
-			 *    and the reader is prepared to take more data
-			 *    you will be called again with the requested
-			 *    offset advanced by *start.  This interface is
-			 *    useful when you have a large file consisting
-			 *    of a series of blocks which you want to count
-			 *    and return as wholes.
-			 *    (Hack by Paul.Russell@rustcorp.com.au)
-			 * 2) Set *start = an address within the buffer.
-			 *    Put the data of the requested offset at *start.
-			 *    Return the number of bytes of data placed there.
-			 *    If this number is greater than zero and you
-			 *    didn't signal eof and the reader is prepared to
-			 *    take more data you will be called again with the
-			 *    requested offset advanced by the number of bytes
-			 *    absorbed.
-			 */
-			n = dp->read_proc(page, &start, *ppos,
-					  count, &eof, dp->data);
-		} else
-			break;
-
-		if (n == 0)   /* end of file */
-			break;
-		if (n < 0) {  /* error */
-			if (retval == 0)
-				retval = n;
-			break;
-		}
-
-		if (start == NULL) {
-			if (n > PAGE_SIZE) {
-				printk(KERN_ERR
-				       "proc_file_read: Apparent buffer overflow!\n");
-				n = PAGE_SIZE;
-			}
-			n -= *ppos;
-			if (n <= 0)
-				break;
-			if (n > count)
-				n = count;
-			start = page + *ppos;
-		} else if (start < page) {
-			if (n > PAGE_SIZE) {
-				printk(KERN_ERR
-				       "proc_file_read: Apparent buffer overflow!\n");
-				n = PAGE_SIZE;
-			}
-			if (n > count) {
-				/*
-				 * Don't reduce n because doing so might
-				 * cut off part of a data block.
-				 */
-				printk(KERN_WARNING
-				       "proc_file_read: Read count exceeded\n");
-			}
-		} else /* start >= page */ {
-			unsigned long startoff = (unsigned long)(start - page);
-			if (n > (PAGE_SIZE - startoff)) {
-				printk(KERN_ERR
-				       "proc_file_read: Apparent buffer overflow!\n");
-				n = PAGE_SIZE - startoff;
-			}
-			if (n > count)
-				n = count;
-		}
-		
- 		n -= copy_to_user(buf, start < page ? page : start, n);
-		if (n == 0) {
-			if (retval == 0)
-				retval = -EFAULT;
-			break;
-		}
-
-		*ppos += start < page ? (unsigned long)start : n;
-		nbytes -= n;
-		buf += n;
-		retval += n;
-	}
-	free_page((unsigned long) page);
-	return retval;
-}
-
-static ssize_t
-proc_file_read(struct file *file, char __user *buf, size_t nbytes,
-	       loff_t *ppos)
-{
-	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
-	ssize_t rv = -EIO;
-
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
-		return rv;
-	}
-	pde->pde_users++;
-	spin_unlock(&pde->pde_unload_lock);
-
-	rv = __proc_file_read(file, buf, nbytes, ppos);
-
-	pde_users_dec(pde);
-	return rv;
-}
-
-static ssize_t
-proc_file_write(struct file *file, const char __user *buffer,
-		size_t count, loff_t *ppos)
-{
-	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
-	ssize_t rv = -EIO;
-
-	if (pde->write_proc) {
-		spin_lock(&pde->pde_unload_lock);
-		if (!pde->proc_fops) {
-			spin_unlock(&pde->pde_unload_lock);
-			return rv;
-		}
-		pde->pde_users++;
-		spin_unlock(&pde->pde_unload_lock);
-
-		/* FIXME: does this routine need ppos?  probably... */
-		rv = pde->write_proc(file, buffer, count, pde->data);
-		pde_users_dec(pde);
-	}
-	return rv;
-}
-
-
-static loff_t
-proc_file_lseek(struct file *file, loff_t offset, int orig)
-{
-	loff_t retval = -EINVAL;
-	switch (orig) {
-	case 1:
-		offset += file->f_pos;
-	/* fallthrough */
-	case 0:
-		if (offset < 0 || offset > MAX_NON_LFS)
-			break;
-		file->f_pos = retval = offset;
-	}
-	return retval;
-}
-
-static const struct file_operations proc_file_operations = {
-	.llseek		= proc_file_lseek,
-	.read		= proc_file_read,
-	.write		= proc_file_write,
-};
-
 static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -380,7 +165,7 @@ void proc_free_inum(unsigned int inum)
 
 static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-	nd_set_link(nd, PDE(dentry->d_inode)->data);
+	nd_set_link(nd, __PDE_DATA(dentry->d_inode));
 	return NULL;
 }
 
@@ -412,8 +197,7 @@ static const struct dentry_operations proc_dentry_operations =
 struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 		struct dentry *dentry)
 {
-	struct inode *inode = NULL;
-	int error = -ENOENT;
+	struct inode *inode;
 
 	spin_lock(&proc_subdir_lock);
 	for (de = de->subdir; de ; de = de->next) {
@@ -422,22 +206,16 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 		if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
 			pde_get(de);
 			spin_unlock(&proc_subdir_lock);
-			error = -ENOMEM;
 			inode = proc_get_inode(dir->i_sb, de);
-			goto out_unlock;
+			if (!inode)
+				return ERR_PTR(-ENOMEM);
+			d_set_d_op(dentry, &proc_dentry_operations);
+			d_add(dentry, inode);
+			return NULL;
 		}
 	}
 	spin_unlock(&proc_subdir_lock);
-out_unlock:
-
-	if (inode) {
-		d_set_d_op(dentry, &proc_dentry_operations);
-		d_add(dentry, inode);
-		return NULL;
-	}
-	if (de)
-		pde_put(de);
-	return ERR_PTR(error);
+	return ERR_PTR(-ENOENT);
 }
 
 struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
@@ -455,76 +233,52 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
  * value of the readdir() call, as long as it's non-negative
  * for success..
  */
-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
-		filldir_t filldir)
+int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
+		    struct dir_context *ctx)
 {
-	unsigned int ino;
 	int i;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	int ret = 0;
-
-	ino = inode->i_ino;
-	i = filp->f_pos;
-	switch (i) {
-		case 0:
-			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-				goto out;
-			i++;
-			filp->f_pos++;
-			/* fall through */
-		case 1:
-			if (filldir(dirent, "..", 2, i,
-				    parent_ino(filp->f_path.dentry),
-				    DT_DIR) < 0)
-				goto out;
-			i++;
-			filp->f_pos++;
-			/* fall through */
-		default:
-			spin_lock(&proc_subdir_lock);
-			de = de->subdir;
-			i -= 2;
-			for (;;) {
-				if (!de) {
-					ret = 1;
-					spin_unlock(&proc_subdir_lock);
-					goto out;
-				}
-				if (!i)
-					break;
-				de = de->next;
-				i--;
-			}
 
-			do {
-				struct proc_dir_entry *next;
-
-				/* filldir passes info to user space */
-				pde_get(de);
-				spin_unlock(&proc_subdir_lock);
-				if (filldir(dirent, de->name, de->namelen, filp->f_pos,
-					    de->low_ino, de->mode >> 12) < 0) {
-					pde_put(de);
-					goto out;
-				}
-				spin_lock(&proc_subdir_lock);
-				filp->f_pos++;
-				next = de->next;
-				pde_put(de);
-				de = next;
-			} while (de);
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	spin_lock(&proc_subdir_lock);
+	de = de->subdir;
+	i = ctx->pos - 2;
+	for (;;) {
+		if (!de) {
 			spin_unlock(&proc_subdir_lock);
+			return 0;
+		}
+		if (!i)
+			break;
+		de = de->next;
+		i--;
 	}
-	ret = 1;
-out:
-	return ret;	
+
+	do {
+		struct proc_dir_entry *next;
+		pde_get(de);
+		spin_unlock(&proc_subdir_lock);
+		if (!dir_emit(ctx, de->name, de->namelen,
+			    de->low_ino, de->mode >> 12)) {
+			pde_put(de);
+			return 0;
+		}
+		spin_lock(&proc_subdir_lock);
+		ctx->pos++;
+		next = de->next;
+		pde_put(de);
+		de = next;
+	} while (de);
+	spin_unlock(&proc_subdir_lock);
+	return 0;
 }
 
-int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
+int proc_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 
-	return proc_readdir_de(PDE(inode), filp, dirent, filldir);
+	return proc_readdir_de(PDE(inode), file, ctx);
 }
 
 /*
@@ -535,7 +289,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
 static const struct file_operations proc_dir_operations = {
 	.llseek			= generic_file_llseek,
 	.read			= generic_read_dir,
-	.readdir		= proc_readdir,
+	.iterate		= proc_readdir,
 };
 
 /*
@@ -557,26 +311,24 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 		return ret;
 
 	if (S_ISDIR(dp->mode)) {
-		if (dp->proc_iops == NULL) {
-			dp->proc_fops = &proc_dir_operations;
-			dp->proc_iops = &proc_dir_inode_operations;
-		}
+		dp->proc_fops = &proc_dir_operations;
+		dp->proc_iops = &proc_dir_inode_operations;
 		dir->nlink++;
 	} else if (S_ISLNK(dp->mode)) {
-		if (dp->proc_iops == NULL)
-			dp->proc_iops = &proc_link_inode_operations;
+		dp->proc_iops = &proc_link_inode_operations;
 	} else if (S_ISREG(dp->mode)) {
-		if (dp->proc_fops == NULL)
-			dp->proc_fops = &proc_file_operations;
-		if (dp->proc_iops == NULL)
-			dp->proc_iops = &proc_file_inode_operations;
+		BUG_ON(dp->proc_fops == NULL);
+		dp->proc_iops = &proc_file_inode_operations;
+	} else {
+		WARN_ON(1);
+		return -EINVAL;
 	}
 
 	spin_lock(&proc_subdir_lock);
 
 	for (tmp = dir->subdir; tmp; tmp = tmp->next)
 		if (strcmp(tmp->name, dp->name) == 0) {
-			WARN(1, KERN_WARNING "proc_dir_entry '%s/%s' already registered\n",
+			WARN(1, "proc_dir_entry '%s/%s' already registered\n",
 				dir->name, dp->name);
 			break;
 		}
@@ -652,13 +404,17 @@ struct proc_dir_entry *proc_symlink(const char *name,
 }
 EXPORT_SYMBOL(proc_symlink);
 
-struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
-		struct proc_dir_entry *parent)
+struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
+		struct proc_dir_entry *parent, void *data)
 {
 	struct proc_dir_entry *ent;
 
+	if (mode == 0)
+		mode = S_IRUGO | S_IXUGO;
+
 	ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
 	if (ent) {
+		ent->data = data;
 		if (proc_register(parent, ent) < 0) {
 			kfree(ent);
 			ent = NULL;
@@ -666,82 +422,39 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
 	}
 	return ent;
 }
-EXPORT_SYMBOL(proc_mkdir_mode);
+EXPORT_SYMBOL_GPL(proc_mkdir_data);
 
-struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
-		struct proc_dir_entry *parent)
+struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode,
+				       struct proc_dir_entry *parent)
 {
-	struct proc_dir_entry *ent;
-
-	ent = __proc_create(&parent, name, S_IFDIR | S_IRUGO | S_IXUGO, 2);
-	if (ent) {
-		ent->data = net;
-		if (proc_register(parent, ent) < 0) {
-			kfree(ent);
-			ent = NULL;
-		}
-	}
-	return ent;
+	return proc_mkdir_data(name, mode, parent, NULL);
 }
-EXPORT_SYMBOL_GPL(proc_net_mkdir);
+EXPORT_SYMBOL(proc_mkdir_mode);
 
 struct proc_dir_entry *proc_mkdir(const char *name,
 		struct proc_dir_entry *parent)
 {
-	return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
+	return proc_mkdir_data(name, 0, parent, NULL);
 }
 EXPORT_SYMBOL(proc_mkdir);
 
-struct proc_dir_entry *create_proc_entry(const char *name, umode_t mode,
-					 struct proc_dir_entry *parent)
-{
-	struct proc_dir_entry *ent;
-	nlink_t nlink;
-
-	if (S_ISDIR(mode)) {
-		if ((mode & S_IALLUGO) == 0)
-			mode |= S_IRUGO | S_IXUGO;
-		nlink = 2;
-	} else {
-		if ((mode & S_IFMT) == 0)
-			mode |= S_IFREG;
-		if ((mode & S_IALLUGO) == 0)
-			mode |= S_IRUGO;
-		nlink = 1;
-	}
-
-	ent = __proc_create(&parent, name, mode, nlink);
-	if (ent) {
-		if (proc_register(parent, ent) < 0) {
-			kfree(ent);
-			ent = NULL;
-		}
-	}
-	return ent;
-}
-EXPORT_SYMBOL(create_proc_entry);
-
 struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
 					struct proc_dir_entry *parent,
 					const struct file_operations *proc_fops,
 					void *data)
 {
 	struct proc_dir_entry *pde;
-	nlink_t nlink;
+	if ((mode & S_IFMT) == 0)
+		mode |= S_IFREG;
 
-	if (S_ISDIR(mode)) {
-		if ((mode & S_IALLUGO) == 0)
-			mode |= S_IRUGO | S_IXUGO;
-		nlink = 2;
-	} else {
-		if ((mode & S_IFMT) == 0)
-			mode |= S_IFREG;
-		if ((mode & S_IALLUGO) == 0)
-			mode |= S_IRUGO;
-		nlink = 1;
+	if (!S_ISREG(mode)) {
+		WARN_ON(1);	/* use proc_mkdir() */
+		return NULL;
 	}
 
-	pde = __proc_create(&parent, name, mode, nlink);
+	if ((mode & S_IALLUGO) == 0)
+		mode |= S_IRUGO;
+	pde = __proc_create(&parent, name, mode, 1);
 	if (!pde)
 		goto out;
 	pde->proc_fops = proc_fops;
@@ -755,6 +468,19 @@ out:
 	return NULL;
 }
 EXPORT_SYMBOL(proc_create_data);
+ 
+void proc_set_size(struct proc_dir_entry *de, loff_t size)
+{
+	de->size = size;
+}
+EXPORT_SYMBOL(proc_set_size);
+
+void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid)
+{
+	de->uid = uid;
+	de->gid = gid;
+}
+EXPORT_SYMBOL(proc_set_user);
 
 static void free_proc_entry(struct proc_dir_entry *de)
 {
@@ -802,44 +528,88 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 		return;
 	}
 
-	spin_lock(&de->pde_unload_lock);
-	/*
-	 * Stop accepting new callers into module. If you're
-	 * dynamically allocating ->proc_fops, save a pointer somewhere.
-	 */
-	de->proc_fops = NULL;
-	/* Wait until all existing callers into module are done. */
-	if (de->pde_users > 0) {
-		DECLARE_COMPLETION_ONSTACK(c);
+	proc_entry_rundown(de);
 
-		if (!de->pde_unload_completion)
-			de->pde_unload_completion = &c;
+	if (S_ISDIR(de->mode))
+		parent->nlink--;
+	de->nlink = 0;
+	WARN(de->subdir, "%s: removing non-empty directory "
+			 "'%s/%s', leaking at least '%s'\n", __func__,
+			 de->parent->name, de->name, de->subdir->name);
+	pde_put(de);
+}
+EXPORT_SYMBOL(remove_proc_entry);
 
-		spin_unlock(&de->pde_unload_lock);
+int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
+{
+	struct proc_dir_entry **p;
+	struct proc_dir_entry *root = NULL, *de, *next;
+	const char *fn = name;
+	unsigned int len;
 
-		wait_for_completion(de->pde_unload_completion);
+	spin_lock(&proc_subdir_lock);
+	if (__xlate_proc_name(name, &parent, &fn) != 0) {
+		spin_unlock(&proc_subdir_lock);
+		return -ENOENT;
+	}
+	len = strlen(fn);
 
-		spin_lock(&de->pde_unload_lock);
+	for (p = &parent->subdir; *p; p=&(*p)->next ) {
+		if (proc_match(len, fn, *p)) {
+			root = *p;
+			*p = root->next;
+			root->next = NULL;
+			break;
+		}
+	}
+	if (!root) {
+		spin_unlock(&proc_subdir_lock);
+		return -ENOENT;
 	}
+	de = root;
+	while (1) {
+		next = de->subdir;
+		if (next) {
+			de->subdir = next->next;
+			next->next = NULL;
+			de = next;
+			continue;
+		}
+		spin_unlock(&proc_subdir_lock);
 
-	while (!list_empty(&de->pde_openers)) {
-		struct pde_opener *pdeo;
+		proc_entry_rundown(de);
+		next = de->parent;
+		if (S_ISDIR(de->mode))
+			next->nlink--;
+		de->nlink = 0;
+		if (de == root)
+			break;
+		pde_put(de);
 
-		pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
-		list_del(&pdeo->lh);
-		spin_unlock(&de->pde_unload_lock);
-		pdeo->release(pdeo->inode, pdeo->file);
-		kfree(pdeo);
-		spin_lock(&de->pde_unload_lock);
+		spin_lock(&proc_subdir_lock);
+		de = next;
 	}
-	spin_unlock(&de->pde_unload_lock);
+	pde_put(root);
+	return 0;
+}
+EXPORT_SYMBOL(remove_proc_subtree);
 
-	if (S_ISDIR(de->mode))
-		parent->nlink--;
-	de->nlink = 0;
-	WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory "
-			"'%s/%s', leaking at least '%s'\n", __func__,
-			de->parent->name, de->name, de->subdir->name);
-	pde_put(de);
+void *proc_get_parent_data(const struct inode *inode)
+{
+	struct proc_dir_entry *de = PDE(inode);
+	return de->parent->data;
 }
-EXPORT_SYMBOL(remove_proc_entry);
+EXPORT_SYMBOL_GPL(proc_get_parent_data);
+
+void proc_remove(struct proc_dir_entry *de)
+{
+	if (de)
+		remove_proc_subtree(de->name, de->parent);
+}
+EXPORT_SYMBOL(proc_remove);
+
+void *PDE_DATA(const struct inode *inode)
+{
+	return __PDE_DATA(inode);
+}
+EXPORT_SYMBOL(PDE_DATA);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 439ae688650..073aea60cf8 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -13,6 +13,7 @@
 #include <linux/stat.h>
 #include <linux/completion.h>
 #include <linux/poll.h>
+#include <linux/printk.h>
 #include <linux/file.h>
 #include <linux/limits.h>
 #include <linux/init.h>
@@ -21,6 +22,7 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/mount.h>
+#include <linux/magic.h>
 
 #include <asm/uaccess.h>
 
@@ -49,8 +51,8 @@ static void proc_evict_inode(struct inode *inode)
 		sysctl_head_put(head);
 	}
 	/* Release any associated namespace */
-	ns_ops = PROC_I(inode)->ns_ops;
-	ns = PROC_I(inode)->ns;
+	ns_ops = PROC_I(inode)->ns.ns_ops;
+	ns = PROC_I(inode)->ns.ns;
 	if (ns_ops && ns)
 		ns_ops->put(ns);
 }
@@ -71,8 +73,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
 	ei->pde = NULL;
 	ei->sysctl = NULL;
 	ei->sysctl_entry = NULL;
-	ei->ns = NULL;
-	ei->ns_ops = NULL;
+	ei->ns.ns = NULL;
+	ei->ns.ns_ops = NULL;
 	inode = &ei->vfs_inode;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	return inode;
@@ -128,186 +130,158 @@ static const struct super_operations proc_sops = {
 	.show_options	= proc_show_options,
 };
 
-static void __pde_users_dec(struct proc_dir_entry *pde)
+enum {BIAS = -1U<<31};
+
+static inline int use_pde(struct proc_dir_entry *pde)
 {
-	pde->pde_users--;
-	if (pde->pde_unload_completion && pde->pde_users == 0)
-		complete(pde->pde_unload_completion);
+	return atomic_inc_unless_negative(&pde->in_use);
 }
 
-void pde_users_dec(struct proc_dir_entry *pde)
+static void unuse_pde(struct proc_dir_entry *pde)
 {
-	spin_lock(&pde->pde_unload_lock);
-	__pde_users_dec(pde);
-	spin_unlock(&pde->pde_unload_lock);
+	if (atomic_dec_return(&pde->in_use) == BIAS)
+		complete(pde->pde_unload_completion);
 }
 
-static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
+/* pde is locked */
+static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
 {
-	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
-	loff_t rv = -EINVAL;
-	loff_t (*llseek)(struct file *, loff_t, int);
-
-	spin_lock(&pde->pde_unload_lock);
-	/*
-	 * remove_proc_entry() is going to delete PDE (as part of module
-	 * cleanup sequence). No new callers into module allowed.
-	 */
-	if (!pde->proc_fops) {
+	if (pdeo->closing) {
+		/* somebody else is doing that, just wait */
+		DECLARE_COMPLETION_ONSTACK(c);
+		pdeo->c = &c;
+		spin_unlock(&pde->pde_unload_lock);
+		wait_for_completion(&c);
+		spin_lock(&pde->pde_unload_lock);
+	} else {
+		struct file *file;
+		pdeo->closing = 1;
 		spin_unlock(&pde->pde_unload_lock);
-		return rv;
+		file = pdeo->file;
+		pde->proc_fops->release(file_inode(file), file);
+		spin_lock(&pde->pde_unload_lock);
+		list_del_init(&pdeo->lh);
+		if (pdeo->c)
+			complete(pdeo->c);
+		kfree(pdeo);
 	}
-	/*
-	 * Bump refcount so that remove_proc_entry will wail for ->llseek to
-	 * complete.
-	 */
-	pde->pde_users++;
-	/*
-	 * Save function pointer under lock, to protect against ->proc_fops
-	 * NULL'ifying right after ->pde_unload_lock is dropped.
-	 */
-	llseek = pde->proc_fops->llseek;
-	spin_unlock(&pde->pde_unload_lock);
+}
 
-	if (!llseek)
-		llseek = default_llseek;
-	rv = llseek(file, offset, whence);
+void proc_entry_rundown(struct proc_dir_entry *de)
+{
+	DECLARE_COMPLETION_ONSTACK(c);
+	/* Wait until all existing callers into module are done. */
+	de->pde_unload_completion = &c;
+	if (atomic_add_return(BIAS, &de->in_use) != BIAS)
+		wait_for_completion(&c);
+
+	spin_lock(&de->pde_unload_lock);
+	while (!list_empty(&de->pde_openers)) {
+		struct pde_opener *pdeo;
+		pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
+		close_pdeo(de, pdeo);
+	}
+	spin_unlock(&de->pde_unload_lock);
+}
 
-	pde_users_dec(pde);
+static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct proc_dir_entry *pde = PDE(file_inode(file));
+	loff_t rv = -EINVAL;
+	if (use_pde(pde)) {
+		loff_t (*llseek)(struct file *, loff_t, int);
+		llseek = pde->proc_fops->llseek;
+		if (!llseek)
+			llseek = default_llseek;
+		rv = llseek(file, offset, whence);
+		unuse_pde(pde);
+	}
 	return rv;
 }
 
 static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
-	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
-	ssize_t rv = -EIO;
 	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
-
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
-		return rv;
+	struct proc_dir_entry *pde = PDE(file_inode(file));
+	ssize_t rv = -EIO;
+	if (use_pde(pde)) {
+		read = pde->proc_fops->read;
+		if (read)
+			rv = read(file, buf, count, ppos);
+		unuse_pde(pde);
 	}
-	pde->pde_users++;
-	read = pde->proc_fops->read;
-	spin_unlock(&pde->pde_unload_lock);
-
-	if (read)
-		rv = read(file, buf, count, ppos);
-
-	pde_users_dec(pde);
 	return rv;
 }
 
 static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 {
-	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
-	ssize_t rv = -EIO;
 	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
-
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
-		return rv;
+	struct proc_dir_entry *pde = PDE(file_inode(file));
+	ssize_t rv = -EIO;
+	if (use_pde(pde)) {
+		write = pde->proc_fops->write;
+		if (write)
+			rv = write(file, buf, count, ppos);
+		unuse_pde(pde);
 	}
-	pde->pde_users++;
-	write = pde->proc_fops->write;
-	spin_unlock(&pde->pde_unload_lock);
-
-	if (write)
-		rv = write(file, buf, count, ppos);
-
-	pde_users_dec(pde);
 	return rv;
 }
 
 static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts)
 {
-	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	struct proc_dir_entry *pde = PDE(file_inode(file));
 	unsigned int rv = DEFAULT_POLLMASK;
 	unsigned int (*poll)(struct file *, struct poll_table_struct *);
-
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
-		return rv;
+	if (use_pde(pde)) {
+		poll = pde->proc_fops->poll;
+		if (poll)
+			rv = poll(file, pts);
+		unuse_pde(pde);
 	}
-	pde->pde_users++;
-	poll = pde->proc_fops->poll;
-	spin_unlock(&pde->pde_unload_lock);
-
-	if (poll)
-		rv = poll(file, pts);
-
-	pde_users_dec(pde);
 	return rv;
 }
 
 static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	struct proc_dir_entry *pde = PDE(file_inode(file));
 	long rv = -ENOTTY;
 	long (*ioctl)(struct file *, unsigned int, unsigned long);
-
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
-		return rv;
+	if (use_pde(pde)) {
+		ioctl = pde->proc_fops->unlocked_ioctl;
+		if (ioctl)
+			rv = ioctl(file, cmd, arg);
+		unuse_pde(pde);
 	}
-	pde->pde_users++;
-	ioctl = pde->proc_fops->unlocked_ioctl;
-	spin_unlock(&pde->pde_unload_lock);
-
-	if (ioctl)
-		rv = ioctl(file, cmd, arg);
-
-	pde_users_dec(pde);
 	return rv;
 }
 
 #ifdef CONFIG_COMPAT
 static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	struct proc_dir_entry *pde = PDE(file_inode(file));
 	long rv = -ENOTTY;
 	long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
-
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
-		return rv;
+	if (use_pde(pde)) {
+		compat_ioctl = pde->proc_fops->compat_ioctl;
+		if (compat_ioctl)
+			rv = compat_ioctl(file, cmd, arg);
+		unuse_pde(pde);
 	}
-	pde->pde_users++;
-	compat_ioctl = pde->proc_fops->compat_ioctl;
-	spin_unlock(&pde->pde_unload_lock);
-
-	if (compat_ioctl)
-		rv = compat_ioctl(file, cmd, arg);
-
-	pde_users_dec(pde);
 	return rv;
 }
 #endif
 
 static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+	struct proc_dir_entry *pde = PDE(file_inode(file));
 	int rv = -EIO;
 	int (*mmap)(struct file *, struct vm_area_struct *);
-
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
-		return rv;
+	if (use_pde(pde)) {
+		mmap = pde->proc_fops->mmap;
+		if (mmap)
+			rv = mmap(file, vma);
+		unuse_pde(pde);
 	}
-	pde->pde_users++;
-	mmap = pde->proc_fops->mmap;
-	spin_unlock(&pde->pde_unload_lock);
-
-	if (mmap)
-		rv = mmap(file, vma);
-
-	pde_users_dec(pde);
 	return rv;
 }
 
@@ -329,91 +303,47 @@ static int proc_reg_open(struct inode *inode, struct file *file)
 	 * by hand in remove_proc_entry(). For this, save opener's credentials
 	 * for later.
 	 */
-	pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL);
+	pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL);
 	if (!pdeo)
 		return -ENOMEM;
 
-	spin_lock(&pde->pde_unload_lock);
-	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
+	if (!use_pde(pde)) {
 		kfree(pdeo);
 		return -ENOENT;
 	}
-	pde->pde_users++;
 	open = pde->proc_fops->open;
 	release = pde->proc_fops->release;
-	spin_unlock(&pde->pde_unload_lock);
 
 	if (open)
 		rv = open(inode, file);
 
-	spin_lock(&pde->pde_unload_lock);
 	if (rv == 0 && release) {
 		/* To know what to release. */
-		pdeo->inode = inode;
 		pdeo->file = file;
 		/* Strictly for "too late" ->release in proc_reg_release(). */
-		pdeo->release = release;
+		spin_lock(&pde->pde_unload_lock);
 		list_add(&pdeo->lh, &pde->pde_openers);
+		spin_unlock(&pde->pde_unload_lock);
 	} else
 		kfree(pdeo);
-	__pde_users_dec(pde);
-	spin_unlock(&pde->pde_unload_lock);
-	return rv;
-}
-
-static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde,
-					struct inode *inode, struct file *file)
-{
-	struct pde_opener *pdeo;
 
-	list_for_each_entry(pdeo, &pde->pde_openers, lh) {
-		if (pdeo->inode == inode && pdeo->file == file)
-			return pdeo;
-	}
-	return NULL;
+	unuse_pde(pde);
+	return rv;
 }
 
 static int proc_reg_release(struct inode *inode, struct file *file)
 {
 	struct proc_dir_entry *pde = PDE(inode);
-	int rv = 0;
-	int (*release)(struct inode *, struct file *);
 	struct pde_opener *pdeo;
-
 	spin_lock(&pde->pde_unload_lock);
-	pdeo = find_pde_opener(pde, inode, file);
-	if (!pde->proc_fops) {
-		/*
-		 * Can't simply exit, __fput() will think that everything is OK,
-		 * and move on to freeing struct file. remove_proc_entry() will
-		 * find slacker in opener's list and will try to do non-trivial
-		 * things with struct file. Therefore, remove opener from list.
-		 *
-		 * But if opener is removed from list, who will ->release it?
-		 */
-		if (pdeo) {
-			list_del(&pdeo->lh);
-			spin_unlock(&pde->pde_unload_lock);
-			rv = pdeo->release(inode, file);
-			kfree(pdeo);
-		} else
-			spin_unlock(&pde->pde_unload_lock);
-		return rv;
-	}
-	pde->pde_users++;
-	release = pde->proc_fops->release;
-	if (pdeo) {
-		list_del(&pdeo->lh);
-		kfree(pdeo);
+	list_for_each_entry(pdeo, &pde->pde_openers, lh) {
+		if (pdeo->file == file) {
+			close_pdeo(pde, pdeo);
+			break;
+		}
 	}
 	spin_unlock(&pde->pde_unload_lock);
-
-	if (release)
-		rv = release(inode, file);
-
-	pde_users_dec(pde);
-	return rv;
+	return 0;
 }
 
 static const struct file_operations proc_reg_file_ops = {
@@ -445,12 +375,10 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
 
 struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
 {
-	struct inode * inode;
+	struct inode *inode = new_inode_pseudo(sb);
 
-	inode = iget_locked(sb, de->low_ino);
-	if (!inode)
-		return NULL;
-	if (inode->i_state & I_NEW) {
+	if (inode) {
+		inode->i_ino = de->low_ino;
 		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 		PROC_I(inode)->pde = de;
 
@@ -463,8 +391,8 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
 			inode->i_size = de->size;
 		if (de->nlink)
 			set_nlink(inode, de->nlink);
-		if (de->proc_iops)
-			inode->i_op = de->proc_iops;
+		WARN_ON(!de->proc_iops);
+		inode->i_op = de->proc_iops;
 		if (de->proc_fops) {
 			if (S_ISREG(inode->i_mode)) {
 #ifdef CONFIG_COMPAT
@@ -478,14 +406,15 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
 				inode->i_fop = de->proc_fops;
 			}
 		}
-		unlock_new_inode(inode);
 	} else
 	       pde_put(de);
 	return inode;
-}			
+}
 
 int proc_fill_super(struct super_block *s)
 {
+	struct inode *root_inode;
+
 	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
@@ -494,11 +423,17 @@ int proc_fill_super(struct super_block *s)
 	s->s_time_gran = 1;
 	
 	pde_get(&proc_root);
-	s->s_root = d_make_root(proc_get_inode(s, &proc_root));
-	if (s->s_root)
-		return 0;
+	root_inode = proc_get_inode(s, &proc_root);
+	if (!root_inode) {
+		pr_err("proc_fill_super: get root inode failed\n");
+		return -ENOMEM;
+	}
+
+	s->s_root = d_make_root(root_inode);
+	if (!s->s_root) {
+		pr_err("proc_fill_super: allocate dentry failed\n");
+		return -ENOMEM;
+	}
 
-	printk("proc_read_super: get root inode failed\n");
-	pde_put(&proc_root);
-	return -ENOMEM;
+	return proc_setup_self(s);
 }
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 252544c0520..651d09a11dd 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -1,4 +1,4 @@
-/* internal.h: internal procfs definitions
+/* Internal procfs definitions
  *
  * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -9,79 +9,83 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/sched.h>
 #include <linux/proc_fs.h>
-struct  ctl_table_header;
-struct  mempolicy;
+#include <linux/proc_ns.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/binfmts.h>
 
-extern struct proc_dir_entry proc_root;
-extern void proc_self_init(void);
-#ifdef CONFIG_PROC_SYSCTL
-extern int proc_sys_init(void);
-extern void sysctl_head_put(struct ctl_table_header *head);
-#else
-static inline void proc_sys_init(void) { }
-static inline void sysctl_head_put(struct ctl_table_header *head) { }
-#endif
-#ifdef CONFIG_NET
-extern int proc_net_init(void);
-#else
-static inline int proc_net_init(void) { return 0; }
-#endif
+struct ctl_table_header;
+struct mempolicy;
 
-struct vmalloc_info {
-	unsigned long	used;
-	unsigned long	largest_chunk;
+/*
+ * This is not completely implemented yet. The idea is to
+ * create an in-memory tree (like the actual /proc filesystem
+ * tree) of these proc_dir_entries, so that we can dynamically
+ * add new files to /proc.
+ *
+ * The "next" pointer creates a linked list of one /proc directory,
+ * while parent/subdir create the directory structure (every
+ * /proc file has a parent, but "subdir" is NULL for all
+ * non-directory entries).
+ */
+struct proc_dir_entry {
+	unsigned int low_ino;
+	umode_t mode;
+	nlink_t nlink;
+	kuid_t uid;
+	kgid_t gid;
+	loff_t size;
+	const struct inode_operations *proc_iops;
+	const struct file_operations *proc_fops;
+	struct proc_dir_entry *next, *parent, *subdir;
+	void *data;
+	atomic_t count;		/* use count */
+	atomic_t in_use;	/* number of callers into module in progress; */
+			/* negative -> it's going away RSN */
+	struct completion *pde_unload_completion;
+	struct list_head pde_openers;	/* who did ->open, but not ->release */
+	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
+	u8 namelen;
+	char name[];
 };
 
-#ifdef CONFIG_MMU
-#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
-extern void get_vmalloc_info(struct vmalloc_info *vmi);
-#else
-
-#define VMALLOC_TOTAL 0UL
-#define get_vmalloc_info(vmi)			\
-do {						\
-	(vmi)->used = 0;			\
-	(vmi)->largest_chunk = 0;		\
-} while(0)
-#endif
-
-extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
-				struct pid *pid, struct task_struct *task);
-extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
-				struct pid *pid, struct task_struct *task);
-extern int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
-				struct pid *pid, struct task_struct *task);
-extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
-				struct pid *pid, struct task_struct *task);
-extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
-
-extern const struct file_operations proc_tid_children_operations;
-extern const struct file_operations proc_pid_maps_operations;
-extern const struct file_operations proc_tid_maps_operations;
-extern const struct file_operations proc_pid_numa_maps_operations;
-extern const struct file_operations proc_tid_numa_maps_operations;
-extern const struct file_operations proc_pid_smaps_operations;
-extern const struct file_operations proc_tid_smaps_operations;
-extern const struct file_operations proc_clear_refs_operations;
-extern const struct file_operations proc_pagemap_operations;
-extern const struct file_operations proc_net_operations;
-extern const struct inode_operations proc_net_inode_operations;
-extern const struct inode_operations proc_pid_link_inode_operations;
+union proc_op {
+	int (*proc_get_link)(struct dentry *, struct path *);
+	int (*proc_read)(struct task_struct *task, char *page);
+	int (*proc_show)(struct seq_file *m,
+		struct pid_namespace *ns, struct pid *pid,
+		struct task_struct *task);
+};
 
-struct proc_maps_private {
+struct proc_inode {
 	struct pid *pid;
-	struct task_struct *task;
-#ifdef CONFIG_MMU
-	struct vm_area_struct *tail_vma;
-#endif
-#ifdef CONFIG_NUMA
-	struct mempolicy *task_mempolicy;
-#endif
+	int fd;
+	union proc_op op;
+	struct proc_dir_entry *pde;
+	struct ctl_table_header *sysctl;
+	struct ctl_table *sysctl_entry;
+	struct proc_ns ns;
+	struct inode vfs_inode;
 };
 
-void proc_init_inodecache(void);
+/*
+ * General functions
+ */
+static inline struct proc_inode *PROC_I(const struct inode *inode)
+{
+	return container_of(inode, struct proc_inode, vfs_inode);
+}
+
+static inline struct proc_dir_entry *PDE(const struct inode *inode)
+{
+	return PROC_I(inode)->pde;
+}
+
+static inline void *__PDE_DATA(const struct inode *inode)
+{
+	return PDE(inode)->data;
+}
 
 static inline struct pid *proc_pid(struct inode *inode)
 {
@@ -93,11 +97,6 @@ static inline struct task_struct *get_proc_task(struct inode *inode)
 	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
 }
 
-static inline int proc_fd(struct inode *inode)
-{
-	return PROC_I(inode)->fd;
-}
-
 static inline int task_dumpable(struct task_struct *task)
 {
 	int dumpable = 0;
@@ -108,20 +107,11 @@ static inline int task_dumpable(struct task_struct *task)
 	if (mm)
 		dumpable = get_dumpable(mm);
 	task_unlock(task);
-	if (dumpable == SUID_DUMPABLE_ENABLED)
+	if (dumpable == SUID_DUMP_USER)
 		return 1;
 	return 0;
 }
 
-static inline int pid_delete_dentry(const struct dentry * dentry)
-{
-	/* Is the task we represent dead?
-	 * If so, then don't put the dentry on the lru list,
-	 * kill it immediately.
-	 */
-	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
-}
-
 static inline unsigned name_to_int(struct dentry *dentry)
 {
 	const char *name = dentry->d_name.name;
@@ -144,63 +134,165 @@ out:
 	return ~0U;
 }
 
-struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
-		struct dentry *dentry);
-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
-		filldir_t filldir);
+/*
+ * Offset of the first process in the /proc root directory..
+ */
+#define FIRST_PROCESS_ENTRY 256
 
-struct pde_opener {
-	struct inode *inode;
-	struct file *file;
-	int (*release)(struct inode *, struct file *);
-	struct list_head lh;
-};
-void pde_users_dec(struct proc_dir_entry *pde);
+/* Worst case buffer size needed for holding an integer. */
+#define PROC_NUMBUF 13
+
+/*
+ * array.c
+ */
+extern const struct file_operations proc_tid_children_operations;
+
+extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
+			 struct pid *, struct task_struct *);
+extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
+			  struct pid *, struct task_struct *);
+extern int proc_pid_status(struct seq_file *, struct pid_namespace *,
+			   struct pid *, struct task_struct *);
+extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
+			  struct pid *, struct task_struct *);
+
+/*
+ * base.c
+ */
+extern const struct dentry_operations pid_dentry_operations;
+extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+extern int proc_setattr(struct dentry *, struct iattr *);
+extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *);
+extern int pid_revalidate(struct dentry *, unsigned int);
+extern int pid_delete_dentry(const struct dentry *);
+extern int proc_pid_readdir(struct file *, struct dir_context *);
+extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
+extern loff_t mem_lseek(struct file *, loff_t, int);
 
+/* Lookups */
+typedef int instantiate_t(struct inode *, struct dentry *,
+				     struct task_struct *, const void *);
+extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
+			   instantiate_t, struct task_struct *, const void *);
+
+/*
+ * generic.c
+ */
 extern spinlock_t proc_subdir_lock;
 
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int);
-int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
-unsigned long task_vsize(struct mm_struct *);
-unsigned long task_statm(struct mm_struct *,
-	unsigned long *, unsigned long *, unsigned long *, unsigned long *);
-void task_mem(struct seq_file *, struct mm_struct *);
+extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
+extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
+				     struct dentry *);
+extern int proc_readdir(struct file *, struct dir_context *);
+extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *);
 
 static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
 {
 	atomic_inc(&pde->count);
 	return pde;
 }
-void pde_put(struct proc_dir_entry *pde);
-
-int proc_fill_super(struct super_block *);
-struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
-int proc_remount(struct super_block *sb, int *flags, char *data);
+extern void pde_put(struct proc_dir_entry *);
 
 /*
- * These are generic /proc routines that use the internal
- * "struct proc_dir_entry" tree to traverse the filesystem.
- *
- * The /proc root directory has extended versions to take care
- * of the /proc/<pid> subdirectories.
+ * inode.c
  */
-int proc_readdir(struct file *, void *, filldir_t);
-struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
+struct pde_opener {
+	struct file *file;
+	struct list_head lh;
+	int closing;
+	struct completion *c;
+};
 
+extern const struct inode_operations proc_pid_link_inode_operations;
 
+extern void proc_init_inodecache(void);
+extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
+extern int proc_fill_super(struct super_block *);
+extern void proc_entry_rundown(struct proc_dir_entry *);
 
-/* Lookups */
-typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
-				struct task_struct *, const void *);
-int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
-	const char *name, int len,
-	instantiate_t instantiate, struct task_struct *task, const void *ptr);
-int pid_revalidate(struct dentry *dentry, unsigned int flags);
-struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task);
-extern const struct dentry_operations pid_dentry_operations;
-int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
-int proc_setattr(struct dentry *dentry, struct iattr *attr);
+/*
+ * proc_devtree.c
+ */
+#ifdef CONFIG_PROC_DEVICETREE
+extern void proc_device_tree_init(void);
+#endif
 
+/*
+ * proc_namespaces.c
+ */
 extern const struct inode_operations proc_ns_dir_inode_operations;
 extern const struct file_operations proc_ns_dir_operations;
 
+/*
+ * proc_net.c
+ */
+extern const struct file_operations proc_net_operations;
+extern const struct inode_operations proc_net_inode_operations;
+
+#ifdef CONFIG_NET
+extern int proc_net_init(void);
+#else
+static inline int proc_net_init(void) { return 0; }
+#endif
+
+/*
+ * proc_self.c
+ */
+extern int proc_setup_self(struct super_block *);
+
+/*
+ * proc_sysctl.c
+ */
+#ifdef CONFIG_PROC_SYSCTL
+extern int proc_sys_init(void);
+extern void sysctl_head_put(struct ctl_table_header *);
+#else
+static inline void proc_sys_init(void) { }
+static inline void sysctl_head_put(struct ctl_table_header *head) { }
+#endif
+
+/*
+ * proc_tty.c
+ */
+#ifdef CONFIG_TTY
+extern void proc_tty_init(void);
+#else
+static inline void proc_tty_init(void) {}
+#endif
+
+/*
+ * root.c
+ */
+extern struct proc_dir_entry proc_root;
+
+extern void proc_self_init(void);
+extern int proc_remount(struct super_block *, int *, char *);
+
+/*
+ * task_[no]mmu.c
+ */
+struct proc_maps_private {
+	struct pid *pid;
+	struct task_struct *task;
+#ifdef CONFIG_MMU
+	struct vm_area_struct *tail_vma;
+#endif
+#ifdef CONFIG_NUMA
+	struct mempolicy *task_mempolicy;
+#endif
+};
+
+extern const struct file_operations proc_pid_maps_operations;
+extern const struct file_operations proc_tid_maps_operations;
+extern const struct file_operations proc_pid_numa_maps_operations;
+extern const struct file_operations proc_tid_numa_maps_operations;
+extern const struct file_operations proc_pid_smaps_operations;
+extern const struct file_operations proc_tid_smaps_operations;
+extern const struct file_operations proc_clear_refs_operations;
+extern const struct file_operations proc_pagemap_operations;
+
+extern unsigned long task_vsize(struct mm_struct *);
+extern unsigned long task_statm(struct mm_struct *,
+				unsigned long *, unsigned long *,
+				unsigned long *, unsigned long *);
+extern void task_mem(struct seq_file *, struct mm_struct *);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e96d4f18ca3..06ea155e1a5 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -11,12 +11,15 @@
 
 #include <linux/mm.h>
 #include <linux/proc_fs.h>
+#include <linux/kcore.h>
 #include <linux/user.h>
 #include <linux/capability.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
+#include <linux/notifier.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
+#include <linux/printk.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -26,6 +29,7 @@
 #include <linux/ioport.h>
 #include <linux/memory.h>
 #include <asm/sections.h>
+#include "internal.h"
 
 #define CORE_STR "CORE"
 
@@ -404,7 +408,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	prpsinfo.pr_zomb	= 0;
 
 	strcpy(prpsinfo.pr_fname, "vmlinux");
-	strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ);
+	strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs));
 
 	nhdr->p_filesz	+= notesize(&notes[1]);
 	bufp = storenote(&notes[1], bufp);
@@ -563,7 +567,6 @@ static const struct file_operations proc_kcore_operations = {
 	.llseek		= default_llseek,
 };
 
-#ifdef CONFIG_MEMORY_HOTPLUG
 /* just remember that we have to update kcore */
 static int __meminit kcore_callback(struct notifier_block *self,
 				    unsigned long action, void *arg)
@@ -577,8 +580,11 @@ static int __meminit kcore_callback(struct notifier_block *self,
 	}
 	return NOTIFY_OK;
 }
-#endif
 
+static struct notifier_block kcore_callback_nb __meminitdata = {
+	.notifier_call = kcore_callback,
+	.priority = 0,
+};
 
 static struct kcore_list kcore_vmalloc;
 
@@ -619,7 +625,7 @@ static int __init proc_kcore_init(void)
 	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
 				      &proc_kcore_operations);
 	if (!proc_root_kcore) {
-		printk(KERN_ERR "couldn't create /proc/kcore\n");
+		pr_err("couldn't create /proc/kcore\n");
 		return 0; /* Always returns 0. */
 	}
 	/* Store text area if it's special */
@@ -630,7 +636,7 @@ static int __init proc_kcore_init(void)
 	add_modules_range();
 	/* Store direct-map area from physical memory map */
 	kcore_update_ram();
-	hotplug_memory_notifier(kcore_callback, 0);
+	register_hotmemory_notifier(&kcore_callback_nb);
 
 	return 0;
 }
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index bd4b5a740ff..bdfabdaefdc 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait;
 
 static int kmsg_open(struct inode * inode, struct file * file)
 {
-	return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
+	return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
 }
 
 static int kmsg_release(struct inode * inode, struct file * file)
 {
-	(void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
+	(void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC);
 	return 0;
 }
 
@@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf,
 			 size_t count, loff_t *ppos)
 {
 	if ((file->f_flags & O_NONBLOCK) &&
-	    !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
+	    !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
 		return -EAGAIN;
-	return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
+	return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC);
 }
 
 static unsigned int kmsg_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &log_wait, wait);
-	if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
+	if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
 		return POLLIN | POLLRDNORM;
 	return 0;
 }
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 80e4645f799..5aa847a603c 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -11,6 +11,7 @@
 #include <linux/swap.h>
 #include <linux/vmstat.h>
 #include <linux/atomic.h>
+#include <linux/vmalloc.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include "internal.h"
@@ -40,7 +41,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		* sysctl_overcommit_ratio / 100) + total_swap_pages;
 
 	cached = global_page_state(NR_FILE_PAGES) -
-			total_swapcache_pages - i.bufferram;
+			total_swapcache_pages() - i.bufferram;
 	if (cached < 0)
 		cached = 0;
 
@@ -109,7 +110,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(i.freeram),
 		K(i.bufferram),
 		K(cached),
-		K(total_swapcache_pages),
+		K(total_swapcache_pages()),
 		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
 		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
 		K(pages[LRU_ACTIVE_ANON]),
@@ -158,7 +159,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		vmi.used >> 10,
 		vmi.largest_chunk >> 10
 #ifdef CONFIG_MEMORY_FAILURE
-		,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
+		,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 		,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
diff --git a/fs/proc/mmu.c b/fs/proc/mmu.c
deleted file mode 100644
index 8ae221dfd01..00000000000
--- a/fs/proc/mmu.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/* mmu.c: mmu memory info files
- *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/spinlock.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <asm/pgtable.h>
-#include "internal.h"
-
-void get_vmalloc_info(struct vmalloc_info *vmi)
-{
-	struct vm_struct *vma;
-	unsigned long free_area_size;
-	unsigned long prev_end;
-
-	vmi->used = 0;
-
-	if (!vmlist) {
-		vmi->largest_chunk = VMALLOC_TOTAL;
-	}
-	else {
-		vmi->largest_chunk = 0;
-
-		prev_end = VMALLOC_START;
-
-		read_lock(&vmlist_lock);
-
-		for (vma = vmlist; vma; vma = vma->next) {
-			unsigned long addr = (unsigned long) vma->addr;
-
-			/*
-			 * Some archs keep another range for modules in vmlist
-			 */
-			if (addr < VMALLOC_START)
-				continue;
-			if (addr >= VMALLOC_END)
-				break;
-
-			vmi->used += vma->size;
-
-			free_area_size = addr - prev_end;
-			if (vmi->largest_chunk < free_area_size)
-				vmi->largest_chunk = free_area_size;
-
-			prev_end = vma->size + addr;
-		}
-
-		if (VMALLOC_END - prev_end > vmi->largest_chunk)
-			vmi->largest_chunk = VMALLOC_END - prev_end;
-
-		read_unlock(&vmlist_lock);
-	}
-}
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index b7a47196c8c..49a7fff2e83 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -51,7 +51,7 @@ static int ns_delete_dentry(const struct dentry *dentry)
 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
 {
 	struct inode *inode = dentry->d_inode;
-	const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
+	const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops;
 
 	return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
 		ns_ops->name, inode->i_ino);
@@ -95,8 +95,8 @@ static struct dentry *proc_ns_get_dentry(struct super_block *sb,
 		inode->i_op = &ns_inode_operations;
 		inode->i_mode = S_IFREG | S_IRUGO;
 		inode->i_fop = &ns_file_operations;
-		ei->ns_ops = ns_ops;
-		ei->ns = ns;
+		ei->ns.ns_ops = ns_ops;
+		ei->ns.ns = ns;
 		unlock_new_inode(inode);
 	} else {
 		ns_ops->put(ns);
@@ -118,7 +118,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
 	struct super_block *sb = inode->i_sb;
 	struct proc_inode *ei = PROC_I(inode);
 	struct task_struct *task;
-	struct dentry *ns_dentry;
+	struct path ns_path;
 	void *error = ERR_PTR(-EACCES);
 
 	task = get_proc_task(inode);
@@ -128,14 +128,14 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
 	if (!ptrace_may_access(task, PTRACE_MODE_READ))
 		goto out_put_task;
 
-	ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops);
-	if (IS_ERR(ns_dentry)) {
-		error = ERR_CAST(ns_dentry);
+	ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops);
+	if (IS_ERR(ns_path.dentry)) {
+		error = ERR_CAST(ns_path.dentry);
 		goto out_put_task;
 	}
 
-	dput(nd->path.dentry);
-	nd->path.dentry = ns_dentry;
+	ns_path.mnt = mntget(nd->path.mnt);
+	nd_jump_link(nd, &ns_path);
 	error = NULL;
 
 out_put_task:
@@ -148,7 +148,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
 {
 	struct inode *inode = dentry->d_inode;
 	struct proc_inode *ei = PROC_I(inode);
-	const struct proc_ns_operations *ns_ops = ei->ns_ops;
+	const struct proc_ns_operations *ns_ops = ei->ns.ns_ops;
 	struct task_struct *task;
 	void *ns;
 	char name[50];
@@ -187,13 +187,12 @@ static const struct inode_operations proc_ns_link_inode_operations = {
 	.setattr	= proc_setattr,
 };
 
-static struct dentry *proc_ns_instantiate(struct inode *dir,
+static int proc_ns_instantiate(struct inode *dir,
 	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
 	const struct proc_ns_operations *ns_ops = ptr;
 	struct inode *inode;
 	struct proc_inode *ei;
-	struct dentry *error = ERR_PTR(-ENOENT);
 
 	inode = proc_pid_make_inode(dir->i_sb, task);
 	if (!inode)
@@ -202,96 +201,58 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
 	ei = PROC_I(inode);
 	inode->i_mode = S_IFLNK|S_IRWXUGO;
 	inode->i_op = &proc_ns_link_inode_operations;
-	ei->ns_ops = ns_ops;
+	ei->ns.ns_ops = ns_ops;
 
 	d_set_d_op(dentry, &pid_dentry_operations);
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
 	if (pid_revalidate(dentry, 0))
-		error = NULL;
+		return 0;
 out:
-	return error;
-}
-
-static int proc_ns_fill_cache(struct file *filp, void *dirent,
-	filldir_t filldir, struct task_struct *task,
-	const struct proc_ns_operations *ops)
-{
-	return proc_fill_cache(filp, dirent, filldir,
-				ops->name, strlen(ops->name),
-				proc_ns_instantiate, task, ops);
+	return -ENOENT;
 }
 
-static int proc_ns_dir_readdir(struct file *filp, void *dirent,
-				filldir_t filldir)
+static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
 {
-	int i;
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = get_proc_task(inode);
+	struct task_struct *task = get_proc_task(file_inode(file));
 	const struct proc_ns_operations **entry, **last;
-	ino_t ino;
-	int ret;
 
-	ret = -ENOENT;
 	if (!task)
-		goto out_no_task;
+		return -ENOENT;
 
-	ret = 0;
-	i = filp->f_pos;
-	switch (i) {
-	case 0:
-		ino = inode->i_ino;
-		if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
-		ino = parent_ino(dentry);
-		if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
-			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall through */
-	default:
-		i -= 2;
-		if (i >= ARRAY_SIZE(ns_entries)) {
-			ret = 1;
-			goto out;
-		}
-		entry = ns_entries + i;
-		last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
-		while (entry <= last) {
-			if (proc_ns_fill_cache(filp, dirent, filldir,
-						task, *entry) < 0)
-				goto out;
-			filp->f_pos++;
-			entry++;
-		}
+	if (!dir_emit_dots(file, ctx))
+		goto out;
+	if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
+		goto out;
+	entry = ns_entries + (ctx->pos - 2);
+	last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
+	while (entry <= last) {
+		const struct proc_ns_operations *ops = *entry;
+		if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
+				     proc_ns_instantiate, task, ops))
+			break;
+		ctx->pos++;
+		entry++;
 	}
-
-	ret = 1;
 out:
 	put_task_struct(task);
-out_no_task:
-	return ret;
+	return 0;
 }
 
 const struct file_operations proc_ns_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_ns_dir_readdir,
+	.iterate	= proc_ns_dir_readdir,
 };
 
 static struct dentry *proc_ns_dir_lookup(struct inode *dir,
 				struct dentry *dentry, unsigned int flags)
 {
-	struct dentry *error;
+	int error;
 	struct task_struct *task = get_proc_task(dir);
 	const struct proc_ns_operations **entry, **last;
 	unsigned int len = dentry->d_name.len;
 
-	error = ERR_PTR(-ENOENT);
+	error = -ENOENT;
 
 	if (!task)
 		goto out_no_task;
@@ -310,7 +271,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
 out:
 	put_task_struct(task);
 out_no_task:
-	return error;
+	return ERR_PTR(error);
 }
 
 const struct inode_operations proc_ns_dir_inode_operations = {
@@ -337,6 +298,11 @@ out_invalid:
 	return ERR_PTR(-EINVAL);
 }
 
+struct proc_ns *get_proc_ns(struct inode *inode)
+{
+	return &PROC_I(inode)->ns;
+}
+
 bool proc_ns_inode(struct inode *inode)
 {
 	return inode->i_fop == &ns_file_operations;
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index b1822dde55c..ccfd99bd1c5 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -45,7 +45,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 	file = region->vm_file;
 
 	if (file) {
-		struct inode *inode = region->vm_file->f_path.dentry->d_inode;
+		struct inode *inode = file_inode(region->vm_file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 	}
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index de20ec480fa..106a8357063 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -8,10 +8,11 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/printk.h>
 #include <linux/stat.h>
 #include <linux/string.h>
 #include <linux/of.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/slab.h>
 #include <asm/prom.h>
 #include <asm/uaccess.h>
@@ -40,7 +41,7 @@ static int property_proc_show(struct seq_file *m, void *v)
 
 static int property_proc_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, property_proc_show, PDE(inode)->data);
+	return single_open(file, property_proc_show, __PDE_DATA(inode));
 }
 
 static const struct file_operations property_proc_fops = {
@@ -110,8 +111,8 @@ void proc_device_tree_update_prop(struct proc_dir_entry *pde,
 		if (ent->data == oldprop)
 			break;
 	if (ent == NULL) {
-		printk(KERN_WARNING "device-tree: property \"%s\" "
-		       " does not exist\n", oldprop->name);
+		pr_warn("device-tree: property \"%s\" does not exist\n",
+			oldprop->name);
 	} else {
 		ent->data = newprop;
 		ent->size = newprop->length;
@@ -153,8 +154,8 @@ static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de,
 realloc:
 	fixed_name = kmalloc(fixup_len, GFP_KERNEL);
 	if (fixed_name == NULL) {
-		printk(KERN_ERR "device-tree: Out of memory trying to fixup "
-				"name \"%s\"\n", name);
+		pr_err("device-tree: Out of memory trying to fixup "
+		       "name \"%s\"\n", name);
 		return name;
 	}
 
@@ -175,8 +176,8 @@ retry:
 		goto retry;
 	}
 
-	printk(KERN_WARNING "device-tree: Duplicate name in %s, "
-			"renamed to \"%s\"\n", np->full_name, fixed_name);
+	pr_warn("device-tree: Duplicate name in %s, renamed to \"%s\"\n",
+		np->full_name, fixed_name);
 
 	return fixed_name;
 }
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index fe72cd073de..4677bb7dc7c 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -26,6 +26,10 @@
 
 #include "internal.h"
 
+static inline struct net *PDE_NET(struct proc_dir_entry *pde)
+{
+	return pde->parent->data;
+}
 
 static struct net *get_proc_net(const struct inode *inode)
 {
@@ -156,16 +160,15 @@ const struct inode_operations proc_net_inode_operations = {
 	.getattr	= proc_tgid_net_getattr,
 };
 
-static int proc_tgid_net_readdir(struct file *filp, void *dirent,
-		filldir_t filldir)
+static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
 {
 	int ret;
 	struct net *net;
 
 	ret = -EINVAL;
-	net = get_proc_task_net(filp->f_path.dentry->d_inode);
+	net = get_proc_task_net(file_inode(file));
 	if (net != NULL) {
-		ret = proc_readdir_de(net->proc_net, filp, dirent, filldir);
+		ret = proc_readdir_de(net->proc_net, file, ctx);
 		put_net(net);
 	}
 	return ret;
@@ -174,23 +177,9 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
 const struct file_operations proc_net_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= proc_tgid_net_readdir,
+	.iterate	= proc_tgid_net_readdir,
 };
 
-
-struct proc_dir_entry *proc_net_fops_create(struct net *net,
-	const char *name, umode_t mode, const struct file_operations *fops)
-{
-	return proc_create(name, mode, net->proc_net, fops);
-}
-EXPORT_SYMBOL_GPL(proc_net_fops_create);
-
-void proc_net_remove(struct net *net, const char *name)
-{
-	remove_proc_entry(name, net->proc_net);
-}
-EXPORT_SYMBOL_GPL(proc_net_remove);
-
 static __net_init int proc_net_ns_init(struct net *net)
 {
 	struct proc_dir_entry *netd, *net_statd;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 1827d88ad58..71290463a1d 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -5,6 +5,7 @@
 #include <linux/sysctl.h>
 #include <linux/poll.h>
 #include <linux/proc_fs.h>
+#include <linux/printk.h>
 #include <linux/security.h>
 #include <linux/sched.h>
 #include <linux/namei.h>
@@ -57,7 +58,7 @@ static void sysctl_print_dir(struct ctl_dir *dir)
 {
 	if (dir->header.parent)
 		sysctl_print_dir(dir->header.parent);
-	printk(KERN_CONT "%s/", dir->header.ctl_table[0].procname);
+	pr_cont("%s/", dir->header.ctl_table[0].procname);
 }
 
 static int namecmp(const char *name1, int len1, const char *name2, int len2)
@@ -134,9 +135,9 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
 		else if (cmp > 0)
 			p = &(*p)->rb_right;
 		else {
-			printk(KERN_ERR "sysctl duplicate entry: ");
+			pr_err("sysctl duplicate entry: ");
 			sysctl_print_dir(head->parent);
-			printk(KERN_CONT "/%s\n", entry->procname);
+			pr_cont("/%s\n", entry->procname);
 			return -EEXIST;
 		}
 	}
@@ -478,7 +479,7 @@ out:
 static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 		size_t count, loff_t *ppos, int write)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct ctl_table_header *head = grab_header(inode);
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
 	ssize_t error;
@@ -542,7 +543,7 @@ static int proc_sys_open(struct inode *inode, struct file *filp)
 
 static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	struct ctl_table_header *head = grab_header(inode);
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
 	unsigned int ret = DEFAULT_POLLMASK;
@@ -572,12 +573,12 @@ out:
 	return ret;
 }
 
-static int proc_sys_fill_cache(struct file *filp, void *dirent,
-				filldir_t filldir,
+static bool proc_sys_fill_cache(struct file *file,
+				struct dir_context *ctx,
 				struct ctl_table_header *head,
 				struct ctl_table *table)
 {
-	struct dentry *child, *dir = filp->f_path.dentry;
+	struct dentry *child, *dir = file->f_path.dentry;
 	struct inode *inode;
 	struct qstr qname;
 	ino_t ino = 0;
@@ -594,38 +595,38 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
 			inode = proc_sys_make_inode(dir->d_sb, head, table);
 			if (!inode) {
 				dput(child);
-				return -ENOMEM;
+				return false;
 			} else {
 				d_set_d_op(child, &proc_sys_dentry_operations);
 				d_add(child, inode);
 			}
 		} else {
-			return -ENOMEM;
+			return false;
 		}
 	}
 	inode = child->d_inode;
 	ino  = inode->i_ino;
 	type = inode->i_mode >> 12;
 	dput(child);
-	return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
+	return dir_emit(ctx, qname.name, qname.len, ino, type);
 }
 
-static int proc_sys_link_fill_cache(struct file *filp, void *dirent,
-				    filldir_t filldir,
+static bool proc_sys_link_fill_cache(struct file *file,
+				    struct dir_context *ctx,
 				    struct ctl_table_header *head,
 				    struct ctl_table *table)
 {
-	int err, ret = 0;
+	bool ret = true;
 	head = sysctl_head_grab(head);
 
 	if (S_ISLNK(table->mode)) {
 		/* It is not an error if we can not follow the link ignore it */
-		err = sysctl_follow_link(&head, &table, current->nsproxy);
+		int err = sysctl_follow_link(&head, &table, current->nsproxy);
 		if (err)
 			goto out;
 	}
 
-	ret = proc_sys_fill_cache(filp, dirent, filldir, head, table);
+	ret = proc_sys_fill_cache(file, ctx, head, table);
 out:
 	sysctl_head_finish(head);
 	return ret;
@@ -633,67 +634,50 @@ out:
 
 static int scan(struct ctl_table_header *head, ctl_table *table,
 		unsigned long *pos, struct file *file,
-		void *dirent, filldir_t filldir)
+		struct dir_context *ctx)
 {
-	int res;
+	bool res;
 
-	if ((*pos)++ < file->f_pos)
-		return 0;
+	if ((*pos)++ < ctx->pos)
+		return true;
 
 	if (unlikely(S_ISLNK(table->mode)))
-		res = proc_sys_link_fill_cache(file, dirent, filldir, head, table);
+		res = proc_sys_link_fill_cache(file, ctx, head, table);
 	else
-		res = proc_sys_fill_cache(file, dirent, filldir, head, table);
+		res = proc_sys_fill_cache(file, ctx, head, table);
 
-	if (res == 0)
-		file->f_pos = *pos;
+	if (res)
+		ctx->pos = *pos;
 
 	return res;
 }
 
-static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
-	struct ctl_table_header *head = grab_header(inode);
+	struct ctl_table_header *head = grab_header(file_inode(file));
 	struct ctl_table_header *h = NULL;
 	struct ctl_table *entry;
 	struct ctl_dir *ctl_dir;
 	unsigned long pos;
-	int ret = -EINVAL;
 
 	if (IS_ERR(head))
 		return PTR_ERR(head);
 
 	ctl_dir = container_of(head, struct ctl_dir, header);
 
-	ret = 0;
-	/* Avoid a switch here: arm builds fail with missing __cmpdi2 */
-	if (filp->f_pos == 0) {
-		if (filldir(dirent, ".", 1, filp->f_pos,
-				inode->i_ino, DT_DIR) < 0)
-			goto out;
-		filp->f_pos++;
-	}
-	if (filp->f_pos == 1) {
-		if (filldir(dirent, "..", 2, filp->f_pos,
-				parent_ino(dentry), DT_DIR) < 0)
-			goto out;
-		filp->f_pos++;
-	}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
 	pos = 2;
 
 	for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
-		ret = scan(h, entry, &pos, filp, dirent, filldir);
-		if (ret) {
+		if (!scan(h, entry, &pos, file, ctx)) {
 			sysctl_head_finish(h);
 			break;
 		}
 	}
-	ret = 1;
-out:
 	sysctl_head_finish(head);
-	return ret;
+	return 0;
 }
 
 static int proc_sys_permission(struct inode *inode, int mask)
@@ -768,7 +752,7 @@ static const struct file_operations proc_sys_file_operations = {
 
 static const struct file_operations proc_sys_dir_file_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_sys_readdir,
+	.iterate	= proc_sys_readdir,
 	.llseek		= generic_file_llseek,
 };
 
@@ -812,15 +796,16 @@ static int sysctl_is_seen(struct ctl_table_header *p)
 	return res;
 }
 
-static int proc_sys_compare(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	struct ctl_table_header *head;
+	struct inode *inode;
+
 	/* Although proc doesn't have negative dentries, rcu-walk means
 	 * that inode here can be NULL */
 	/* AV: can it, indeed? */
+	inode = ACCESS_ONCE(dentry->d_inode);
 	if (!inode)
 		return 1;
 	if (name->len != len)
@@ -927,9 +912,9 @@ found:
 	subdir->header.nreg++;
 failed:
 	if (unlikely(IS_ERR(subdir))) {
-		printk(KERN_ERR "sysctl could not get directory: ");
+		pr_err("sysctl could not get directory: ");
 		sysctl_print_dir(dir);
-		printk(KERN_CONT "/%*.*s %ld\n",
+		pr_cont("/%*.*s %ld\n",
 			namelen, namelen, name, PTR_ERR(subdir));
 	}
 	drop_sysctl_table(&dir->header);
@@ -995,8 +980,8 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
 	vaf.fmt = fmt;
 	vaf.va = &args;
 
-	printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n",
-		path, table->procname, &vaf);
+	pr_err("sysctl table check failed: %s/%s %pV\n",
+	       path, table->procname, &vaf);
 
 	va_end(args);
 	return -EINVAL;
@@ -1510,9 +1495,9 @@ static void put_links(struct ctl_table_header *header)
 			drop_sysctl_table(link_head);
 		}
 		else {
-			printk(KERN_ERR "sysctl link missing during unregister: ");
+			pr_err("sysctl link missing during unregister: ");
 			sysctl_print_dir(parent);
-			printk(KERN_CONT "/%s\n", name);
+			pr_cont("/%s\n", name);
 		}
 	}
 }
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c6e9fac26ba..229e366598d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
+#include <linux/user_namespace.h>
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
 #include <linux/parser.h>
@@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 	} else {
 		ns = task_active_pid_ns(current);
 		options = data;
+
+		if (!current_user_ns()->may_mount_proc)
+			return ERR_PTR(-EPERM);
 	}
 
 	sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns);
@@ -137,6 +141,8 @@ static void proc_kill_sb(struct super_block *sb)
 	struct pid_namespace *ns;
 
 	ns = (struct pid_namespace *)sb->s_fs_info;
+	if (ns->proc_self)
+		dput(ns->proc_self);
 	kill_anon_super(sb);
 	put_pid_ns(ns);
 }
@@ -196,21 +202,14 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr
 	return proc_pid_lookup(dir, dentry, flags);
 }
 
-static int proc_root_readdir(struct file * filp,
-	void * dirent, filldir_t filldir)
+static int proc_root_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned int nr = filp->f_pos;
-	int ret;
-
-	if (nr < FIRST_PROCESS_ENTRY) {
-		int error = proc_readdir(filp, dirent, filldir);
-		if (error <= 0)
-			return error;
-		filp->f_pos = FIRST_PROCESS_ENTRY;
+	if (ctx->pos < FIRST_PROCESS_ENTRY) {
+		proc_readdir(file, ctx);
+		ctx->pos = FIRST_PROCESS_ENTRY;
 	}
 
-	ret = proc_pid_readdir(filp, dirent, filldir);
-	return ret;
+	return proc_pid_readdir(file, ctx);
 }
 
 /*
@@ -220,7 +219,7 @@ static int proc_root_readdir(struct file * filp,
  */
 static const struct file_operations proc_root_operations = {
 	.read		 = generic_read_dir,
-	.readdir	 = proc_root_readdir,
+	.iterate	 = proc_root_readdir,
 	.llseek		= default_llseek,
 };
 
diff --git a/fs/proc/self.c b/fs/proc/self.c
index aa5cc3bff14..6b6a993b5c2 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -1,6 +1,8 @@
-#include <linux/proc_fs.h>
 #include <linux/sched.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
+#include <linux/pid_namespace.h>
+#include "internal.h"
 
 /*
  * /proc/self:
@@ -48,12 +50,43 @@ static const struct inode_operations proc_self_inode_operations = {
 	.put_link	= proc_self_put_link,
 };
 
-void __init proc_self_init(void)
+static unsigned self_inum;
+
+int proc_setup_self(struct super_block *s)
 {
-	struct proc_dir_entry *proc_self_symlink;
-	mode_t mode;
+	struct inode *root_inode = s->s_root->d_inode;
+	struct pid_namespace *ns = s->s_fs_info;
+	struct dentry *self;
+	
+	mutex_lock(&root_inode->i_mutex);
+	self = d_alloc_name(s->s_root, "self");
+	if (self) {
+		struct inode *inode = new_inode_pseudo(s);
+		if (inode) {
+			inode->i_ino = self_inum;
+			inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+			inode->i_mode = S_IFLNK | S_IRWXUGO;
+			inode->i_uid = GLOBAL_ROOT_UID;
+			inode->i_gid = GLOBAL_ROOT_GID;
+			inode->i_op = &proc_self_inode_operations;
+			d_add(self, inode);
+		} else {
+			dput(self);
+			self = ERR_PTR(-ENOMEM);
+		}
+	} else {
+		self = ERR_PTR(-ENOMEM);
+	}
+	mutex_unlock(&root_inode->i_mutex);
+	if (IS_ERR(self)) {
+		pr_err("proc_fill_super: can't allocate /proc/self\n");
+		return PTR_ERR(self);
+	}
+	ns->proc_self = self;
+	return 0;
+}
 
-	mode = S_IFLNK | S_IRWXUGO;
-	proc_self_symlink = proc_create("self", mode, NULL, NULL );
-	proc_self_symlink->proc_iops = &proc_self_inode_operations;
+void __init proc_self_init(void)
+{
+	proc_alloc_inum(&self_inum);
 }
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index e296572c73e..1cf86c0e868 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -184,7 +184,7 @@ static int show_stat(struct seq_file *p, void *v)
 
 static int stat_open(struct inode *inode, struct file *file)
 {
-	unsigned size = 1024 + 128 * num_possible_cpus();
+	size_t size = 1024 + 128 * num_possible_cpus();
 	char *buf;
 	struct seq_file *m;
 	int res;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca5ce7f9f80..dbf61f6174f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -11,6 +11,7 @@
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -271,7 +272,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 	const char *name = NULL;
 
 	if (file) {
-		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+		struct inode *inode = file_inode(vma->vm_file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
@@ -688,10 +689,58 @@ const struct file_operations proc_tid_smaps_operations = {
 	.release	= seq_release_private,
 };
 
+/*
+ * We do not want to have constant page-shift bits sitting in
+ * pagemap entries and are about to reuse them some time soon.
+ *
+ * Here's the "migration strategy":
+ * 1. when the system boots these bits remain what they are,
+ *    but a warning about future change is printed in log;
+ * 2. once anyone clears soft-dirty bits via clear_refs file,
+ *    these flag is set to denote, that user is aware of the
+ *    new API and those page-shift bits change their meaning.
+ *    The respective warning is printed in dmesg;
+ * 3. In a couple of releases we will remove all the mentions
+ *    of page-shift in pagemap entries.
+ */
+
+static bool soft_dirty_cleared __read_mostly;
+
+enum clear_refs_types {
+	CLEAR_REFS_ALL = 1,
+	CLEAR_REFS_ANON,
+	CLEAR_REFS_MAPPED,
+	CLEAR_REFS_SOFT_DIRTY,
+	CLEAR_REFS_LAST,
+};
+
+struct clear_refs_private {
+	struct vm_area_struct *vma;
+	enum clear_refs_types type;
+};
+
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+		unsigned long addr, pte_t *pte)
+{
+#ifdef CONFIG_MEM_SOFT_DIRTY
+	/*
+	 * The soft-dirty tracker uses #PF-s to catch writes
+	 * to pages, so write-protect the pte as well. See the
+	 * Documentation/vm/soft-dirty.txt for full description
+	 * of how soft-dirty works.
+	 */
+	pte_t ptent = *pte;
+	ptent = pte_wrprotect(ptent);
+	ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+	set_pte_at(vma->vm_mm, addr, pte, ptent);
+#endif
+}
+
 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 				unsigned long end, struct mm_walk *walk)
 {
-	struct vm_area_struct *vma = walk->private;
+	struct clear_refs_private *cp = walk->private;
+	struct vm_area_struct *vma = cp->vma;
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
 	struct page *page;
@@ -706,6 +755,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!pte_present(ptent))
 			continue;
 
+		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+			clear_soft_dirty(vma, addr, pte);
+			continue;
+		}
+
 		page = vm_normal_page(vma, addr, ptent);
 		if (!page)
 			continue;
@@ -719,10 +773,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	return 0;
 }
 
-#define CLEAR_REFS_ALL 1
-#define CLEAR_REFS_ANON 2
-#define CLEAR_REFS_MAPPED 3
-
 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
@@ -730,7 +780,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 	char buffer[PROC_NUMBUF];
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
-	int type;
+	enum clear_refs_types type;
+	int itype;
 	int rv;
 
 	memset(buffer, 0, sizeof(buffer));
@@ -738,23 +789,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		count = sizeof(buffer) - 1;
 	if (copy_from_user(buffer, buf, count))
 		return -EFAULT;
-	rv = kstrtoint(strstrip(buffer), 10, &type);
+	rv = kstrtoint(strstrip(buffer), 10, &itype);
 	if (rv < 0)
 		return rv;
-	if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
+	type = (enum clear_refs_types)itype;
+	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
 		return -EINVAL;
-	task = get_proc_task(file->f_path.dentry->d_inode);
+
+	if (type == CLEAR_REFS_SOFT_DIRTY) {
+		soft_dirty_cleared = true;
+		pr_warn_once("The pagemap bits 55-60 has changed their meaning! "
+				"See the linux/Documentation/vm/pagemap.txt for details.\n");
+	}
+
+	task = get_proc_task(file_inode(file));
 	if (!task)
 		return -ESRCH;
 	mm = get_task_mm(task);
 	if (mm) {
+		struct clear_refs_private cp = {
+			.type = type,
+		};
 		struct mm_walk clear_refs_walk = {
 			.pmd_entry = clear_refs_pte_range,
 			.mm = mm,
+			.private = &cp,
 		};
 		down_read(&mm->mmap_sem);
+		if (type == CLEAR_REFS_SOFT_DIRTY)
+			mmu_notifier_invalidate_range_start(mm, 0, -1);
 		for (vma = mm->mmap; vma; vma = vma->vm_next) {
-			clear_refs_walk.private = vma;
+			cp.vma = vma;
 			if (is_vm_hugetlb_page(vma))
 				continue;
 			/*
@@ -773,6 +838,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 			walk_page_range(vma->vm_start, vma->vm_end,
 					&clear_refs_walk);
 		}
+		if (type == CLEAR_REFS_SOFT_DIRTY)
+			mmu_notifier_invalidate_range_end(mm, 0, -1);
 		flush_tlb_mm(mm);
 		up_read(&mm->mmap_sem);
 		mmput(mm);
@@ -794,6 +861,7 @@ typedef struct {
 struct pagemapread {
 	int pos, len;
 	pagemap_entry_t *buffer;
+	bool v2;
 };
 
 #define PAGEMAP_WALK_SIZE	(PMD_SIZE)
@@ -807,14 +875,17 @@ struct pagemapread {
 #define PM_PSHIFT_BITS      6
 #define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
 #define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define PM_PSHIFT(x)        (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define __PM_PSHIFT(x)      (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
 #define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)
 #define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)
+/* in "new" pagemap pshift bits are occupied with more status bits */
+#define PM_STATUS2(v2, x)   (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
 
+#define __PM_SOFT_DIRTY      (1LL)
 #define PM_PRESENT          PM_STATUS(4LL)
 #define PM_SWAP             PM_STATUS(2LL)
 #define PM_FILE             PM_STATUS(1LL)
-#define PM_NOT_PRESENT      PM_PSHIFT(PAGE_SHIFT)
+#define PM_NOT_PRESENT(v2)  PM_STATUS2(v2, 0)
 #define PM_END_OF_BUFFER    1
 
 static inline pagemap_entry_t make_pme(u64 val)
@@ -837,7 +908,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 	struct pagemapread *pm = walk->private;
 	unsigned long addr;
 	int err = 0;
-	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
 
 	for (addr = start; addr < end; addr += PAGE_SIZE) {
 		err = add_to_pagemap(addr, &pme, pm);
@@ -847,11 +918,12 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 	return err;
 }
 
-static void pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {
 	u64 frame, flags;
 	struct page *page = NULL;
+	int flags2 = 0;
 
 	if (pte_present(pte)) {
 		frame = pte_pfn(pte);
@@ -866,19 +938,21 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme,
 		if (is_migration_entry(entry))
 			page = migration_entry_to_page(entry);
 	} else {
-		*pme = make_pme(PM_NOT_PRESENT);
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
 		return;
 	}
 
 	if (page && !PageAnon(page))
 		flags |= PM_FILE;
+	if (pte_soft_dirty(pte))
+		flags2 |= __PM_SOFT_DIRTY;
 
-	*pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags);
+	*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
-					pmd_t pmd, int offset)
+static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+		pmd_t pmd, int offset, int pmd_flags2)
 {
 	/*
 	 * Currently pmd for thp is always present because thp can not be
@@ -887,13 +961,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
 	 */
 	if (pmd_present(pmd))
 		*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
-				| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+				| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
 	else
-		*pme = make_pme(PM_NOT_PRESENT);
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
 }
 #else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
-						pmd_t pmd, int offset)
+static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+		pmd_t pmd, int offset, int pmd_flags2)
 {
 }
 #endif
@@ -905,17 +979,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	struct pagemapread *pm = walk->private;
 	pte_t *pte;
 	int err = 0;
-	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
+	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
 
 	/* find the first VMA at or above 'addr' */
 	vma = find_vma(walk->mm, addr);
 	if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+		int pmd_flags2;
+
+		pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
 		for (; addr != end; addr += PAGE_SIZE) {
 			unsigned long offset;
 
 			offset = (addr & ~PAGEMAP_WALK_MASK) >>
 					PAGE_SHIFT;
-			thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
+			thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
 			err = add_to_pagemap(addr, &pme, pm);
 			if (err)
 				break;
@@ -932,7 +1009,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		 * and need a new, higher one */
 		if (vma && (addr >= vma->vm_end)) {
 			vma = find_vma(walk->mm, addr);
-			pme = make_pme(PM_NOT_PRESENT);
+			pme = make_pme(PM_NOT_PRESENT(pm->v2));
 		}
 
 		/* check that 'vma' actually covers this address,
@@ -940,7 +1017,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		if (vma && (vma->vm_start <= addr) &&
 		    !is_vm_hugetlb_page(vma)) {
 			pte = pte_offset_map(pmd, addr);
-			pte_to_pagemap_entry(&pme, vma, addr, *pte);
+			pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
 			/* unmap before userspace copy */
 			pte_unmap(pte);
 		}
@@ -955,14 +1032,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme,
+static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 					pte_t pte, int offset)
 {
 	if (pte_present(pte))
 		*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
-				| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
+				| PM_STATUS2(pm->v2, 0) | PM_PRESENT);
 	else
-		*pme = make_pme(PM_NOT_PRESENT);
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
 }
 
 /* This function walks within one hugetlb entry in the single call */
@@ -976,7 +1053,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
 
 	for (; addr != end; addr += PAGE_SIZE) {
 		int offset = (addr & ~hmask) >> PAGE_SHIFT;
-		huge_pte_to_pagemap_entry(&pme, *pte, offset);
+		huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
 		err = add_to_pagemap(addr, &pme, pm);
 		if (err)
 			return err;
@@ -1015,7 +1092,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
 static ssize_t pagemap_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
-	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	struct task_struct *task = get_proc_task(file_inode(file));
 	struct mm_struct *mm;
 	struct pagemapread pm;
 	int ret = -ESRCH;
@@ -1038,6 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (!count)
 		goto out_task;
 
+	pm.v2 = soft_dirty_cleared;
 	pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
 	pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
 	ret = -ENOMEM;
@@ -1110,9 +1188,18 @@ out:
 	return ret;
 }
 
+static int pagemap_open(struct inode *inode, struct file *file)
+{
+	pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
+			"to stop being page-shift some time soon. See the "
+			"linux/Documentation/vm/pagemap.txt for details.\n");
+	return 0;
+}
+
 const struct file_operations proc_pagemap_operations = {
 	.llseek		= mem_lseek, /* borrow this */
 	.read		= pagemap_read,
+	.open		= pagemap_open,
 };
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 1ccfa537f5f..56123a6f462 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -149,7 +149,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 	file = vma->vm_file;
 
 	if (file) {
-		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+		struct inode *inode = file_inode(vma->vm_file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 		pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 9610ac772d7..06189462590 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -20,8 +20,7 @@ static int uptime_proc_show(struct seq_file *m, void *v)
 	for_each_possible_cpu(i)
 		idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
 
-	do_posix_clock_monotonic_gettime(&uptime);
-	monotonic_to_bootbased(&uptime);
+	get_monotonic_boottime(&uptime);
 	nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
 	idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
 	idle.tv_nsec = rem;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0d5071d2998..28503172f2e 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -8,19 +8,22 @@
  */
 
 #include <linux/mm.h>
-#include <linux/proc_fs.h>
+#include <linux/kcore.h>
 #include <linux/user.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/printk.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
 #include <linux/crash_dump.h>
 #include <linux/list.h>
+#include <linux/vmalloc.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
+#include "internal.h"
 
 /* List representing chunks of contiguous memory areas and their offsets in
  * vmcore file.
@@ -30,6 +33,10 @@ static LIST_HEAD(vmcore_list);
 /* Stores the pointer to the buffer containing kernel elf core headers. */
 static char *elfcorebuf;
 static size_t elfcorebuf_sz;
+static size_t elfcorebuf_sz_orig;
+
+static char *elfnotes_buf;
+static size_t elfnotes_sz;
 
 /* Total size of vmcore file. */
 static u64 vmcore_size;
@@ -116,27 +123,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
 	return read;
 }
 
-/* Maps vmcore file offset to respective physical address in memroy. */
-static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
-					struct vmcore **m_ptr)
-{
-	struct vmcore *m;
-	u64 paddr;
-
-	list_for_each_entry(m, vc_list, list) {
-		u64 start, end;
-		start = m->offset;
-		end = m->offset + m->size - 1;
-		if (offset >= start && offset <= end) {
-			paddr = m->paddr + offset - start;
-			*m_ptr = m;
-			return paddr;
-		}
-	}
-	*m_ptr = NULL;
-	return 0;
-}
-
 /* Read from the ELF header and then the crash dump. On error, negative value is
  * returned otherwise number of bytes read are returned.
  */
@@ -145,8 +131,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 {
 	ssize_t acc = 0, tmp;
 	size_t tsz;
-	u64 start, nr_bytes;
-	struct vmcore *curr_m = NULL;
+	u64 start;
+	struct vmcore *m = NULL;
 
 	if (buflen == 0 || *fpos >= vmcore_size)
 		return 0;
@@ -157,9 +143,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 
 	/* Read ELF core header */
 	if (*fpos < elfcorebuf_sz) {
-		tsz = elfcorebuf_sz - *fpos;
-		if (buflen < tsz)
-			tsz = buflen;
+		tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
 		if (copy_to_user(buffer, elfcorebuf + *fpos, tsz))
 			return -EFAULT;
 		buflen -= tsz;
@@ -172,45 +156,161 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 			return acc;
 	}
 
-	start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m);
-	if (!curr_m)
-        	return -EINVAL;
-	if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
-		tsz = buflen;
-
-	/* Calculate left bytes in current memory segment. */
-	nr_bytes = (curr_m->size - (start - curr_m->paddr));
-	if (tsz > nr_bytes)
-		tsz = nr_bytes;
-
-	while (buflen) {
-		tmp = read_from_oldmem(buffer, tsz, &start, 1);
-		if (tmp < 0)
-			return tmp;
+	/* Read Elf note segment */
+	if (*fpos < elfcorebuf_sz + elfnotes_sz) {
+		void *kaddr;
+
+		tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
+		kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
+		if (copy_to_user(buffer, kaddr, tsz))
+			return -EFAULT;
 		buflen -= tsz;
 		*fpos += tsz;
 		buffer += tsz;
 		acc += tsz;
-		if (start >= (curr_m->paddr + curr_m->size)) {
-			if (curr_m->list.next == &vmcore_list)
-				return acc;	/*EOF*/
-			curr_m = list_entry(curr_m->list.next,
-						struct vmcore, list);
-			start = curr_m->paddr;
+
+		/* leave now if filled buffer already */
+		if (buflen == 0)
+			return acc;
+	}
+
+	list_for_each_entry(m, &vmcore_list, list) {
+		if (*fpos < m->offset + m->size) {
+			tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
+			start = m->paddr + *fpos - m->offset;
+			tmp = read_from_oldmem(buffer, tsz, &start, 1);
+			if (tmp < 0)
+				return tmp;
+			buflen -= tsz;
+			*fpos += tsz;
+			buffer += tsz;
+			acc += tsz;
+
+			/* leave now if filled buffer already */
+			if (buflen == 0)
+				return acc;
 		}
-		if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
-			tsz = buflen;
-		/* Calculate left bytes in current memory segment. */
-		nr_bytes = (curr_m->size - (start - curr_m->paddr));
-		if (tsz > nr_bytes)
-			tsz = nr_bytes;
 	}
+
 	return acc;
 }
 
+/**
+ * alloc_elfnotes_buf - allocate buffer for ELF note segment in
+ *                      vmalloc memory
+ *
+ * @notes_sz: size of buffer
+ *
+ * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
+ * the buffer to user-space by means of remap_vmalloc_range().
+ *
+ * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
+ * disabled and there's no need to allow users to mmap the buffer.
+ */
+static inline char *alloc_elfnotes_buf(size_t notes_sz)
+{
+#ifdef CONFIG_MMU
+	return vmalloc_user(notes_sz);
+#else
+	return vzalloc(notes_sz);
+#endif
+}
+
+/*
+ * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
+ * essential for mmap_vmcore() in order to map physically
+ * non-contiguous objects (ELF header, ELF note segment and memory
+ * regions in the 1st kernel pointed to by PT_LOAD entries) into
+ * virtually contiguous user-space in ELF layout.
+ */
+#ifdef CONFIG_MMU
+static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
+{
+	size_t size = vma->vm_end - vma->vm_start;
+	u64 start, end, len, tsz;
+	struct vmcore *m;
+
+	start = (u64)vma->vm_pgoff << PAGE_SHIFT;
+	end = start + size;
+
+	if (size > vmcore_size || end > vmcore_size)
+		return -EINVAL;
+
+	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
+		return -EPERM;
+
+	vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+	vma->vm_flags |= VM_MIXEDMAP;
+
+	len = 0;
+
+	if (start < elfcorebuf_sz) {
+		u64 pfn;
+
+		tsz = min(elfcorebuf_sz - (size_t)start, size);
+		pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT;
+		if (remap_pfn_range(vma, vma->vm_start, pfn, tsz,
+				    vma->vm_page_prot))
+			return -EAGAIN;
+		size -= tsz;
+		start += tsz;
+		len += tsz;
+
+		if (size == 0)
+			return 0;
+	}
+
+	if (start < elfcorebuf_sz + elfnotes_sz) {
+		void *kaddr;
+
+		tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
+		kaddr = elfnotes_buf + start - elfcorebuf_sz;
+		if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
+						kaddr, tsz))
+			goto fail;
+		size -= tsz;
+		start += tsz;
+		len += tsz;
+
+		if (size == 0)
+			return 0;
+	}
+
+	list_for_each_entry(m, &vmcore_list, list) {
+		if (start < m->offset + m->size) {
+			u64 paddr = 0;
+
+			tsz = min_t(size_t, m->offset + m->size - start, size);
+			paddr = m->paddr + start - m->offset;
+			if (remap_pfn_range(vma, vma->vm_start + len,
+					    paddr >> PAGE_SHIFT, tsz,
+					    vma->vm_page_prot))
+				goto fail;
+			size -= tsz;
+			start += tsz;
+			len += tsz;
+
+			if (size == 0)
+				return 0;
+		}
+	}
+
+	return 0;
+fail:
+	do_munmap(vma->vm_mm, vma->vm_start, len);
+	return -EAGAIN;
+}
+#else
+static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
+{
+	return -ENOSYS;
+}
+#endif
+
 static const struct file_operations proc_vmcore_operations = {
 	.read		= read_vmcore,
 	.llseek		= default_llseek,
+	.mmap		= mmap_vmcore,
 };
 
 static struct vmcore* __init get_new_element(void)
@@ -218,61 +318,40 @@ static struct vmcore* __init get_new_element(void)
 	return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
 }
 
-static u64 __init get_vmcore_size_elf64(char *elfptr)
+static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
+				  struct list_head *vc_list)
 {
-	int i;
 	u64 size;
-	Elf64_Ehdr *ehdr_ptr;
-	Elf64_Phdr *phdr_ptr;
-
-	ehdr_ptr = (Elf64_Ehdr *)elfptr;
-	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
-	size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr));
-	for (i = 0; i < ehdr_ptr->e_phnum; i++) {
-		size += phdr_ptr->p_memsz;
-		phdr_ptr++;
-	}
-	return size;
-}
-
-static u64 __init get_vmcore_size_elf32(char *elfptr)
-{
-	int i;
-	u64 size;
-	Elf32_Ehdr *ehdr_ptr;
-	Elf32_Phdr *phdr_ptr;
+	struct vmcore *m;
 
-	ehdr_ptr = (Elf32_Ehdr *)elfptr;
-	phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
-	size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr));
-	for (i = 0; i < ehdr_ptr->e_phnum; i++) {
-		size += phdr_ptr->p_memsz;
-		phdr_ptr++;
+	size = elfsz + elfnotesegsz;
+	list_for_each_entry(m, vc_list, list) {
+		size += m->size;
 	}
 	return size;
 }
 
-/* Merges all the PT_NOTE headers into one. */
-static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
-						struct list_head *vc_list)
+/**
+ * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry
+ *
+ * @ehdr_ptr: ELF header
+ *
+ * This function updates p_memsz member of each PT_NOTE entry in the
+ * program header table pointed to by @ehdr_ptr to real size of ELF
+ * note segment.
+ */
+static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
 {
-	int i, nr_ptnote=0, rc=0;
-	char *tmp;
-	Elf64_Ehdr *ehdr_ptr;
-	Elf64_Phdr phdr, *phdr_ptr;
+	int i, rc=0;
+	Elf64_Phdr *phdr_ptr;
 	Elf64_Nhdr *nhdr_ptr;
-	u64 phdr_sz = 0, note_off;
 
-	ehdr_ptr = (Elf64_Ehdr *)elfptr;
-	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
+	phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
 	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
-		int j;
 		void *notes_section;
-		struct vmcore *new;
 		u64 offset, max_sz, sz, real_sz = 0;
 		if (phdr_ptr->p_type != PT_NOTE)
 			continue;
-		nr_ptnote++;
 		max_sz = phdr_ptr->p_memsz;
 		offset = phdr_ptr->p_offset;
 		notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -284,7 +363,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 			return rc;
 		}
 		nhdr_ptr = notes_section;
-		for (j = 0; j < max_sz; j += sz) {
+		while (real_sz < max_sz) {
 			if (nhdr_ptr->n_namesz == 0)
 				break;
 			sz = sizeof(Elf64_Nhdr) +
@@ -293,26 +372,122 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 			real_sz += sz;
 			nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
 		}
-
-		/* Add this contiguous chunk of notes section to vmcore list.*/
-		new = get_new_element();
-		if (!new) {
-			kfree(notes_section);
-			return -ENOMEM;
-		}
-		new->paddr = phdr_ptr->p_offset;
-		new->size = real_sz;
-		list_add_tail(&new->list, vc_list);
-		phdr_sz += real_sz;
 		kfree(notes_section);
+		phdr_ptr->p_memsz = real_sz;
+	}
+
+	return 0;
+}
+
+/**
+ * get_note_number_and_size_elf64 - get the number of PT_NOTE program
+ * headers and sum of real size of their ELF note segment headers and
+ * data.
+ *
+ * @ehdr_ptr: ELF header
+ * @nr_ptnote: buffer for the number of PT_NOTE program headers
+ * @sz_ptnote: buffer for size of unique PT_NOTE program header
+ *
+ * This function is used to merge multiple PT_NOTE program headers
+ * into a unique single one. The resulting unique entry will have
+ * @sz_ptnote in its phdr->p_mem.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf64
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr,
+						 int *nr_ptnote, u64 *sz_ptnote)
+{
+	int i;
+	Elf64_Phdr *phdr_ptr;
+
+	*nr_ptnote = *sz_ptnote = 0;
+
+	phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		*nr_ptnote += 1;
+		*sz_ptnote += phdr_ptr->p_memsz;
+	}
+
+	return 0;
+}
+
+/**
+ * copy_notes_elf64 - copy ELF note segments in a given buffer
+ *
+ * @ehdr_ptr: ELF header
+ * @notes_buf: buffer into which ELF note segments are copied
+ *
+ * This function is used to copy ELF note segment in the 1st kernel
+ * into the buffer @notes_buf in the 2nd kernel. It is assumed that
+ * size of the buffer @notes_buf is equal to or larger than sum of the
+ * real ELF note segment headers and data.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf64
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf)
+{
+	int i, rc=0;
+	Elf64_Phdr *phdr_ptr;
+
+	phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1);
+
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		u64 offset;
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		offset = phdr_ptr->p_offset;
+		rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
+		if (rc < 0)
+			return rc;
+		notes_buf += phdr_ptr->p_memsz;
 	}
 
+	return 0;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
+					   char **notes_buf, size_t *notes_sz)
+{
+	int i, nr_ptnote=0, rc=0;
+	char *tmp;
+	Elf64_Ehdr *ehdr_ptr;
+	Elf64_Phdr phdr;
+	u64 phdr_sz = 0, note_off;
+
+	ehdr_ptr = (Elf64_Ehdr *)elfptr;
+
+	rc = update_note_header_size_elf64(ehdr_ptr);
+	if (rc < 0)
+		return rc;
+
+	rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz);
+	if (rc < 0)
+		return rc;
+
+	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
+	*notes_buf = alloc_elfnotes_buf(*notes_sz);
+	if (!*notes_buf)
+		return -ENOMEM;
+
+	rc = copy_notes_elf64(ehdr_ptr, *notes_buf);
+	if (rc < 0)
+		return rc;
+
 	/* Prepare merged PT_NOTE program header. */
 	phdr.p_type    = PT_NOTE;
 	phdr.p_flags   = 0;
 	note_off = sizeof(Elf64_Ehdr) +
 			(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
-	phdr.p_offset  = note_off;
+	phdr.p_offset  = roundup(note_off, PAGE_SIZE);
 	phdr.p_vaddr   = phdr.p_paddr = 0;
 	phdr.p_filesz  = phdr.p_memsz = phdr_sz;
 	phdr.p_align   = 0;
@@ -326,6 +501,8 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 	i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
 	*elfsz = *elfsz - i;
 	memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));
+	memset(elfptr + *elfsz, 0, i);
+	*elfsz = roundup(*elfsz, PAGE_SIZE);
 
 	/* Modify e_phnum to reflect merged headers. */
 	ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -333,27 +510,27 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 	return 0;
 }
 
-/* Merges all the PT_NOTE headers into one. */
-static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
-						struct list_head *vc_list)
+/**
+ * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry
+ *
+ * @ehdr_ptr: ELF header
+ *
+ * This function updates p_memsz member of each PT_NOTE entry in the
+ * program header table pointed to by @ehdr_ptr to real size of ELF
+ * note segment.
+ */
+static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
 {
-	int i, nr_ptnote=0, rc=0;
-	char *tmp;
-	Elf32_Ehdr *ehdr_ptr;
-	Elf32_Phdr phdr, *phdr_ptr;
+	int i, rc=0;
+	Elf32_Phdr *phdr_ptr;
 	Elf32_Nhdr *nhdr_ptr;
-	u64 phdr_sz = 0, note_off;
 
-	ehdr_ptr = (Elf32_Ehdr *)elfptr;
-	phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
+	phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
 	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
-		int j;
 		void *notes_section;
-		struct vmcore *new;
 		u64 offset, max_sz, sz, real_sz = 0;
 		if (phdr_ptr->p_type != PT_NOTE)
 			continue;
-		nr_ptnote++;
 		max_sz = phdr_ptr->p_memsz;
 		offset = phdr_ptr->p_offset;
 		notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -365,7 +542,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 			return rc;
 		}
 		nhdr_ptr = notes_section;
-		for (j = 0; j < max_sz; j += sz) {
+		while (real_sz < max_sz) {
 			if (nhdr_ptr->n_namesz == 0)
 				break;
 			sz = sizeof(Elf32_Nhdr) +
@@ -374,26 +551,122 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 			real_sz += sz;
 			nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
 		}
-
-		/* Add this contiguous chunk of notes section to vmcore list.*/
-		new = get_new_element();
-		if (!new) {
-			kfree(notes_section);
-			return -ENOMEM;
-		}
-		new->paddr = phdr_ptr->p_offset;
-		new->size = real_sz;
-		list_add_tail(&new->list, vc_list);
-		phdr_sz += real_sz;
 		kfree(notes_section);
+		phdr_ptr->p_memsz = real_sz;
+	}
+
+	return 0;
+}
+
+/**
+ * get_note_number_and_size_elf32 - get the number of PT_NOTE program
+ * headers and sum of real size of their ELF note segment headers and
+ * data.
+ *
+ * @ehdr_ptr: ELF header
+ * @nr_ptnote: buffer for the number of PT_NOTE program headers
+ * @sz_ptnote: buffer for size of unique PT_NOTE program header
+ *
+ * This function is used to merge multiple PT_NOTE program headers
+ * into a unique single one. The resulting unique entry will have
+ * @sz_ptnote in its phdr->p_mem.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf32
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr,
+						 int *nr_ptnote, u64 *sz_ptnote)
+{
+	int i;
+	Elf32_Phdr *phdr_ptr;
+
+	*nr_ptnote = *sz_ptnote = 0;
+
+	phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		*nr_ptnote += 1;
+		*sz_ptnote += phdr_ptr->p_memsz;
+	}
+
+	return 0;
+}
+
+/**
+ * copy_notes_elf32 - copy ELF note segments in a given buffer
+ *
+ * @ehdr_ptr: ELF header
+ * @notes_buf: buffer into which ELF note segments are copied
+ *
+ * This function is used to copy ELF note segment in the 1st kernel
+ * into the buffer @notes_buf in the 2nd kernel. It is assumed that
+ * size of the buffer @notes_buf is equal to or larger than sum of the
+ * real ELF note segment headers and data.
+ *
+ * It is assumed that program headers with PT_NOTE type pointed to by
+ * @ehdr_ptr has already been updated by update_note_header_size_elf32
+ * and each of PT_NOTE program headers has actual ELF note segment
+ * size in its p_memsz member.
+ */
+static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf)
+{
+	int i, rc=0;
+	Elf32_Phdr *phdr_ptr;
+
+	phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1);
+
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		u64 offset;
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		offset = phdr_ptr->p_offset;
+		rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
+		if (rc < 0)
+			return rc;
+		notes_buf += phdr_ptr->p_memsz;
 	}
 
+	return 0;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
+					   char **notes_buf, size_t *notes_sz)
+{
+	int i, nr_ptnote=0, rc=0;
+	char *tmp;
+	Elf32_Ehdr *ehdr_ptr;
+	Elf32_Phdr phdr;
+	u64 phdr_sz = 0, note_off;
+
+	ehdr_ptr = (Elf32_Ehdr *)elfptr;
+
+	rc = update_note_header_size_elf32(ehdr_ptr);
+	if (rc < 0)
+		return rc;
+
+	rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz);
+	if (rc < 0)
+		return rc;
+
+	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
+	*notes_buf = alloc_elfnotes_buf(*notes_sz);
+	if (!*notes_buf)
+		return -ENOMEM;
+
+	rc = copy_notes_elf32(ehdr_ptr, *notes_buf);
+	if (rc < 0)
+		return rc;
+
 	/* Prepare merged PT_NOTE program header. */
 	phdr.p_type    = PT_NOTE;
 	phdr.p_flags   = 0;
 	note_off = sizeof(Elf32_Ehdr) +
 			(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr);
-	phdr.p_offset  = note_off;
+	phdr.p_offset  = roundup(note_off, PAGE_SIZE);
 	phdr.p_vaddr   = phdr.p_paddr = 0;
 	phdr.p_filesz  = phdr.p_memsz = phdr_sz;
 	phdr.p_align   = 0;
@@ -407,6 +680,8 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 	i = (nr_ptnote - 1) * sizeof(Elf32_Phdr);
 	*elfsz = *elfsz - i;
 	memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr)));
+	memset(elfptr + *elfsz, 0, i);
+	*elfsz = roundup(*elfsz, PAGE_SIZE);
 
 	/* Modify e_phnum to reflect merged headers. */
 	ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -418,6 +693,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
  * the new offset fields of exported program headers. */
 static int __init process_ptload_program_headers_elf64(char *elfptr,
 						size_t elfsz,
+						size_t elfnotes_sz,
 						struct list_head *vc_list)
 {
 	int i;
@@ -429,32 +705,38 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
 	ehdr_ptr = (Elf64_Ehdr *)elfptr;
 	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
 
-	/* First program header is PT_NOTE header. */
-	vmcore_off = sizeof(Elf64_Ehdr) +
-			(ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) +
-			phdr_ptr->p_memsz; /* Note sections */
+	/* Skip Elf header, program headers and Elf note segment. */
+	vmcore_off = elfsz + elfnotes_sz;
 
 	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		u64 paddr, start, end, size;
+
 		if (phdr_ptr->p_type != PT_LOAD)
 			continue;
 
+		paddr = phdr_ptr->p_offset;
+		start = rounddown(paddr, PAGE_SIZE);
+		end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
+		size = end - start;
+
 		/* Add this contiguous chunk of memory to vmcore list.*/
 		new = get_new_element();
 		if (!new)
 			return -ENOMEM;
-		new->paddr = phdr_ptr->p_offset;
-		new->size = phdr_ptr->p_memsz;
+		new->paddr = start;
+		new->size = size;
 		list_add_tail(&new->list, vc_list);
 
 		/* Update the program header offset. */
-		phdr_ptr->p_offset = vmcore_off;
-		vmcore_off = vmcore_off + phdr_ptr->p_memsz;
+		phdr_ptr->p_offset = vmcore_off + (paddr - start);
+		vmcore_off = vmcore_off + size;
 	}
 	return 0;
 }
 
 static int __init process_ptload_program_headers_elf32(char *elfptr,
 						size_t elfsz,
+						size_t elfnotes_sz,
 						struct list_head *vc_list)
 {
 	int i;
@@ -466,43 +748,44 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
 	ehdr_ptr = (Elf32_Ehdr *)elfptr;
 	phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
 
-	/* First program header is PT_NOTE header. */
-	vmcore_off = sizeof(Elf32_Ehdr) +
-			(ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) +
-			phdr_ptr->p_memsz; /* Note sections */
+	/* Skip Elf header, program headers and Elf note segment. */
+	vmcore_off = elfsz + elfnotes_sz;
 
 	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		u64 paddr, start, end, size;
+
 		if (phdr_ptr->p_type != PT_LOAD)
 			continue;
 
+		paddr = phdr_ptr->p_offset;
+		start = rounddown(paddr, PAGE_SIZE);
+		end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
+		size = end - start;
+
 		/* Add this contiguous chunk of memory to vmcore list.*/
 		new = get_new_element();
 		if (!new)
 			return -ENOMEM;
-		new->paddr = phdr_ptr->p_offset;
-		new->size = phdr_ptr->p_memsz;
+		new->paddr = start;
+		new->size = size;
 		list_add_tail(&new->list, vc_list);
 
 		/* Update the program header offset */
-		phdr_ptr->p_offset = vmcore_off;
-		vmcore_off = vmcore_off + phdr_ptr->p_memsz;
+		phdr_ptr->p_offset = vmcore_off + (paddr - start);
+		vmcore_off = vmcore_off + size;
 	}
 	return 0;
 }
 
 /* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets_elf64(char *elfptr,
-						struct list_head *vc_list)
+static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
+					   struct list_head *vc_list)
 {
 	loff_t vmcore_off;
-	Elf64_Ehdr *ehdr_ptr;
 	struct vmcore *m;
 
-	ehdr_ptr = (Elf64_Ehdr *)elfptr;
-
-	/* Skip Elf header and program headers. */
-	vmcore_off = sizeof(Elf64_Ehdr) +
-			(ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr);
+	/* Skip Elf header, program headers and Elf note segment. */
+	vmcore_off = elfsz + elfnotes_sz;
 
 	list_for_each_entry(m, vc_list, list) {
 		m->offset = vmcore_off;
@@ -510,24 +793,12 @@ static void __init set_vmcore_list_offsets_elf64(char *elfptr,
 	}
 }
 
-/* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets_elf32(char *elfptr,
-						struct list_head *vc_list)
+static void free_elfcorebuf(void)
 {
-	loff_t vmcore_off;
-	Elf32_Ehdr *ehdr_ptr;
-	struct vmcore *m;
-
-	ehdr_ptr = (Elf32_Ehdr *)elfptr;
-
-	/* Skip Elf header and program headers. */
-	vmcore_off = sizeof(Elf32_Ehdr) +
-			(ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr);
-
-	list_for_each_entry(m, vc_list, list) {
-		m->offset = vmcore_off;
-		vmcore_off += m->size;
-	}
+	free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
+	elfcorebuf = NULL;
+	vfree(elfnotes_buf);
+	elfnotes_buf = NULL;
 }
 
 static int __init parse_crash_elf64_headers(void)
@@ -553,37 +824,37 @@ static int __init parse_crash_elf64_headers(void)
 		ehdr.e_ehsize != sizeof(Elf64_Ehdr) ||
 		ehdr.e_phentsize != sizeof(Elf64_Phdr) ||
 		ehdr.e_phnum == 0) {
-		printk(KERN_WARNING "Warning: Core image elf header is not"
-					"sane\n");
+		pr_warn("Warning: Core image elf header is not sane\n");
 		return -EINVAL;
 	}
 
 	/* Read in all elf headers. */
-	elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr);
-	elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL);
+	elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) +
+				ehdr.e_phnum * sizeof(Elf64_Phdr);
+	elfcorebuf_sz = elfcorebuf_sz_orig;
+	elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					      get_order(elfcorebuf_sz_orig));
 	if (!elfcorebuf)
 		return -ENOMEM;
 	addr = elfcorehdr_addr;
-	rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0);
-	if (rc < 0) {
-		kfree(elfcorebuf);
-		return rc;
-	}
+	rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
+	if (rc < 0)
+		goto fail;
 
 	/* Merge all PT_NOTE headers into one. */
-	rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list);
-	if (rc) {
-		kfree(elfcorebuf);
-		return rc;
-	}
+	rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz,
+				      &elfnotes_buf, &elfnotes_sz);
+	if (rc)
+		goto fail;
 	rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
-							&vmcore_list);
-	if (rc) {
-		kfree(elfcorebuf);
-		return rc;
-	}
-	set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list);
+						  elfnotes_sz, &vmcore_list);
+	if (rc)
+		goto fail;
+	set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
 	return 0;
+fail:
+	free_elfcorebuf();
+	return rc;
 }
 
 static int __init parse_crash_elf32_headers(void)
@@ -609,37 +880,36 @@ static int __init parse_crash_elf32_headers(void)
 		ehdr.e_ehsize != sizeof(Elf32_Ehdr) ||
 		ehdr.e_phentsize != sizeof(Elf32_Phdr) ||
 		ehdr.e_phnum == 0) {
-		printk(KERN_WARNING "Warning: Core image elf header is not"
-					"sane\n");
+		pr_warn("Warning: Core image elf header is not sane\n");
 		return -EINVAL;
 	}
 
 	/* Read in all elf headers. */
-	elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
-	elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL);
+	elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
+	elfcorebuf_sz = elfcorebuf_sz_orig;
+	elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					      get_order(elfcorebuf_sz_orig));
 	if (!elfcorebuf)
 		return -ENOMEM;
 	addr = elfcorehdr_addr;
-	rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0);
-	if (rc < 0) {
-		kfree(elfcorebuf);
-		return rc;
-	}
+	rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
+	if (rc < 0)
+		goto fail;
 
 	/* Merge all PT_NOTE headers into one. */
-	rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list);
-	if (rc) {
-		kfree(elfcorebuf);
-		return rc;
-	}
+	rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz,
+				      &elfnotes_buf, &elfnotes_sz);
+	if (rc)
+		goto fail;
 	rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz,
-								&vmcore_list);
-	if (rc) {
-		kfree(elfcorebuf);
-		return rc;
-	}
-	set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list);
+						  elfnotes_sz, &vmcore_list);
+	if (rc)
+		goto fail;
+	set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
 	return 0;
+fail:
+	free_elfcorebuf();
+	return rc;
 }
 
 static int __init parse_crash_elf_headers(void)
@@ -653,8 +923,7 @@ static int __init parse_crash_elf_headers(void)
 	if (rc < 0)
 		return rc;
 	if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) {
-		printk(KERN_WARNING "Warning: Core image elf header"
-					" not found\n");
+		pr_warn("Warning: Core image elf header not found\n");
 		return -EINVAL;
 	}
 
@@ -662,21 +931,19 @@ static int __init parse_crash_elf_headers(void)
 		rc = parse_crash_elf64_headers();
 		if (rc)
 			return rc;
-
-		/* Determine vmcore size. */
-		vmcore_size = get_vmcore_size_elf64(elfcorebuf);
 	} else if (e_ident[EI_CLASS] == ELFCLASS32) {
 		rc = parse_crash_elf32_headers();
 		if (rc)
 			return rc;
-
-		/* Determine vmcore size. */
-		vmcore_size = get_vmcore_size_elf32(elfcorebuf);
 	} else {
-		printk(KERN_WARNING "Warning: Core image elf header is not"
-					" sane\n");
+		pr_warn("Warning: Core image elf header is not sane\n");
 		return -EINVAL;
 	}
+
+	/* Determine vmcore size. */
+	vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+				      &vmcore_list);
+
 	return 0;
 }
 
@@ -690,7 +957,7 @@ static int __init vmcore_init(void)
 		return rc;
 	rc = parse_crash_elf_headers();
 	if (rc) {
-		printk(KERN_WARNING "Kdump: vmcore not initialized\n");
+		pr_warn("Kdump: vmcore not initialized\n");
 		return rc;
 	}
 
@@ -707,7 +974,7 @@ void vmcore_cleanup(void)
 	struct list_head *pos, *next;
 
 	if (proc_vmcore) {
-		remove_proc_entry(proc_vmcore->name, proc_vmcore->parent);
+		proc_remove(proc_vmcore);
 		proc_vmcore = NULL;
 	}
 
@@ -719,7 +986,6 @@ void vmcore_cleanup(void)
 		list_del(&m->list);
 		kfree(m);
 	}
-	kfree(elfcorebuf);
-	elfcorebuf = NULL;
+	free_elfcorebuf();
 }
 EXPORT_SYMBOL_GPL(vmcore_cleanup);
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 43b12807a51..76a4eeb9298 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -44,7 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
 	rec.parent_ip = parent_ip;
 	pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
 	psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
-			  sizeof(rec), psinfo);
+			  0, sizeof(rec), psinfo);
 
 	local_irq_restore(flags);
 }
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 67de74ca85f..71bf5f4ae84 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -178,6 +178,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
 	if (p->psi->erase)
 		p->psi->erase(p->type, p->id, p->count,
 			      dentry->d_inode->i_ctime, p->psi);
+	else
+		return -EPERM;
 
 	return simple_unlink(dir, dentry);
 }
@@ -324,6 +326,15 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
 	case PSTORE_TYPE_MCE:
 		sprintf(name, "mce-%s-%lld", psname, id);
 		break;
+	case PSTORE_TYPE_PPC_RTAS:
+		sprintf(name, "rtas-%s-%lld", psname, id);
+		break;
+	case PSTORE_TYPE_PPC_OF:
+		sprintf(name, "powerpc-ofw-%s-%lld", psname, id);
+		break;
+	case PSTORE_TYPE_PPC_COMMON:
+		sprintf(name, "powerpc-common-%s-%lld", psname, id);
+		break;
 	case PSTORE_TYPE_UNKNOWN:
 		sprintf(name, "unknown-%s-%lld", psname, id);
 		break;
@@ -418,9 +429,25 @@ static struct file_system_type pstore_fs_type = {
 	.kill_sb	= pstore_kill_sb,
 };
 
+static struct kobject *pstore_kobj;
+
 static int __init init_pstore_fs(void)
 {
-	return register_filesystem(&pstore_fs_type);
+	int err = 0;
+
+	/* Create a convenient mount point for people to access pstore */
+	pstore_kobj = kobject_create_and_add("pstore", fs_kobj);
+	if (!pstore_kobj) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = register_filesystem(&pstore_fs_type);
+	if (err < 0)
+		kobject_put(pstore_kobj);
+
+out:
+	return err;
 }
 module_init(init_pstore_fs)
 
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 5ea2e77ff02..422962ae9fc 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -96,6 +96,27 @@ static const char *get_reason_str(enum kmsg_dump_reason reason)
 	}
 }
 
+bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
+{
+	/*
+	 * In case of NMI path, pstore shouldn't be blocked
+	 * regardless of reason.
+	 */
+	if (in_nmi())
+		return true;
+
+	switch (reason) {
+	/* In panic case, other cpus are stopped by smp_send_stop(). */
+	case KMSG_DUMP_PANIC:
+	/* Emergency restart shouldn't be blocked by spin lock. */
+	case KMSG_DUMP_EMERG:
+		return true;
+	default:
+		return false;
+	}
+}
+EXPORT_SYMBOL_GPL(pstore_cannot_block_path);
+
 /*
  * callback from kmsg_dump. (s2,l2) has the most recently
  * written bytes, older bytes are in (s1,l1). Save as much
@@ -114,10 +135,12 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 
 	why = get_reason_str(reason);
 
-	if (in_nmi()) {
-		is_locked = spin_trylock(&psinfo->buf_lock);
-		if (!is_locked)
-			pr_err("pstore dump routine blocked in NMI, may corrupt error record\n");
+	if (pstore_cannot_block_path(reason)) {
+		is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags);
+		if (!is_locked) {
+			pr_err("pstore dump routine blocked in %s path, may corrupt error record\n"
+				       , in_nmi() ? "NMI" : why);
+		}
 	} else
 		spin_lock_irqsave(&psinfo->buf_lock, flags);
 	oopscount++;
@@ -136,16 +159,16 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 			break;
 
 		ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
-				    oopscount, hsize + len, psinfo);
+				    oopscount, hsize, hsize + len, psinfo);
 		if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
 			pstore_new_entry = 1;
 
 		total += hsize + len;
 		part++;
 	}
-	if (in_nmi()) {
+	if (pstore_cannot_block_path(reason)) {
 		if (is_locked)
-			spin_unlock(&psinfo->buf_lock);
+			spin_unlock_irqrestore(&psinfo->buf_lock, flags);
 	} else
 		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
 }
@@ -173,7 +196,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
 			spin_lock_irqsave(&psinfo->buf_lock, flags);
 		}
 		memcpy(psinfo->buf, s, c);
-		psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo);
+		psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo);
 		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
 		s += c;
 		c = e - s;
@@ -198,9 +221,11 @@ static void pstore_register_console(void) {}
 static int pstore_write_compat(enum pstore_type_id type,
 			       enum kmsg_dump_reason reason,
 			       u64 *id, unsigned int part, int count,
-			       size_t size, struct pstore_info *psi)
+			       size_t hsize, size_t size,
+			       struct pstore_info *psi)
 {
-	return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi);
+	return psi->write_buf(type, reason, id, part, psinfo->buf, hsize,
+			     size, psi);
 }
 
 /*
@@ -216,17 +241,15 @@ int pstore_register(struct pstore_info *psi)
 {
 	struct module *owner = psi->owner;
 
+	if (backend && strcmp(backend, psi->name))
+		return -EPERM;
+
 	spin_lock(&pstore_lock);
 	if (psinfo) {
 		spin_unlock(&pstore_lock);
 		return -EBUSY;
 	}
 
-	if (backend && strcmp(backend, psi->name)) {
-		spin_unlock(&pstore_lock);
-		return -EINVAL;
-	}
-
 	if (!psi->write)
 		psi->write = pstore_write_compat;
 	psinfo = psi;
@@ -251,6 +274,9 @@ int pstore_register(struct pstore_info *psi)
 		add_timer(&pstore_timer);
 	}
 
+	pr_info("pstore: Registered %s as persistent store backend\n",
+		psi->name);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(pstore_register);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 7003e5266f2..a6119f9469e 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -83,7 +83,7 @@ struct ramoops_context {
 	size_t console_size;
 	size_t ftrace_size;
 	int dump_oops;
-	int ecc_size;
+	struct persistent_ram_ecc_info ecc_info;
 	unsigned int max_dump_cnt;
 	unsigned int dump_write_cnt;
 	unsigned int dump_read_cnt;
@@ -136,6 +136,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
 				   char **buf, struct pstore_info *psi)
 {
 	ssize_t size;
+	ssize_t ecc_notice_size;
 	struct ramoops_context *cxt = psi->data;
 	struct persistent_ram_zone *prz;
 
@@ -156,23 +157,33 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
 	time->tv_nsec = 0;
 
 	size = persistent_ram_old_size(prz);
-	*buf = kmalloc(size, GFP_KERNEL);
+
+	/* ECC correction notice */
+	ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
+
+	*buf = kmalloc(size + ecc_notice_size + 1, GFP_KERNEL);
 	if (*buf == NULL)
 		return -ENOMEM;
+
 	memcpy(*buf, persistent_ram_old(prz), size);
+	persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1);
 
-	return size;
+	return size + ecc_notice_size;
 }
 
 static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
 {
 	char *hdr;
-	struct timeval timestamp;
+	struct timespec timestamp;
 	size_t len;
 
-	do_gettimeofday(&timestamp);
+	/* Report zeroed timestamp if called before timekeeping has resumed. */
+	if (__getnstimeofday(&timestamp)) {
+		timestamp.tv_sec = 0;
+		timestamp.tv_nsec = 0;
+	}
 	hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n",
-		(long)timestamp.tv_sec, (long)timestamp.tv_usec);
+		(long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000));
 	WARN_ON_ONCE(!hdr);
 	len = hdr ? strlen(hdr) : 0;
 	persistent_ram_write(prz, hdr, len);
@@ -184,7 +195,8 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
 static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
 					    enum kmsg_dump_reason reason,
 					    u64 *id, unsigned int part,
-					    const char *buf, size_t size,
+					    const char *buf,
+					    size_t hsize, size_t size,
 					    struct pstore_info *psi)
 {
 	struct ramoops_context *cxt = psi->data;
@@ -319,7 +331,8 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
 	for (i = 0; i < cxt->max_dump_cnt; i++) {
 		size_t sz = cxt->record_size;
 
-		cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, cxt->ecc_size);
+		cxt->przs[i] = persistent_ram_new(*paddr, sz, 0,
+						  &cxt->ecc_info);
 		if (IS_ERR(cxt->przs[i])) {
 			err = PTR_ERR(cxt->przs[i]);
 			dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
@@ -349,7 +362,7 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
 		return -ENOMEM;
 	}
 
-	*prz = persistent_ram_new(*paddr, sz, sig, cxt->ecc_size);
+	*prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info);
 	if (IS_ERR(*prz)) {
 		int err = PTR_ERR(*prz);
 
@@ -387,8 +400,6 @@ static int ramoops_probe(struct platform_device *pdev)
 		goto fail_out;
 	}
 
-	if (!is_power_of_2(pdata->mem_size))
-		pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
 	if (!is_power_of_2(pdata->record_size))
 		pdata->record_size = rounddown_pow_of_two(pdata->record_size);
 	if (!is_power_of_2(pdata->console_size))
@@ -403,7 +414,7 @@ static int ramoops_probe(struct platform_device *pdev)
 	cxt->console_size = pdata->console_size;
 	cxt->ftrace_size = pdata->ftrace_size;
 	cxt->dump_oops = pdata->dump_oops;
-	cxt->ecc_size = pdata->ecc_size;
+	cxt->ecc_info = pdata->ecc_info;
 
 	paddr = cxt->phys_addr;
 
@@ -426,6 +437,7 @@ static int ramoops_probe(struct platform_device *pdev)
 		pr_err("memory size too small, minimum is %zu\n",
 			cxt->console_size + cxt->record_size +
 			cxt->ftrace_size);
+		err = -EINVAL;
 		goto fail_cnt;
 	}
 
@@ -443,6 +455,7 @@ static int ramoops_probe(struct platform_device *pdev)
 	spin_lock_init(&cxt->pstore.buf_lock);
 	if (!cxt->pstore.buf) {
 		pr_err("cannot allocate pstore buffer\n");
+		err = -ENOMEM;
 		goto fail_clear;
 	}
 
@@ -461,9 +474,9 @@ static int ramoops_probe(struct platform_device *pdev)
 	record_size = pdata->record_size;
 	dump_oops = pdata->dump_oops;
 
-	pr_info("attached 0x%lx@0x%llx, ecc: %d\n",
+	pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n",
 		cxt->size, (unsigned long long)cxt->phys_addr,
-		cxt->ecc_size);
+		cxt->ecc_info.ecc_size, cxt->ecc_info.block_size);
 
 	return 0;
 
@@ -535,7 +548,7 @@ static void ramoops_register_dummy(void)
 	 * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
 	 * (using 1 byte for ECC isn't much of use anyway).
 	 */
-	dummy_data->ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc;
+	dummy_data->ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc;
 
 	dummy = platform_device_register_data(NULL, "ramoops", -1,
 			dummy_data, sizeof(struct ramoops_platform_data));
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 0306303be37..de272d42676 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -46,7 +46,7 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
 }
 
 /* increase and wrap the start pointer, returning the old value */
-static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
+static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
 {
 	int old;
 	int new;
@@ -62,7 +62,7 @@ static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
 }
 
 /* increase the size counter until it hits the max size */
-static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
+static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a)
 {
 	size_t old;
 	size_t new;
@@ -78,16 +78,63 @@ static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
 	} while (atomic_cmpxchg(&prz->buffer->size, old, new) != old);
 }
 
+static DEFINE_RAW_SPINLOCK(buffer_lock);
+
+/* increase and wrap the start pointer, returning the old value */
+static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
+{
+	int old;
+	int new;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&buffer_lock, flags);
+
+	old = atomic_read(&prz->buffer->start);
+	new = old + a;
+	while (unlikely(new > prz->buffer_size))
+		new -= prz->buffer_size;
+	atomic_set(&prz->buffer->start, new);
+
+	raw_spin_unlock_irqrestore(&buffer_lock, flags);
+
+	return old;
+}
+
+/* increase the size counter until it hits the max size */
+static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a)
+{
+	size_t old;
+	size_t new;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&buffer_lock, flags);
+
+	old = atomic_read(&prz->buffer->size);
+	if (old == prz->buffer_size)
+		goto exit;
+
+	new = old + a;
+	if (new > prz->buffer_size)
+		new = prz->buffer_size;
+	atomic_set(&prz->buffer->size, new);
+
+exit:
+	raw_spin_unlock_irqrestore(&buffer_lock, flags);
+}
+
+static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic;
+static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic;
+
 static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
 	uint8_t *data, size_t len, uint8_t *ecc)
 {
 	int i;
-	uint16_t par[prz->ecc_size];
+	uint16_t par[prz->ecc_info.ecc_size];
 
 	/* Initialize the parity buffer */
 	memset(par, 0, sizeof(par));
 	encode_rs8(prz->rs_decoder, data, len, par, 0);
-	for (i = 0; i < prz->ecc_size; i++)
+	for (i = 0; i < prz->ecc_info.ecc_size; i++)
 		ecc[i] = par[i];
 }
 
@@ -95,9 +142,9 @@ static int persistent_ram_decode_rs8(struct persistent_ram_zone *prz,
 	void *data, size_t len, uint8_t *ecc)
 {
 	int i;
-	uint16_t par[prz->ecc_size];
+	uint16_t par[prz->ecc_info.ecc_size];
 
-	for (i = 0; i < prz->ecc_size; i++)
+	for (i = 0; i < prz->ecc_info.ecc_size; i++)
 		par[i] = ecc[i];
 	return decode_rs8(prz->rs_decoder, data, par, len,
 				NULL, 0, NULL, 0, NULL);
@@ -110,15 +157,15 @@ static void notrace persistent_ram_update_ecc(struct persistent_ram_zone *prz,
 	uint8_t *buffer_end = buffer->data + prz->buffer_size;
 	uint8_t *block;
 	uint8_t *par;
-	int ecc_block_size = prz->ecc_block_size;
-	int ecc_size = prz->ecc_size;
-	int size = prz->ecc_block_size;
+	int ecc_block_size = prz->ecc_info.block_size;
+	int ecc_size = prz->ecc_info.ecc_size;
+	int size = ecc_block_size;
 
-	if (!prz->ecc_size)
+	if (!ecc_size)
 		return;
 
 	block = buffer->data + (start & ~(ecc_block_size - 1));
-	par = prz->par_buffer + (start / ecc_block_size) * prz->ecc_size;
+	par = prz->par_buffer + (start / ecc_block_size) * ecc_size;
 
 	do {
 		if (block + ecc_block_size > buffer_end)
@@ -133,7 +180,7 @@ static void persistent_ram_update_header_ecc(struct persistent_ram_zone *prz)
 {
 	struct persistent_ram_buffer *buffer = prz->buffer;
 
-	if (!prz->ecc_size)
+	if (!prz->ecc_info.ecc_size)
 		return;
 
 	persistent_ram_encode_rs8(prz, (uint8_t *)buffer, sizeof(*buffer),
@@ -146,14 +193,14 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz)
 	uint8_t *block;
 	uint8_t *par;
 
-	if (!prz->ecc_size)
+	if (!prz->ecc_info.ecc_size)
 		return;
 
 	block = buffer->data;
 	par = prz->par_buffer;
 	while (block < buffer->data + buffer_size(prz)) {
 		int numerr;
-		int size = prz->ecc_block_size;
+		int size = prz->ecc_info.block_size;
 		if (block + size > buffer->data + prz->buffer_size)
 			size = buffer->data + prz->buffer_size - block;
 		numerr = persistent_ram_decode_rs8(prz, block, size, par);
@@ -166,44 +213,49 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz)
 				block);
 			prz->bad_blocks++;
 		}
-		block += prz->ecc_block_size;
-		par += prz->ecc_size;
+		block += prz->ecc_info.block_size;
+		par += prz->ecc_info.ecc_size;
 	}
 }
 
 static int persistent_ram_init_ecc(struct persistent_ram_zone *prz,
-				   int ecc_size)
+				   struct persistent_ram_ecc_info *ecc_info)
 {
 	int numerr;
 	struct persistent_ram_buffer *buffer = prz->buffer;
 	int ecc_blocks;
 	size_t ecc_total;
-	int ecc_symsize = 8;
-	int ecc_poly = 0x11d;
 
-	if (!ecc_size)
+	if (!ecc_info || !ecc_info->ecc_size)
 		return 0;
 
-	prz->ecc_block_size = 128;
-	prz->ecc_size = ecc_size;
+	prz->ecc_info.block_size = ecc_info->block_size ?: 128;
+	prz->ecc_info.ecc_size = ecc_info->ecc_size ?: 16;
+	prz->ecc_info.symsize = ecc_info->symsize ?: 8;
+	prz->ecc_info.poly = ecc_info->poly ?: 0x11d;
 
-	ecc_blocks = DIV_ROUND_UP(prz->buffer_size, prz->ecc_block_size);
-	ecc_total = (ecc_blocks + 1) * prz->ecc_size;
+	ecc_blocks = DIV_ROUND_UP(prz->buffer_size - prz->ecc_info.ecc_size,
+				  prz->ecc_info.block_size +
+				  prz->ecc_info.ecc_size);
+	ecc_total = (ecc_blocks + 1) * prz->ecc_info.ecc_size;
 	if (ecc_total >= prz->buffer_size) {
 		pr_err("%s: invalid ecc_size %u (total %zu, buffer size %zu)\n",
-		       __func__, prz->ecc_size, ecc_total, prz->buffer_size);
+		       __func__, prz->ecc_info.ecc_size,
+		       ecc_total, prz->buffer_size);
 		return -EINVAL;
 	}
 
 	prz->buffer_size -= ecc_total;
 	prz->par_buffer = buffer->data + prz->buffer_size;
-	prz->par_header = prz->par_buffer + ecc_blocks * prz->ecc_size;
+	prz->par_header = prz->par_buffer +
+			  ecc_blocks * prz->ecc_info.ecc_size;
 
 	/*
 	 * first consecutive root is 0
 	 * primitive element to generate roots = 1
 	 */
-	prz->rs_decoder = init_rs(ecc_symsize, ecc_poly, 0, 1, prz->ecc_size);
+	prz->rs_decoder = init_rs(prz->ecc_info.symsize, prz->ecc_info.poly,
+				  0, 1, prz->ecc_info.ecc_size);
 	if (prz->rs_decoder == NULL) {
 		pr_info("persistent_ram: init_rs failed\n");
 		return -EINVAL;
@@ -230,6 +282,9 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
 {
 	ssize_t ret;
 
+	if (!prz->ecc_info.ecc_size)
+		return 0;
+
 	if (prz->corrected_bytes || prz->bad_blocks)
 		ret = snprintf(str, len, ""
 			"\n%d Corrected bytes, %d unrecoverable blocks\n",
@@ -364,6 +419,9 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size)
 		return NULL;
 	}
 
+	buffer_start_add = buffer_start_add_locked;
+	buffer_size_add = buffer_size_add_locked;
+
 	return ioremap(start, size);
 }
 
@@ -391,11 +449,11 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
 }
 
 static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
-				    int ecc_size)
+				    struct persistent_ram_ecc_info *ecc_info)
 {
 	int ret;
 
-	ret = persistent_ram_init_ecc(prz, ecc_size);
+	ret = persistent_ram_init_ecc(prz, ecc_info);
 	if (ret)
 		return ret;
 
@@ -444,7 +502,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
 }
 
 struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
-					       u32 sig, int ecc_size)
+			u32 sig, struct persistent_ram_ecc_info *ecc_info)
 {
 	struct persistent_ram_zone *prz;
 	int ret = -ENOMEM;
@@ -459,7 +517,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
 	if (ret)
 		goto err;
 
-	ret = persistent_ram_post_init(prz, sig, ecc_size);
+	ret = persistent_ram_post_init(prz, sig, ecc_info);
 	if (ret)
 		goto err;
 
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 7b0329468a5..b218f965817 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -14,9 +14,9 @@
 #include <linux/buffer_head.h>
 #include "qnx4.h"
 
-static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int qnx4_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	unsigned int offset;
 	struct buffer_head *bh;
 	struct qnx4_inode_entry *de;
@@ -26,48 +26,44 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int size;
 
 	QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
-	QNX4DEBUG((KERN_INFO "filp->f_pos         = %ld\n", (long) filp->f_pos));
+	QNX4DEBUG((KERN_INFO "pos                 = %ld\n", (long) ctx->pos));
 
-	while (filp->f_pos < inode->i_size) {
-		blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS );
+	while (ctx->pos < inode->i_size) {
+		blknum = qnx4_block_map(inode, ctx->pos >> QNX4_BLOCK_SIZE_BITS);
 		bh = sb_bread(inode->i_sb, blknum);
-		if(bh==NULL) {
+		if (bh == NULL) {
 			printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum);
-			break;
+			return 0;
 		}
-		ix = (int)(filp->f_pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
-		while (ix < QNX4_INODES_PER_BLOCK) {
+		ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
+		for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) {
 			offset = ix * QNX4_DIR_ENTRY_SIZE;
 			de = (struct qnx4_inode_entry *) (bh->b_data + offset);
-			size = strlen(de->di_fname);
-			if (size) {
-				if ( !( de->di_status & QNX4_FILE_LINK ) && size > QNX4_SHORT_NAME_MAX )
-					size = QNX4_SHORT_NAME_MAX;
-				else if ( size > QNX4_NAME_MAX )
-					size = QNX4_NAME_MAX;
-
-				if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) {
-					QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
-					if ( ( de->di_status & QNX4_FILE_LINK ) == 0 )
-						ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
-					else {
-						le  = (struct qnx4_link_info*)de;
-						ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
-							QNX4_INODES_PER_BLOCK +
-							le->dl_inode_ndx;
-					}
-					if (filldir(dirent, de->di_fname, size, filp->f_pos, ino, DT_UNKNOWN) < 0) {
-						brelse(bh);
-						goto out;
-					}
-				}
+			if (!de->di_fname[0])
+				continue;
+			if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)))
+				continue;
+			if (!(de->di_status & QNX4_FILE_LINK))
+				size = QNX4_SHORT_NAME_MAX;
+			else
+				size = QNX4_NAME_MAX;
+			size = strnlen(de->di_fname, size);
+			QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
+			if (!(de->di_status & QNX4_FILE_LINK))
+				ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
+			else {
+				le  = (struct qnx4_link_info*)de;
+				ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
+					QNX4_INODES_PER_BLOCK +
+					le->dl_inode_ndx;
+			}
+			if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) {
+				brelse(bh);
+				return 0;
 			}
-			ix++;
-			filp->f_pos += QNX4_DIR_ENTRY_SIZE;
 		}
 		brelse(bh);
 	}
-out:
 	return 0;
 }
 
@@ -75,7 +71,7 @@ const struct file_operations qnx4_dir_operations =
 {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= qnx4_readdir,
+	.iterate	= qnx4_readdir,
 	.fsync		= generic_file_fsync,
 };
 
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 43098bb5723..2e8caa62da7 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -412,6 +412,7 @@ static struct file_system_type qnx4_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("qnx4");
 
 static int __init init_qnx4_fs(void)
 {
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index dc597353db3..15b7d92ed60 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -65,8 +65,8 @@ static struct qnx6_long_filename *qnx6_longname(struct super_block *sb,
 
 static int qnx6_dir_longfilename(struct inode *inode,
 			struct qnx6_long_dir_entry *de,
-			void *dirent, loff_t pos,
-			unsigned de_inode, filldir_t filldir)
+			struct dir_context *ctx,
+			unsigned de_inode)
 {
 	struct qnx6_long_filename *lf;
 	struct super_block *s = inode->i_sb;
@@ -104,8 +104,7 @@ static int qnx6_dir_longfilename(struct inode *inode,
 
 	QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n",
 					lf_size, lf->lf_fname, de_inode));
-	if (filldir(dirent, lf->lf_fname, lf_size, pos, de_inode,
-			DT_UNKNOWN) < 0) {
+	if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) {
 		qnx6_put_page(page);
 		return 0;
 	}
@@ -115,18 +114,19 @@ static int qnx6_dir_longfilename(struct inode *inode,
 	return 1;
 }
 
-static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int qnx6_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct super_block *s = inode->i_sb;
 	struct qnx6_sb_info *sbi = QNX6_SB(s);
-	loff_t pos = filp->f_pos & (QNX6_DIR_ENTRY_SIZE - 1);
+	loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1);
 	unsigned long npages = dir_pages(inode);
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE;
 	bool done = false;
 
-	if (filp->f_pos >= inode->i_size)
+	ctx->pos = pos;
+	if (ctx->pos >= inode->i_size)
 		return 0;
 
 	for ( ; !done && n < npages; n++, start = 0) {
@@ -137,11 +137,11 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 		if (IS_ERR(page)) {
 			printk(KERN_ERR "qnx6_readdir: read failed\n");
-			filp->f_pos = (n + 1) << PAGE_CACHE_SHIFT;
+			ctx->pos = (n + 1) << PAGE_CACHE_SHIFT;
 			return PTR_ERR(page);
 		}
 		de = ((struct qnx6_dir_entry *)page_address(page)) + start;
-		for (; i < limit; i++, de++, pos += QNX6_DIR_ENTRY_SIZE) {
+		for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) {
 			int size = de->de_size;
 			u32 no_inode = fs32_to_cpu(sbi, de->de_inode);
 
@@ -154,8 +154,7 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				   structure / block */
 				if (!qnx6_dir_longfilename(inode,
 					(struct qnx6_long_dir_entry *)de,
-					dirent, pos, no_inode,
-					filldir)) {
+					ctx, no_inode)) {
 					done = true;
 					break;
 				}
@@ -163,9 +162,8 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s"
 				   " inode:%u\n", size, de->de_fname,
 							no_inode));
-				if (filldir(dirent, de->de_fname, size,
-				      pos, no_inode, DT_UNKNOWN)
-					< 0) {
+				if (!dir_emit(ctx, de->de_fname, size,
+				      no_inode, DT_UNKNOWN)) {
 					done = true;
 					break;
 				}
@@ -173,7 +171,6 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		qnx6_put_page(page);
 	}
-	filp->f_pos = pos;
 	return 0;
 }
 
@@ -282,7 +279,7 @@ found:
 const struct file_operations qnx6_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= qnx6_readdir,
+	.iterate	= qnx6_readdir,
 	.fsync		= generic_file_fsync,
 };
 
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index b6addf56048..8d941edfefa 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -285,7 +285,7 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s,
 		if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) {
 			/* we got a big endian fs */
 			QNX6DEBUG((KERN_INFO "qnx6: fs got different"
-					" endianess.\n"));
+					" endianness.\n"));
 			return bh;
 		} else
 			sbi->s_bytesex = BYTESEX_LE;
@@ -672,6 +672,7 @@ static struct file_system_type qnx6_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("qnx6");
 
 static int __init init_qnx6_fs(void)
 {
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 05ae3c97f7a..fbad622841f 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1439,8 +1439,11 @@ static void __dquot_initialize(struct inode *inode, int type)
 			 * did a write before quota was turned on
 			 */
 			rsv = inode_get_rsv_space(inode);
-			if (unlikely(rsv))
+			if (unlikely(rsv)) {
+				spin_lock(&dq_data_lock);
 				dquot_resv_space(inode->i_dquot[cnt], rsv);
+				spin_unlock(&dq_data_lock);
+			}
 		}
 	}
 out_err:
@@ -2582,7 +2585,7 @@ static int do_proc_dqstats(struct ctl_table *table, int write,
 	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 
-static ctl_table fs_dqstats_table[] = {
+static struct ctl_table fs_dqstats_table[] = {
 	{
 		.procname	= "lookups",
 		.data		= &dqstats.stat[DQST_LOOKUPS],
@@ -2651,7 +2654,7 @@ static ctl_table fs_dqstats_table[] = {
 	{ },
 };
 
-static ctl_table fs_table[] = {
+static struct ctl_table fs_table[] = {
 	{
 		.procname	= "quota",
 		.mode		= 0555,
@@ -2660,7 +2663,7 @@ static ctl_table fs_table[] = {
 	{ },
 };
 
-static ctl_table sys_table[] = {
+static struct ctl_table sys_table[] = {
 	{
 		.procname	= "fs",
 		.mode		= 0555,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index d5378d02858..8d5b438cc18 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -202,7 +202,7 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 					    unsigned long pgoff, unsigned long flags)
 {
 	unsigned long maxpages, lpages, nr, loop, ret;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct page **pages = NULL, **ptr, *page;
 	loff_t isize;
 
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index eab8c09d380..c24f1e10b94 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -260,6 +260,7 @@ static struct file_system_type ramfs_fs_type = {
 	.name		= "ramfs",
 	.mount		= ramfs_mount,
 	.kill_sb	= ramfs_kill_sb,
+	.fs_flags	= FS_USERNS_MOUNT,
 };
 static struct file_system_type rootfs_fs_type = {
 	.name		= "rootfs",
diff --git a/fs/read_write.c b/fs/read_write.c
index bb34af31528..122a3846d9e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,17 +9,23 @@
 #include <linux/fcntl.h>
 #include <linux/file.h>
 #include <linux/uio.h>
+#include <linux/aio.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
 #include <linux/export.h>
 #include <linux/syscalls.h>
 #include <linux/pagemap.h>
 #include <linux/splice.h>
-#include "read_write.h"
+#include <linux/compat.h>
+#include "internal.h"
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
+typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
+typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
+		unsigned long, loff_t);
+
 const struct file_operations generic_ro_fops = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -35,8 +41,19 @@ static inline int unsigned_offsets(struct file *file)
 	return file->f_mode & FMODE_UNSIGNED_OFFSET;
 }
 
-static loff_t lseek_execute(struct file *file, struct inode *inode,
-		loff_t offset, loff_t maxsize)
+/**
+ * vfs_setpos - update the file offset for lseek
+ * @file:	file structure in question
+ * @offset:	file offset to seek to
+ * @maxsize:	maximum file size
+ *
+ * This is a low-level filesystem helper for updating the file offset to
+ * the value specified by @offset if the given offset is valid and it is
+ * not equal to the current file offset.
+ *
+ * Return the specified offset on success and -EINVAL on invalid offset.
+ */
+loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
 {
 	if (offset < 0 && !unsigned_offsets(file))
 		return -EINVAL;
@@ -49,6 +66,7 @@ static loff_t lseek_execute(struct file *file, struct inode *inode,
 	}
 	return offset;
 }
+EXPORT_SYMBOL(vfs_setpos);
 
 /**
  * generic_file_llseek_size - generic llseek implementation for regular files
@@ -70,8 +88,6 @@ loff_t
 generic_file_llseek_size(struct file *file, loff_t offset, int whence,
 		loff_t maxsize, loff_t eof)
 {
-	struct inode *inode = file->f_mapping->host;
-
 	switch (whence) {
 	case SEEK_END:
 		offset += eof;
@@ -91,8 +107,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
 		 * like SEEK_SET.
 		 */
 		spin_lock(&file->f_lock);
-		offset = lseek_execute(file, inode, file->f_pos + offset,
-				       maxsize);
+		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 		spin_unlock(&file->f_lock);
 		return offset;
 	case SEEK_DATA:
@@ -114,7 +129,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
 		break;
 	}
 
-	return lseek_execute(file, inode, offset, maxsize);
+	return vfs_setpos(file, offset, maxsize);
 }
 EXPORT_SYMBOL(generic_file_llseek_size);
 
@@ -126,7 +141,7 @@ EXPORT_SYMBOL(generic_file_llseek_size);
  *
  * This is a generic implemenation of ->llseek useable for all normal local
  * filesystems.  It just updates the file offset to the value specified by
- * @offset and @whence under i_mutex.
+ * @offset and @whence.
  */
 loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 {
@@ -139,6 +154,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 EXPORT_SYMBOL(generic_file_llseek);
 
 /**
+ * fixed_size_llseek - llseek implementation for fixed-sized devices
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @whence:	type of seek
+ * @size:	size of the file
+ *
+ */
+loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
+{
+	switch (whence) {
+	case SEEK_SET: case SEEK_CUR: case SEEK_END:
+		return generic_file_llseek_size(file, offset, whence,
+						size, size);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(fixed_size_llseek);
+
+/**
  * noop_llseek - No Operation Performed llseek implementation
  * @file:	file structure to seek on
  * @offset:	file offset to seek to
@@ -163,7 +198,7 @@ EXPORT_SYMBOL(no_llseek);
 
 loff_t default_llseek(struct file *file, loff_t offset, int whence)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	loff_t retval;
 
 	mutex_lock(&inode->i_mutex);
@@ -247,6 +282,13 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 	return retval;
 }
 
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
+{
+	return sys_lseek(fd, offset, whence);
+}
+#endif
+
 #ifdef __ARCH_WANT_SYS_LLSEEK
 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 		unsigned long, offset_low, loff_t __user *, result,
@@ -278,19 +320,18 @@ out_putf:
 }
 #endif
 
-
 /*
  * rw_verify_area doesn't like huge counts. We limit
  * them to something that fits in "int" so that others
  * won't have to do range checks all the time.
  */
-int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
+int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 {
 	struct inode *inode;
 	loff_t pos;
 	int retval = -EINVAL;
 
-	inode = file->f_path.dentry->d_inode;
+	inode = file_inode(file);
 	if (unlikely((ssize_t) count < 0))
 		return retval;
 	pos = *ppos;
@@ -318,16 +359,6 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
 	return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
 }
 
-static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
-{
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	if (!kiocbIsKicked(iocb))
-		schedule();
-	else
-		kiocbClearKicked(iocb);
-	__set_current_state(TASK_RUNNING);
-}
-
 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 {
 	struct iovec iov = { .iov_base = buf, .iov_len = len };
@@ -339,13 +370,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
 	kiocb.ki_left = len;
 	kiocb.ki_nbytes = len;
 
-	for (;;) {
-		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
-		if (ret != -EIOCBRETRY)
-			break;
-		wait_on_retry_sync_kiocb(&kiocb);
-	}
-
+	ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
 	if (-EIOCBQUEUED == ret)
 		ret = wait_on_sync_kiocb(&kiocb);
 	*ppos = kiocb.ki_pos;
@@ -395,13 +420,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
 	kiocb.ki_left = len;
 	kiocb.ki_nbytes = len;
 
-	for (;;) {
-		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
-		if (ret != -EIOCBRETRY)
-			break;
-		wait_on_retry_sync_kiocb(&kiocb);
-	}
-
+	ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
 	if (-EIOCBQUEUED == ret)
 		ret = wait_on_sync_kiocb(&kiocb);
 	*ppos = kiocb.ki_pos;
@@ -410,6 +429,33 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
 
 EXPORT_SYMBOL(do_sync_write);
 
+ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
+{
+	mm_segment_t old_fs;
+	const char __user *p;
+	ssize_t ret;
+
+	if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
+		return -EINVAL;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	p = (__force const char __user *)buf;
+	if (count > MAX_RW_COUNT)
+		count =  MAX_RW_COUNT;
+	if (file->f_op->write)
+		ret = file->f_op->write(file, p, count, pos);
+	else
+		ret = do_sync_write(file, p, count, pos);
+	set_fs(old_fs);
+	if (ret > 0) {
+		fsnotify_modify(file);
+		add_wchar(current, ret);
+	}
+	inc_syscw(current);
+	return ret;
+}
+
 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 {
 	ssize_t ret;
@@ -424,6 +470,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
 	ret = rw_verify_area(WRITE, file, pos, count);
 	if (ret >= 0) {
 		count = ret;
+		file_start_write(file);
 		if (file->f_op->write)
 			ret = file->f_op->write(file, buf, count, pos);
 		else
@@ -433,6 +480,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
 			add_wchar(current, ret);
 		}
 		inc_syscw(current);
+		file_end_write(file);
 	}
 
 	return ret;
@@ -458,7 +506,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 	if (f.file) {
 		loff_t pos = file_pos_read(f.file);
 		ret = vfs_read(f.file, buf, count, &pos);
-		file_pos_write(f.file, pos);
+		if (ret >= 0)
+			file_pos_write(f.file, pos);
 		fdput(f);
 	}
 	return ret;
@@ -473,15 +522,16 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 	if (f.file) {
 		loff_t pos = file_pos_read(f.file);
 		ret = vfs_write(f.file, buf, count, &pos);
-		file_pos_write(f.file, pos);
+		if (ret >= 0)
+			file_pos_write(f.file, pos);
 		fdput(f);
 	}
 
 	return ret;
 }
 
-SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
-			size_t count, loff_t pos)
+SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
+			size_t, count, loff_t, pos)
 {
 	struct fd f;
 	ssize_t ret = -EBADF;
@@ -499,17 +549,9 @@ SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
 
 	return ret;
 }
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
-{
-	return SYSC_pread64((unsigned int) fd, (char __user *) buf,
-			    (size_t) count, pos);
-}
-SYSCALL_ALIAS(sys_pread64, SyS_pread64);
-#endif
 
-SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
-			 size_t count, loff_t pos)
+SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
+			 size_t, count, loff_t, pos)
 {
 	struct fd f;
 	ssize_t ret = -EBADF;
@@ -527,14 +569,6 @@ SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
 
 	return ret;
 }
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
-{
-	return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
-			     (size_t) count, pos);
-}
-SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
-#endif
 
 /*
  * Reduce an iovec's length in-place.  Return the resulting number of segments
@@ -557,7 +591,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 }
 EXPORT_SYMBOL(iov_shorten);
 
-ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
+static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
 {
 	struct kiocb kiocb;
@@ -568,13 +602,7 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 	kiocb.ki_left = len;
 	kiocb.ki_nbytes = len;
 
-	for (;;) {
-		ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
-		if (ret != -EIOCBRETRY)
-			break;
-		wait_on_retry_sync_kiocb(&kiocb);
-	}
-
+	ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
 	if (ret == -EIOCBQUEUED)
 		ret = wait_on_sync_kiocb(&kiocb);
 	*ppos = kiocb.ki_pos;
@@ -582,7 +610,7 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 }
 
 /* Do it by hand, with file-ops */
-ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
+static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
 		unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
 {
 	struct iovec *vector = iov;
@@ -724,6 +752,7 @@ static ssize_t do_readv_writev(int type, struct file *file,
 	} else {
 		fn = (io_fn_t)file->f_op->write;
 		fnv = file->f_op->aio_write;
+		file_start_write(file);
 	}
 
 	if (fnv)
@@ -732,6 +761,9 @@ static ssize_t do_readv_writev(int type, struct file *file,
 	else
 		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 
+	if (type != READ)
+		file_end_write(file);
+
 out:
 	if (iov != iovstack)
 		kfree(iov);
@@ -779,7 +811,8 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 	if (f.file) {
 		loff_t pos = file_pos_read(f.file);
 		ret = vfs_readv(f.file, vec, vlen, &pos);
-		file_pos_write(f.file, pos);
+		if (ret >= 0)
+			file_pos_write(f.file, pos);
 		fdput(f);
 	}
 
@@ -798,7 +831,8 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 	if (f.file) {
 		loff_t pos = file_pos_read(f.file);
 		ret = vfs_writev(f.file, vec, vlen, &pos);
-		file_pos_write(f.file, pos);
+		if (ret >= 0)
+			file_pos_write(f.file, pos);
 		fdput(f);
 	}
 
@@ -862,12 +896,210 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 	return ret;
 }
 
-ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
-		    loff_t max)
+#ifdef CONFIG_COMPAT
+
+static ssize_t compat_do_readv_writev(int type, struct file *file,
+			       const struct compat_iovec __user *uvector,
+			       unsigned long nr_segs, loff_t *pos)
+{
+	compat_ssize_t tot_len;
+	struct iovec iovstack[UIO_FASTIOV];
+	struct iovec *iov = iovstack;
+	ssize_t ret;
+	io_fn_t fn;
+	iov_fn_t fnv;
+
+	ret = -EINVAL;
+	if (!file->f_op)
+		goto out;
+
+	ret = -EFAULT;
+	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
+		goto out;
+
+	ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
+					       UIO_FASTIOV, iovstack, &iov);
+	if (ret <= 0)
+		goto out;
+
+	tot_len = ret;
+	ret = rw_verify_area(type, file, pos, tot_len);
+	if (ret < 0)
+		goto out;
+
+	fnv = NULL;
+	if (type == READ) {
+		fn = file->f_op->read;
+		fnv = file->f_op->aio_read;
+	} else {
+		fn = (io_fn_t)file->f_op->write;
+		fnv = file->f_op->aio_write;
+		file_start_write(file);
+	}
+
+	if (fnv)
+		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
+						pos, fnv);
+	else
+		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+
+	if (type != READ)
+		file_end_write(file);
+
+out:
+	if (iov != iovstack)
+		kfree(iov);
+	if ((ret + (type == READ)) > 0) {
+		if (type == READ)
+			fsnotify_access(file);
+		else
+			fsnotify_modify(file);
+	}
+	return ret;
+}
+
+static size_t compat_readv(struct file *file,
+			   const struct compat_iovec __user *vec,
+			   unsigned long vlen, loff_t *pos)
+{
+	ssize_t ret = -EBADF;
+
+	if (!(file->f_mode & FMODE_READ))
+		goto out;
+
+	ret = -EINVAL;
+	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
+		goto out;
+
+	ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
+
+out:
+	if (ret > 0)
+		add_rchar(current, ret);
+	inc_syscr(current);
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
+		const struct compat_iovec __user *,vec,
+		unsigned long, vlen)
+{
+	struct fd f = fdget(fd);
+	ssize_t ret;
+	loff_t pos;
+
+	if (!f.file)
+		return -EBADF;
+	pos = f.file->f_pos;
+	ret = compat_readv(f.file, vec, vlen, &pos);
+	if (ret >= 0)
+		f.file->f_pos = pos;
+	fdput(f);
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
+		const struct compat_iovec __user *,vec,
+		unsigned long, vlen, loff_t, pos)
+{
+	struct fd f;
+	ssize_t ret;
+
+	if (pos < 0)
+		return -EINVAL;
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+	ret = -ESPIPE;
+	if (f.file->f_mode & FMODE_PREAD)
+		ret = compat_readv(f.file, vec, vlen, &pos);
+	fdput(f);
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
+		const struct compat_iovec __user *,vec,
+		unsigned long, vlen, u32, pos_low, u32, pos_high)
+{
+	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+	return compat_sys_preadv64(fd, vec, vlen, pos);
+}
+
+static size_t compat_writev(struct file *file,
+			    const struct compat_iovec __user *vec,
+			    unsigned long vlen, loff_t *pos)
+{
+	ssize_t ret = -EBADF;
+
+	if (!(file->f_mode & FMODE_WRITE))
+		goto out;
+
+	ret = -EINVAL;
+	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
+		goto out;
+
+	ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
+
+out:
+	if (ret > 0)
+		add_wchar(current, ret);
+	inc_syscw(current);
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
+		const struct compat_iovec __user *, vec,
+		unsigned long, vlen)
+{
+	struct fd f = fdget(fd);
+	ssize_t ret;
+	loff_t pos;
+
+	if (!f.file)
+		return -EBADF;
+	pos = f.file->f_pos;
+	ret = compat_writev(f.file, vec, vlen, &pos);
+	if (ret >= 0)
+		f.file->f_pos = pos;
+	fdput(f);
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
+		const struct compat_iovec __user *,vec,
+		unsigned long, vlen, loff_t, pos)
+{
+	struct fd f;
+	ssize_t ret;
+
+	if (pos < 0)
+		return -EINVAL;
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+	ret = -ESPIPE;
+	if (f.file->f_mode & FMODE_PWRITE)
+		ret = compat_writev(f.file, vec, vlen, &pos);
+	fdput(f);
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
+		const struct compat_iovec __user *,vec,
+		unsigned long, vlen, u32, pos_low, u32, pos_high)
+{
+	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+	return compat_sys_pwritev64(fd, vec, vlen, pos);
+}
+#endif
+
+static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
+		  	   size_t count, loff_t max)
 {
 	struct fd in, out;
 	struct inode *in_inode, *out_inode;
 	loff_t pos;
+	loff_t out_pos;
 	ssize_t retval;
 	int fl;
 
@@ -881,12 +1113,14 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
 	if (!(in.file->f_mode & FMODE_READ))
 		goto fput_in;
 	retval = -ESPIPE;
-	if (!ppos)
-		ppos = &in.file->f_pos;
-	else
+	if (!ppos) {
+		pos = in.file->f_pos;
+	} else {
+		pos = *ppos;
 		if (!(in.file->f_mode & FMODE_PREAD))
 			goto fput_in;
-	retval = rw_verify_area(READ, in.file, ppos, count);
+	}
+	retval = rw_verify_area(READ, in.file, &pos, count);
 	if (retval < 0)
 		goto fput_in;
 	count = retval;
@@ -901,9 +1135,10 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
 	if (!(out.file->f_mode & FMODE_WRITE))
 		goto fput_out;
 	retval = -EINVAL;
-	in_inode = in.file->f_path.dentry->d_inode;
-	out_inode = out.file->f_path.dentry->d_inode;
-	retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);
+	in_inode = file_inode(in.file);
+	out_inode = file_inode(out.file);
+	out_pos = out.file->f_pos;
+	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
 	if (retval < 0)
 		goto fput_out;
 	count = retval;
@@ -911,7 +1146,6 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
 	if (!max)
 		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
 
-	pos = *ppos;
 	if (unlikely(pos + count > max)) {
 		retval = -EOVERFLOW;
 		if (pos >= max)
@@ -930,18 +1164,25 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
 	if (in.file->f_flags & O_NONBLOCK)
 		fl = SPLICE_F_NONBLOCK;
 #endif
-	retval = do_splice_direct(in.file, ppos, out.file, count, fl);
+	file_start_write(out.file);
+	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
+	file_end_write(out.file);
 
 	if (retval > 0) {
 		add_rchar(current, retval);
 		add_wchar(current, retval);
 		fsnotify_access(in.file);
 		fsnotify_modify(out.file);
+		out.file->f_pos = out_pos;
+		if (ppos)
+			*ppos = pos;
+		else
+			in.file->f_pos = pos;
 	}
 
 	inc_syscr(current);
 	inc_syscw(current);
-	if (*ppos > max)
+	if (pos > max)
 		retval = -EOVERFLOW;
 
 fput_out:
@@ -987,3 +1228,43 @@ SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, si
 
 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
 }
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
+		compat_off_t __user *, offset, compat_size_t, count)
+{
+	loff_t pos;
+	off_t off;
+	ssize_t ret;
+
+	if (offset) {
+		if (unlikely(get_user(off, offset)))
+			return -EFAULT;
+		pos = off;
+		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
+		if (unlikely(put_user(pos, offset)))
+			return -EFAULT;
+		return ret;
+	}
+
+	return do_sendfile(out_fd, in_fd, NULL, count, 0);
+}
+
+COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
+		compat_loff_t __user *, offset, compat_size_t, count)
+{
+	loff_t pos;
+	ssize_t ret;
+
+	if (offset) {
+		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
+			return -EFAULT;
+		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
+		if (unlikely(put_user(pos, offset)))
+			return -EFAULT;
+		return ret;
+	}
+
+	return do_sendfile(out_fd, in_fd, NULL, count, 0);
+}
+#endif
diff --git a/fs/read_write.h b/fs/read_write.h
deleted file mode 100644
index d3e00ef6742..00000000000
--- a/fs/read_write.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * This file is only for sharing some helpers from read_write.c with compat.c.
- * Don't use anywhere else.
- */
-
-
-typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
-typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
-		unsigned long, loff_t);
-
-ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
-		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn);
-ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
-		unsigned long nr_segs, loff_t *ppos, io_fn_t fn);
-ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
-		    loff_t max);
diff --git a/fs/readdir.c b/fs/readdir.c
index 5e69ef533b7..93d71e57431 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -20,11 +20,11 @@
 
 #include <asm/uaccess.h>
 
-int vfs_readdir(struct file *file, filldir_t filler, void *buf)
+int iterate_dir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	int res = -ENOTDIR;
-	if (!file->f_op || !file->f_op->readdir)
+	if (!file->f_op || !file->f_op->iterate)
 		goto out;
 
 	res = security_file_permission(file, MAY_READ);
@@ -37,15 +37,16 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf)
 
 	res = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
-		res = file->f_op->readdir(file, buf, filler);
+		ctx->pos = file->f_pos;
+		res = file->f_op->iterate(file, ctx);
+		file->f_pos = ctx->pos;
 		file_accessed(file);
 	}
 	mutex_unlock(&inode->i_mutex);
 out:
 	return res;
 }
-
-EXPORT_SYMBOL(vfs_readdir);
+EXPORT_SYMBOL(iterate_dir);
 
 /*
  * Traditional linux readdir() handling..
@@ -66,6 +67,7 @@ struct old_linux_dirent {
 };
 
 struct readdir_callback {
+	struct dir_context ctx;
 	struct old_linux_dirent __user * dirent;
 	int result;
 };
@@ -73,7 +75,7 @@ struct readdir_callback {
 static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset,
 		      u64 ino, unsigned int d_type)
 {
-	struct readdir_callback * buf = (struct readdir_callback *) __buf;
+	struct readdir_callback *buf = (struct readdir_callback *) __buf;
 	struct old_linux_dirent __user * dirent;
 	unsigned long d_ino;
 
@@ -107,15 +109,15 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
 {
 	int error;
 	struct fd f = fdget(fd);
-	struct readdir_callback buf;
+	struct readdir_callback buf = {
+		.ctx.actor = fillonedir,
+		.dirent = dirent
+	};
 
 	if (!f.file)
 		return -EBADF;
 
-	buf.result = 0;
-	buf.dirent = dirent;
-
-	error = vfs_readdir(f.file, fillonedir, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (buf.result)
 		error = buf.result;
 
@@ -137,6 +139,7 @@ struct linux_dirent {
 };
 
 struct getdents_callback {
+	struct dir_context ctx;
 	struct linux_dirent __user * current_dir;
 	struct linux_dirent __user * previous;
 	int count;
@@ -191,7 +194,11 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 {
 	struct fd f;
 	struct linux_dirent __user * lastdirent;
-	struct getdents_callback buf;
+	struct getdents_callback buf = {
+		.ctx.actor = filldir,
+		.count = count,
+		.current_dir = dirent
+	};
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -201,17 +208,12 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 	if (!f.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(f.file, filldir, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		if (put_user(f.file->f_pos, &lastdirent->d_off))
+		if (put_user(buf.ctx.pos, &lastdirent->d_off))
 			error = -EFAULT;
 		else
 			error = count - buf.count;
@@ -221,6 +223,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 }
 
 struct getdents_callback64 {
+	struct dir_context ctx;
 	struct linux_dirent64 __user * current_dir;
 	struct linux_dirent64 __user * previous;
 	int count;
@@ -271,7 +274,11 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 {
 	struct fd f;
 	struct linux_dirent64 __user * lastdirent;
-	struct getdents_callback64 buf;
+	struct getdents_callback64 buf = {
+		.ctx.actor = filldir64,
+		.count = count,
+		.current_dir = dirent
+	};
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -281,17 +288,12 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 	if (!f.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(f.file, filldir64, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		typeof(lastdirent->d_off) d_off = f.file->f_pos;
+		typeof(lastdirent->d_off) d_off = buf.ctx.pos;
 		if (__put_user(d_off, &lastdirent->d_off))
 			error = -EFAULT;
 		else
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 66c53b642a8..03e4ca5624d 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -13,14 +13,14 @@
 
 extern const struct reiserfs_key MIN_KEY;
 
-static int reiserfs_readdir(struct file *, void *, filldir_t);
+static int reiserfs_readdir(struct file *, struct dir_context *);
 static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
 			      int datasync);
 
 const struct file_operations reiserfs_dir_operations = {
 	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
-	.readdir = reiserfs_readdir,
+	.iterate = reiserfs_readdir,
 	.fsync = reiserfs_dir_fsync,
 	.unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
@@ -50,18 +50,15 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
 
 #define store_ih(where,what) copy_item_head (where, what)
 
-static inline bool is_privroot_deh(struct dentry *dir,
-				   struct reiserfs_de_head *deh)
+static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh)
 {
-	struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
-	return (dir == dir->d_parent && privroot->d_inode &&
+	struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root;
+	return (privroot->d_inode &&
 	        deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
 }
 
-int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
-			   filldir_t filldir, loff_t *pos)
+int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
 {
-	struct inode *inode = dentry->d_inode;
 	struct cpu_key pos_key;	/* key of current position in the directory (key of directory entry) */
 	INITIALIZE_PATH(path_to_entry);
 	struct buffer_head *bh;
@@ -81,7 +78,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 
 	/* form key for search the next directory entry using f_pos field of
 	   file structure */
-	make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
+	make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
 	next_pos = cpu_key_k_offset(&pos_key);
 
 	path_to_entry.reada = PATH_READA;
@@ -126,7 +123,6 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 			     entry_num++, deh++) {
 				int d_reclen;
 				char *d_name;
-				off_t d_off;
 				ino_t d_ino;
 
 				if (!de_visible(deh))
@@ -155,11 +151,10 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 				}
 
 				/* Ignore the .reiserfs_priv entry */
-				if (is_privroot_deh(dentry, deh))
+				if (is_privroot_deh(inode, deh))
 					continue;
 
-				d_off = deh_offset(deh);
-				*pos = d_off;
+				ctx->pos = deh_offset(deh);
 				d_ino = deh_objectid(deh);
 				if (d_reclen <= 32) {
 					local_buf = small_buf;
@@ -187,9 +182,9 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 				 * the write lock here for other waiters
 				 */
 				reiserfs_write_unlock(inode->i_sb);
-				if (filldir
-				    (dirent, local_buf, d_reclen, d_off, d_ino,
-				     DT_UNKNOWN) < 0) {
+				if (!dir_emit
+				    (ctx, local_buf, d_reclen, d_ino,
+				     DT_UNKNOWN)) {
 					reiserfs_write_lock(inode->i_sb);
 					if (local_buf != small_buf) {
 						kfree(local_buf);
@@ -204,6 +199,8 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 				next_pos = deh_offset(deh) + 1;
 
 				if (item_moved(&tmp_ih, &path_to_entry)) {
+					set_cpu_key_k_offset(&pos_key,
+							     next_pos);
 					goto research;
 				}
 			}	/* for */
@@ -235,7 +232,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 	}			/* while */
 
 end:
-	*pos = next_pos;
+	ctx->pos = next_pos;
 	pathrelse(&path_to_entry);
 	reiserfs_check_path(&path_to_entry);
 out:
@@ -243,10 +240,9 @@ out:
 	return ret;
 }
 
-static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = file->f_path.dentry;
-	return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos);
+	return reiserfs_readdir_inode(file_inode(file), ctx);
 }
 
 /* compose directory item containing "." and ".." entries (entries are
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 50302d6f889..dcaafcfc23b 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -234,68 +234,9 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
 	return ret;
 }
 
-/* Write @count bytes at position @ppos in a file indicated by @file
-   from the buffer @buf.
-
-   generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
-   something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
-   written for (ext2/3).  This is for several reasons:
-
-   * It has no understanding of any filesystem specific optimizations.
-
-   * It enters the filesystem repeatedly for each page that is written.
-
-   * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key
-   * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
-   * to reiserfs which allows for fewer tree traversals.
-
-   * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.
-
-   * Asking the block allocation code for blocks one at a time is slightly less efficient.
-
-   All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
-   use it, but we were in a hurry to make code freeze, and so it couldn't be revised then.  This new code should make
-   things right finally.
-
-   Future Features: providing search_by_key with hints.
-
-*/
-static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going to write into */
-				   const char __user * buf,	/*  pointer to user supplied data
-								   (in userspace) */
-				   size_t count,	/* amount of bytes to write */
-				   loff_t * ppos	/* pointer to position in file that we start writing at. Should be updated to
-							 * new current position before returning. */
-				   )
-{
-	struct inode *inode = file->f_path.dentry->d_inode;	// Inode of the file that we are writing to.
-	/* To simplify coding at this time, we store
-	   locked pages in array for now */
-	struct reiserfs_transaction_handle th;
-	th.t_trans_id = 0;
-
-	/* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items
-	* lying around (most of the disk, in fact). Despite the filesystem
-	* now being a v3.6 format, the old items still can't support large
-	* file sizes. Catch this case here, as the rest of the VFS layer is
-	* oblivious to the different limitations between old and new items.
-	* reiserfs_setattr catches this for truncates. This chunk is lifted
-	* from generic_write_checks. */
-	if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 &&
-	    *ppos + count > MAX_NON_LFS) {
-		if (*ppos >= MAX_NON_LFS) {
-			return -EFBIG;
-		}
-		if (count > MAX_NON_LFS - (unsigned long)*ppos)
-			count = MAX_NON_LFS - (unsigned long)*ppos;
-	}
-
-	return do_sync_write(file, buf, count, ppos);
-}
-
 const struct file_operations reiserfs_file_operations = {
 	.read = do_sync_read,
-	.write = reiserfs_file_write,
+	.write = do_sync_write,
 	.unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = reiserfs_compat_ioctl,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 95d7680ead4..0048cc16a6a 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -18,6 +18,7 @@
 #include <linux/writeback.h>
 #include <linux/quotaops.h>
 #include <linux/swap.h>
+#include <linux/aio.h>
 
 int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to);
@@ -1603,10 +1604,10 @@ int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
 
 	if (parent && (maxlen < 5)) {
 		*lenp = 5;
-		return 255;
+		return FILEID_INVALID;
 	} else if (maxlen < 3) {
 		*lenp = 3;
-		return 255;
+		return FILEID_INVALID;
 	}
 
 	data[0] = inode->i_ino;
@@ -1810,11 +1811,16 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
 	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
 	args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
-	if (insert_inode_locked4(inode, args.objectid,
-			     reiserfs_find_actor, &args) < 0) {
+
+	reiserfs_write_unlock(inode->i_sb);
+	err = insert_inode_locked4(inode, args.objectid,
+			     reiserfs_find_actor, &args);
+	reiserfs_write_lock(inode->i_sb);
+	if (err) {
 		err = -EINVAL;
 		goto out_bad_inode;
 	}
+
 	if (old_format_only(sb))
 		/* not a perfect generation count, as object ids can be reused, but
 		 ** this is as good as reiserfs can do right now.
@@ -2969,16 +2975,19 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
 }
 
 /* clm -- taken from fs/buffer.c:block_invalidate_page */
-static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
+static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
+				    unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
 	struct inode *inode = page->mapping->host;
 	unsigned int curr_off = 0;
+	unsigned int stop = offset + length;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	int ret = 1;
 
 	BUG_ON(!PageLocked(page));
 
-	if (offset == 0)
+	if (!partial_page)
 		ClearPageChecked(page);
 
 	if (!page_has_buffers(page))
@@ -2990,6 +2999,9 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
 		unsigned int next_off = curr_off + bh->b_size;
 		next = bh->b_this_page;
 
+		if (next_off > stop)
+			goto out;
+
 		/*
 		 * is this block fully invalidated?
 		 */
@@ -3008,7 +3020,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
 	 * The get_block cached value has been unconditionally invalidated,
 	 * so real IO is not possible anymore.
 	 */
-	if (!offset && ret) {
+	if (!partial_page && ret) {
 		ret = try_to_release_page(page, 0);
 		/* maybe should BUG_ON(!ret); - neilb */
 	}
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 0c2185042d5..15cb5fe6b42 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -21,7 +21,7 @@
  */
 long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	unsigned int flags;
 	int err = 0;
 
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index afcadcc03e8..742fdd4c209 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -97,7 +97,7 @@ static int flush_commit_list(struct super_block *s,
 static int can_dirty(struct reiserfs_journal_cnode *cn);
 static int journal_join(struct reiserfs_transaction_handle *th,
 			struct super_block *sb, unsigned long nblocks);
-static int release_journal_dev(struct super_block *super,
+static void release_journal_dev(struct super_block *super,
 			       struct reiserfs_journal *journal);
 static int dirty_one_transaction(struct super_block *s,
 				 struct reiserfs_journal_list *jl);
@@ -2532,23 +2532,13 @@ static void journal_list_init(struct super_block *sb)
 	SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb);
 }
 
-static int release_journal_dev(struct super_block *super,
+static void release_journal_dev(struct super_block *super,
 			       struct reiserfs_journal *journal)
 {
-	int result;
-
-	result = 0;
-
 	if (journal->j_dev_bd != NULL) {
-		result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
+		blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
 		journal->j_dev_bd = NULL;
 	}
-
-	if (result != 0) {
-		reiserfs_warning(super, "sh-457",
-				 "Cannot release journal device: %i", result);
-	}
-	return result;
 }
 
 static int journal_init_dev(struct super_block *super,
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index e60e87035bb..33532f79b4f 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -281,7 +281,7 @@ static int show_oidmap(struct seq_file *m, struct super_block *sb)
 	}
 #if defined( REISERFS_USE_OIDMAPF )
 	if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) {
-		loff_t size = sb_info->oidmap.mapf->f_path.dentry->d_inode->i_size;
+		loff_t size = file_inode(sb_info->oidmap.mapf)->i_size;
 		total_used += size / sizeof(reiserfs_oidinterval_d_t);
 	}
 #endif
@@ -394,20 +394,24 @@ static int set_sb(struct super_block *sb, void *data)
 	return -ENOENT;
 }
 
+struct reiserfs_seq_private {
+	struct super_block *sb;
+	int (*show) (struct seq_file *, struct super_block *);
+};
+
 static void *r_start(struct seq_file *m, loff_t * pos)
 {
-	struct proc_dir_entry *de = m->private;
-	struct super_block *s = de->parent->data;
+	struct reiserfs_seq_private *priv = m->private;
 	loff_t l = *pos;
 
 	if (l)
 		return NULL;
 
-	if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, s)))
+	if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, priv->sb)))
 		return NULL;
 
-	up_write(&s->s_umount);
-	return s;
+	up_write(&priv->sb->s_umount);
+	return priv->sb;
 }
 
 static void *r_next(struct seq_file *m, void *v, loff_t * pos)
@@ -426,9 +430,8 @@ static void r_stop(struct seq_file *m, void *v)
 
 static int r_show(struct seq_file *m, void *v)
 {
-	struct proc_dir_entry *de = m->private;
-	int (*show) (struct seq_file *, struct super_block *) = de->data;
-	return show(m, v);
+	struct reiserfs_seq_private *priv = m->private;
+	return priv->show(m, v);
 }
 
 static const struct seq_operations r_ops = {
@@ -440,11 +443,15 @@ static const struct seq_operations r_ops = {
 
 static int r_open(struct inode *inode, struct file *file)
 {
-	int ret = seq_open(file, &r_ops);
+	struct reiserfs_seq_private *priv;
+	int ret = seq_open_private(file, &r_ops,
+				   sizeof(struct reiserfs_seq_private));
 
 	if (!ret) {
 		struct seq_file *m = file->private_data;
-		m->private = PDE(inode);
+		priv = m->private;
+		priv->sb = proc_get_parent_data(inode);
+		priv->show = PDE_DATA(inode);
 	}
 	return ret;
 }
@@ -453,7 +460,7 @@ static const struct file_operations r_file_operations = {
 	.open = r_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
-	.release = seq_release,
+	.release = seq_release_private,
 	.owner = THIS_MODULE,
 };
 
@@ -479,9 +486,8 @@ int reiserfs_proc_info_init(struct super_block *sb)
 		*s = '!';
 
 	spin_lock_init(&__PINFO(sb).lock);
-	REISERFS_SB(sb)->procdir = proc_mkdir(b, proc_info_root);
+	REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb);
 	if (REISERFS_SB(sb)->procdir) {
-		REISERFS_SB(sb)->procdir->data = sb;
 		add_file(sb, "version", show_version);
 		add_file(sb, "super", show_super);
 		add_file(sb, "per-level", show_per_level);
@@ -499,29 +505,17 @@ int reiserfs_proc_info_init(struct super_block *sb)
 int reiserfs_proc_info_done(struct super_block *sb)
 {
 	struct proc_dir_entry *de = REISERFS_SB(sb)->procdir;
-	char b[BDEVNAME_SIZE];
-	char *s;
+	if (de) {
+		char b[BDEVNAME_SIZE];
+		char *s;
 
-	/* Some block devices use /'s */
-	strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE);
-	s = strchr(b, '/');
-	if (s)
-		*s = '!';
+		/* Some block devices use /'s */
+		strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE);
+		s = strchr(b, '/');
+		if (s)
+			*s = '!';
 
-	if (de) {
-		remove_proc_entry("journal", de);
-		remove_proc_entry("oidmap", de);
-		remove_proc_entry("on-disk-super", de);
-		remove_proc_entry("bitmap", de);
-		remove_proc_entry("per-level", de);
-		remove_proc_entry("super", de);
-		remove_proc_entry("version", de);
-	}
-	spin_lock(&__PINFO(sb).lock);
-	__PINFO(sb).exiting = 1;
-	spin_unlock(&__PINFO(sb).lock);
-	if (proc_info_root) {
-		remove_proc_entry(b, proc_info_root);
+		remove_proc_subtree(b, proc_info_root);
 		REISERFS_SB(sb)->procdir = NULL;
 	}
 	return 0;
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 157e474ab30..3df5ce6c724 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2709,7 +2709,7 @@ extern const struct inode_operations reiserfs_dir_inode_operations;
 extern const struct inode_operations reiserfs_symlink_inode_operations;
 extern const struct inode_operations reiserfs_special_inode_operations;
 extern const struct file_operations reiserfs_dir_operations;
-int reiserfs_readdir_dentry(struct dentry *, void *, filldir_t, loff_t *);
+int reiserfs_readdir_inode(struct inode *, struct dir_context *);
 
 /* tail_conversion.c */
 int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 418bdc3a57d..f8a23c3078f 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1147,8 +1147,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 							 "on filesystem root.");
 					return 0;
 				}
-				qf_names[qtype] =
-				    kmalloc(strlen(arg) + 1, GFP_KERNEL);
+				qf_names[qtype] = kstrdup(arg, GFP_KERNEL);
 				if (!qf_names[qtype]) {
 					reiserfs_warning(s, "reiserfs-2502",
 							 "not enough memory "
@@ -1156,7 +1155,6 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 							 "quotafile name.");
 					return 0;
 				}
-				strcpy(qf_names[qtype], arg);
 				if (qtype == USRQUOTA)
 					*mount_options |= 1 << REISERFS_USRQUOTA;
 				else
@@ -2434,6 +2432,7 @@ struct file_system_type reiserfs_fs_type = {
 	.kill_sb = reiserfs_kill_sb,
 	.fs_flags = FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("reiserfs");
 
 MODULE_DESCRIPTION("ReiserFS journaled filesystem");
 MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>");
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index c196369fe40..c69cdd749f0 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -171,6 +171,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
  * modifying extended attributes. This includes operations such as permissions
  * or ownership changes, object deletions, etc. */
 struct reiserfs_dentry_buf {
+	struct dir_context ctx;
 	struct dentry *xadir;
 	int count;
 	struct dentry *dentries[8];
@@ -187,8 +188,8 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset,
 	if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
 		return -ENOSPC;
 
-	if (name[0] == '.' && (name[1] == '\0' ||
-			       (name[1] == '.' && name[2] == '\0')))
+	if (name[0] == '.' && (namelen < 2 ||
+			       (namelen == 2 && name[1] == '.')))
 		return 0;
 
 	dentry = lookup_one_len(name, dbuf->xadir, namelen);
@@ -223,9 +224,8 @@ static int reiserfs_for_each_xattr(struct inode *inode,
 {
 	struct dentry *dir;
 	int i, err = 0;
-	loff_t pos = 0;
 	struct reiserfs_dentry_buf buf = {
-		.count = 0,
+		.ctx.actor = fill_with_dentries,
 	};
 
 	/* Skip out, an xattr has no xattrs associated with it */
@@ -249,29 +249,27 @@ static int reiserfs_for_each_xattr(struct inode *inode,
 	reiserfs_write_lock(inode->i_sb);
 
 	buf.xadir = dir;
-	err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos);
-	while ((err == 0 || err == -ENOSPC) && buf.count) {
-		err = 0;
-
-		for (i = 0; i < buf.count && buf.dentries[i]; i++) {
-			int lerr = 0;
+	while (1) {
+		err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
+		if (err)
+			break;
+		if (!buf.count)
+			break;
+		for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
 			struct dentry *dentry = buf.dentries[i];
 
-			if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode))
-				lerr = action(dentry, data);
+			if (!S_ISDIR(dentry->d_inode->i_mode))
+				err = action(dentry, data);
 
 			dput(dentry);
 			buf.dentries[i] = NULL;
-			err = lerr ?: err;
 		}
+		if (err)
+			break;
 		buf.count = 0;
-		if (!err)
-			err = reiserfs_readdir_dentry(dir, &buf,
-						      fill_with_dentries, &pos);
 	}
 	mutex_unlock(&dir->d_inode->i_mutex);
 
-	/* Clean up after a failed readdir */
 	cleanup_dentry_buf(&buf);
 
 	if (!err) {
@@ -318,7 +316,19 @@ static int delete_one_xattr(struct dentry *dentry, void *data)
 static int chown_one_xattr(struct dentry *dentry, void *data)
 {
 	struct iattr *attrs = data;
-	return reiserfs_setattr(dentry, attrs);
+	int ia_valid = attrs->ia_valid;
+	int err;
+
+	/*
+	 * We only want the ownership bits. Otherwise, we'll do
+	 * things like change a directory to a regular file if
+	 * ATTR_MODE is set.
+	 */
+	attrs->ia_valid &= (ATTR_UID|ATTR_GID);
+	err = reiserfs_setattr(dentry, attrs);
+	attrs->ia_valid = ia_valid;
+
+	return err;
 }
 
 /* No i_mutex, but the inode is unconnected. */
@@ -788,6 +798,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name)
 }
 
 struct listxattr_buf {
+	struct dir_context ctx;
 	size_t size;
 	size_t pos;
 	char *buf;
@@ -833,8 +844,8 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 {
 	struct dentry *dir;
 	int err = 0;
-	loff_t pos = 0;
 	struct listxattr_buf buf = {
+		.ctx.actor = listxattr_filler,
 		.dentry = dentry,
 		.buf = buffer,
 		.size = buffer ? size : 0,
@@ -856,7 +867,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 	}
 
 	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
-	err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos);
+	err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
 	mutex_unlock(&dir->d_inode->i_mutex);
 
 	if (!err)
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index d7c01ef64ed..6c8767fdfc6 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -443,6 +443,9 @@ int reiserfs_acl_chmod(struct inode *inode)
 	int depth;
 	int error;
 
+	if (IS_PRIVATE(inode))
+		return 0;
+
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
 
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index e1a7779dd3c..f373bde8f54 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -49,8 +49,11 @@ static unsigned long romfs_get_unmapped_area(struct file *file,
 		return (unsigned long) -EINVAL;
 
 	offset += ROMFS_I(inode)->i_dataoffset;
-	if (offset > mtd->size - len)
+	if (offset >= mtd->size)
 		return (unsigned long) -EINVAL;
+	/* the mapping mustn't extend beyond the EOF */
+	if ((offset + len) > mtd->size)
+		len = mtd->size - offset;
 
 	ret = mtd_get_unmapped_area(mtd, len, offset, flags);
 	if (ret == -EOPNOTSUPP)
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index fd7c5f60b46..ff1d3d42e72 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -145,19 +145,18 @@ static const struct address_space_operations romfs_aops = {
 /*
  * read the entries from a directory
  */
-static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int romfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *i = filp->f_dentry->d_inode;
+	struct inode *i = file_inode(file);
 	struct romfs_inode ri;
 	unsigned long offset, maxoff;
 	int j, ino, nextfh;
-	int stored = 0;
 	char fsname[ROMFS_MAXFN];	/* XXX dynamic? */
 	int ret;
 
 	maxoff = romfs_maxsize(i->i_sb);
 
-	offset = filp->f_pos;
+	offset = ctx->pos;
 	if (!offset) {
 		offset = i->i_ino & ROMFH_MASK;
 		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
@@ -170,10 +169,10 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	for (;;) {
 		if (!offset || offset >= maxoff) {
 			offset = maxoff;
-			filp->f_pos = offset;
+			ctx->pos = offset;
 			goto out;
 		}
-		filp->f_pos = offset;
+		ctx->pos = offset;
 
 		/* Fetch inode info */
 		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
@@ -194,16 +193,14 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		nextfh = be32_to_cpu(ri.next);
 		if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
 			ino = be32_to_cpu(ri.spec);
-		if (filldir(dirent, fsname, j, offset, ino,
-			    romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
+		if (!dir_emit(ctx, fsname, j, ino,
+			    romfs_dtype_table[nextfh & ROMFH_TYPE]))
 			goto out;
 
-		stored++;
 		offset = nextfh & ROMFH_MASK;
 	}
-
 out:
-	return stored;
+	return 0;
 }
 
 /*
@@ -281,7 +278,7 @@ error:
 
 static const struct file_operations romfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= romfs_readdir,
+	.iterate	= romfs_readdir,
 	.llseek		= default_llseek,
 };
 
@@ -599,6 +596,7 @@ static struct file_system_type romfs_fs_type = {
 	.kill_sb	= romfs_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("romfs");
 
 /*
  * inode storage initialiser
diff --git a/fs/select.c b/fs/select.c
index 2ef72d96503..f9f49c40cfd 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -26,6 +26,9 @@
 #include <linux/fs.h>
 #include <linux/rcupdate.h>
 #include <linux/hrtimer.h>
+#include <linux/sched/rt.h>
+#include <linux/freezer.h>
+#include <net/ll_poll.h>
 
 #include <asm/uaccess.h>
 
@@ -235,7 +238,8 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 
 	set_current_state(state);
 	if (!pwq->triggered)
-		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+		rc = freezable_schedule_hrtimeout_range(expires, slack,
+							HRTIMER_MODE_ABS);
 	__set_current_state(TASK_RUNNING);
 
 	/*
@@ -383,9 +387,10 @@ get_max:
 #define POLLEX_SET (POLLPRI)
 
 static inline void wait_key_set(poll_table *wait, unsigned long in,
-				unsigned long out, unsigned long bit)
+				unsigned long out, unsigned long bit,
+				unsigned int ll_flag)
 {
-	wait->_key = POLLEX_SET;
+	wait->_key = POLLEX_SET | ll_flag;
 	if (in & bit)
 		wait->_key |= POLLIN_SET;
 	if (out & bit)
@@ -399,6 +404,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	poll_table *wait;
 	int retval, i, timed_out = 0;
 	unsigned long slack = 0;
+	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
+	unsigned long busy_end = 0;
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -421,6 +428,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	retval = 0;
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
+		bool can_busy_loop = false;
 
 		inp = fds->in; outp = fds->out; exp = fds->ex;
 		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -448,7 +456,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 					f_op = f.file->f_op;
 					mask = DEFAULT_POLLMASK;
 					if (f_op && f_op->poll) {
-						wait_key_set(wait, in, out, bit);
+						wait_key_set(wait, in, out,
+							     bit, busy_flag);
 						mask = (*f_op->poll)(f.file, wait);
 					}
 					fdput(f);
@@ -467,6 +476,18 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 						retval++;
 						wait->_qproc = NULL;
 					}
+					/* got something, stop busy polling */
+					if (retval) {
+						can_busy_loop = false;
+						busy_flag = 0;
+
+					/*
+					 * only remember a returned
+					 * POLL_BUSY_LOOP if we asked for it
+					 */
+					} else if (busy_flag & mask)
+						can_busy_loop = true;
+
 				}
 			}
 			if (res_in)
@@ -485,6 +506,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 			break;
 		}
 
+		/* only if found POLL_BUSY_LOOP sockets && not out of time */
+		if (can_busy_loop && !need_resched()) {
+			if (!busy_end) {
+				busy_end = busy_loop_end_time();
+				continue;
+			}
+			if (!busy_loop_timeout(busy_end))
+				continue;
+		}
+		busy_flag = 0;
+
 		/*
 		 * If this is the first loop and we have a timeout
 		 * given, then we convert to ktime_t and set the to
@@ -716,7 +748,9 @@ struct poll_list {
  * pwait poll_table will be used by the fd-provided poll handler for waiting,
  * if pwait->_qproc is non-NULL.
  */
-static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
+static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
+				     bool *can_busy_poll,
+				     unsigned int busy_flag)
 {
 	unsigned int mask;
 	int fd;
@@ -730,7 +764,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 			mask = DEFAULT_POLLMASK;
 			if (f.file->f_op && f.file->f_op->poll) {
 				pwait->_key = pollfd->events|POLLERR|POLLHUP;
+				pwait->_key |= busy_flag;
 				mask = f.file->f_op->poll(f.file, pwait);
+				if (mask & busy_flag)
+					*can_busy_poll = true;
 			}
 			/* Mask out unneeded events. */
 			mask &= pollfd->events | POLLERR | POLLHUP;
@@ -749,6 +786,8 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 	ktime_t expire, *to = NULL;
 	int timed_out = 0, count = 0;
 	unsigned long slack = 0;
+	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
+	unsigned long busy_end = 0;
 
 	/* Optimise the no-wait case */
 	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -761,6 +800,7 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 
 	for (;;) {
 		struct poll_list *walk;
+		bool can_busy_loop = false;
 
 		for (walk = list; walk != NULL; walk = walk->next) {
 			struct pollfd * pfd, * pfd_end;
@@ -775,9 +815,13 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 				 * this. They'll get immediately deregistered
 				 * when we break out and return.
 				 */
-				if (do_pollfd(pfd, pt)) {
+				if (do_pollfd(pfd, pt, &can_busy_loop,
+					      busy_flag)) {
 					count++;
 					pt->_qproc = NULL;
+					/* found something, stop busy polling */
+					busy_flag = 0;
+					can_busy_loop = false;
 				}
 			}
 		}
@@ -794,6 +838,17 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 		if (count || timed_out)
 			break;
 
+		/* only if found POLL_BUSY_LOOP sockets && not out of time */
+		if (can_busy_loop && !need_resched()) {
+			if (!busy_end) {
+				busy_end = busy_loop_end_time();
+				continue;
+			}
+			if (!busy_loop_timeout(busy_end))
+				continue;
+		}
+		busy_flag = 0;
+
 		/*
 		 * If this is the first loop and we have a timeout
 		 * given, then we convert to ktime_t and set the to
diff --git a/fs/seq_file.c b/fs/seq_file.c
index f2bc3dfd0b8..3135c2525c7 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -308,27 +308,27 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)
 	mutex_lock(&m->lock);
 	m->version = file->f_version;
 	switch (whence) {
-		case 1:
-			offset += file->f_pos;
-		case 0:
-			if (offset < 0)
-				break;
-			retval = offset;
-			if (offset != m->read_pos) {
-				while ((retval=traverse(m, offset)) == -EAGAIN)
-					;
-				if (retval) {
-					/* with extreme prejudice... */
-					file->f_pos = 0;
-					m->read_pos = 0;
-					m->version = 0;
-					m->index = 0;
-					m->count = 0;
-				} else {
-					m->read_pos = offset;
-					retval = file->f_pos = offset;
-				}
+	case SEEK_CUR:
+		offset += file->f_pos;
+	case SEEK_SET:
+		if (offset < 0)
+			break;
+		retval = offset;
+		if (offset != m->read_pos) {
+			while ((retval = traverse(m, offset)) == -EAGAIN)
+				;
+			if (retval) {
+				/* with extreme prejudice... */
+				file->f_pos = 0;
+				m->read_pos = 0;
+				m->version = 0;
+				m->index = 0;
+				m->count = 0;
+			} else {
+				m->read_pos = offset;
+				retval = file->f_pos = offset;
 			}
+		}
 	}
 	file->f_version = m->version;
 	mutex_unlock(&m->lock);
@@ -339,7 +339,7 @@ EXPORT_SYMBOL(seq_lseek);
 /**
  *	seq_release -	free the structures associated with sequential file.
  *	@file: file in question
- *	@inode: file->f_path.dentry->d_inode
+ *	@inode: its inode
  *
  *	Frees the structures associated with sequential file; can be used
  *	as ->f_op->release() if you don't have private data to destroy.
@@ -599,6 +599,24 @@ int single_open(struct file *file, int (*show)(struct seq_file *, void *),
 }
 EXPORT_SYMBOL(single_open);
 
+int single_open_size(struct file *file, int (*show)(struct seq_file *, void *),
+		void *data, size_t size)
+{
+	char *buf = kmalloc(size, GFP_KERNEL);
+	int ret;
+	if (!buf)
+		return -ENOMEM;
+	ret = single_open(file, show, data);
+	if (ret) {
+		kfree(buf);
+		return ret;
+	}
+	((struct seq_file *)file->private_data)->buf = buf;
+	((struct seq_file *)file->private_data)->size = size;
+	return 0;
+}
+EXPORT_SYMBOL(single_open_size);
+
 int single_release(struct inode *inode, struct file *file)
 {
 	const struct seq_operations *op = ((struct seq_file *)file->private_data)->op;
@@ -903,3 +921,57 @@ struct hlist_node *seq_hlist_next_rcu(void *v,
 		return rcu_dereference(node->next);
 }
 EXPORT_SYMBOL(seq_hlist_next_rcu);
+
+/**
+ * seq_hlist_start_precpu - start an iteration of a percpu hlist array
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu:  pointer to cpu "cursor"
+ * @pos:  start position of sequence
+ *
+ * Called at seq_file->op->start().
+ */
+struct hlist_node *
+seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos)
+{
+	struct hlist_node *node;
+
+	for_each_possible_cpu(*cpu) {
+		hlist_for_each(node, per_cpu_ptr(head, *cpu)) {
+			if (pos-- == 0)
+				return node;
+		}
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start_percpu);
+
+/**
+ * seq_hlist_next_percpu - move to the next position of the percpu hlist array
+ * @v:    pointer to current hlist_node
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu:  pointer to cpu "cursor"
+ * @pos:  start position of sequence
+ *
+ * Called at seq_file->op->next().
+ */
+struct hlist_node *
+seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
+			int *cpu, loff_t *pos)
+{
+	struct hlist_node *node = v;
+
+	++*pos;
+
+	if (node->next)
+		return node->next;
+
+	for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids;
+	     *cpu = cpumask_next(*cpu, cpu_possible_mask)) {
+		struct hlist_head *bucket = per_cpu_ptr(head, *cpu);
+
+		if (!hlist_empty(bucket))
+			return bucket->first;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_next_percpu);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index b5348696173..424b7b65321 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -30,6 +30,7 @@
 #include <linux/signalfd.h>
 #include <linux/syscalls.h>
 #include <linux/proc_fs.h>
+#include <linux/compat.h>
 
 void signalfd_cleanup(struct sighand_struct *sighand)
 {
@@ -311,3 +312,33 @@ SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
 {
 	return sys_signalfd4(ufd, user_mask, sizemask, 0);
 }
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd,
+		     const compat_sigset_t __user *,sigmask,
+		     compat_size_t, sigsetsize,
+		     int, flags)
+{
+	compat_sigset_t ss32;
+	sigset_t tmp;
+	sigset_t __user *ksigmask;
+
+	if (sigsetsize != sizeof(compat_sigset_t))
+		return -EINVAL;
+	if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+		return -EFAULT;
+	sigset_from_compat(&tmp, &ss32);
+	ksigmask = compat_alloc_user_space(sizeof(sigset_t));
+	if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
+		return -EFAULT;
+
+	return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags);
+}
+
+COMPAT_SYSCALL_DEFINE3(signalfd, int, ufd,
+		     const compat_sigset_t __user *,sigmask,
+		     compat_size_t, sigsetsize)
+{
+	return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0);
+}
+#endif
diff --git a/fs/splice.c b/fs/splice.c
index 6909d89d0da..3b7ee656f3a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -31,6 +31,8 @@
 #include <linux/security.h>
 #include <linux/gfp.h>
 #include <linux/socket.h>
+#include <linux/compat.h>
+#include "internal.h"
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -217,7 +219,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 			page_nr++;
 			ret += buf->len;
 
-			if (pipe->inode)
+			if (pipe->files)
 				do_wakeup = 1;
 
 			if (!--spd->nr_pages)
@@ -569,7 +571,7 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
 	return res;
 }
 
-static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
+ssize_t kernel_write(struct file *file, const char *buf, size_t count,
 			    loff_t pos)
 {
 	mm_segment_t old_fs;
@@ -578,11 +580,12 @@ static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
 	old_fs = get_fs();
 	set_fs(get_ds());
 	/* The cast to a user pointer is valid due to the set_fs() */
-	res = vfs_write(file, (const char __user *)buf, count, &pos);
+	res = vfs_write(file, (__force const char __user *)buf, count, &pos);
 	set_fs(old_fs);
 
 	return res;
 }
+EXPORT_SYMBOL(kernel_write);
 
 ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 				 struct pipe_inode_info *pipe, size_t len,
@@ -826,7 +829,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 			ops->release(pipe, buf);
 			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 			pipe->nrbufs--;
-			if (pipe->inode)
+			if (pipe->files)
 				sd->need_wakeup = true;
 		}
 
@@ -998,8 +1001,6 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	};
 	ssize_t ret;
 
-	sb_start_write(inode->i_sb);
-
 	pipe_lock(pipe);
 
 	splice_from_pipe_begin(&sd);
@@ -1035,7 +1036,6 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 			*ppos += ret;
 		balance_dirty_pages_ratelimited(mapping);
 	}
-	sb_end_write(inode->i_sb);
 
 	return ret;
 }
@@ -1047,9 +1047,10 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 {
 	int ret;
 	void *data;
+	loff_t tmp = sd->pos;
 
 	data = buf->ops->map(pipe, buf, 0);
-	ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
+	ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
 	buf->ops->unmap(pipe, buf, data);
 
 	return ret;
@@ -1097,17 +1098,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 {
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
 				loff_t *, size_t, unsigned int);
-	int ret;
-
-	if (unlikely(!(out->f_mode & FMODE_WRITE)))
-		return -EBADF;
-
-	if (unlikely(out->f_flags & O_APPEND))
-		return -EINVAL;
-
-	ret = rw_verify_area(WRITE, out, ppos, len);
-	if (unlikely(ret < 0))
-		return ret;
 
 	if (out->f_op && out->f_op->splice_write)
 		splice_write = out->f_op->splice_write;
@@ -1170,7 +1160,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 	 * randomly drop data for eg socket -> socket splicing. Use the
 	 * piped splicing for that!
 	 */
-	i_mode = in->f_path.dentry->d_inode->i_mode;
+	i_mode = file_inode(in)->i_mode;
 	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
 		return -EINVAL;
 
@@ -1180,7 +1170,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 	 */
 	pipe = current->splice_pipe;
 	if (unlikely(!pipe)) {
-		pipe = alloc_pipe_info(NULL);
+		pipe = alloc_pipe_info();
 		if (!pipe)
 			return -ENOMEM;
 
@@ -1270,7 +1260,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
 {
 	struct file *file = sd->u.file;
 
-	return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
+	return do_splice_from(pipe, file, sd->opos, sd->total_len,
 			      sd->flags);
 }
 
@@ -1279,6 +1269,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
  * @in:		file to splice from
  * @ppos:	input file offset
  * @out:	file to splice to
+ * @opos:	output file offset
  * @len:	number of bytes to splice
  * @flags:	splice modifier flags
  *
@@ -1290,7 +1281,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
  *
  */
 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
-		      size_t len, unsigned int flags)
+		      loff_t *opos, size_t len, unsigned int flags)
 {
 	struct splice_desc sd = {
 		.len		= len,
@@ -1298,9 +1289,20 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 		.flags		= flags,
 		.pos		= *ppos,
 		.u.file		= out,
+		.opos		= opos,
 	};
 	long ret;
 
+	if (unlikely(!(out->f_mode & FMODE_WRITE)))
+		return -EBADF;
+
+	if (unlikely(out->f_flags & O_APPEND))
+		return -EINVAL;
+
+	ret = rw_verify_area(WRITE, out, opos, len);
+	if (unlikely(ret < 0))
+		return ret;
+
 	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
 	if (ret > 0)
 		*ppos = sd.pos;
@@ -1321,7 +1323,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 {
 	struct pipe_inode_info *ipipe;
 	struct pipe_inode_info *opipe;
-	loff_t offset, *off;
+	loff_t offset;
 	long ret;
 
 	ipipe = get_pipe_info(in);
@@ -1352,13 +1354,27 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 				return -EINVAL;
 			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
 				return -EFAULT;
-			off = &offset;
-		} else
-			off = &out->f_pos;
+		} else {
+			offset = out->f_pos;
+		}
+
+		if (unlikely(!(out->f_mode & FMODE_WRITE)))
+			return -EBADF;
+
+		if (unlikely(out->f_flags & O_APPEND))
+			return -EINVAL;
 
-		ret = do_splice_from(ipipe, out, off, len, flags);
+		ret = rw_verify_area(WRITE, out, &offset, len);
+		if (unlikely(ret < 0))
+			return ret;
+
+		file_start_write(out);
+		ret = do_splice_from(ipipe, out, &offset, len, flags);
+		file_end_write(out);
 
-		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
+		if (!off_out)
+			out->f_pos = offset;
+		else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
 			ret = -EFAULT;
 
 		return ret;
@@ -1372,13 +1388,15 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 				return -EINVAL;
 			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
 				return -EFAULT;
-			off = &offset;
-		} else
-			off = &in->f_pos;
+		} else {
+			offset = in->f_pos;
+		}
 
-		ret = do_splice_to(in, off, opipe, len, flags);
+		ret = do_splice_to(in, &offset, opipe, len, flags);
 
-		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
+		if (!off_in)
+			in->f_pos = offset;
+		else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
 			ret = -EFAULT;
 
 		return ret;
@@ -1687,6 +1705,27 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
 	return error;
 }
 
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
+		    unsigned int, nr_segs, unsigned int, flags)
+{
+	unsigned i;
+	struct iovec __user *iov;
+	if (nr_segs > UIO_MAXIOV)
+		return -EINVAL;
+	iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
+	for (i = 0; i < nr_segs; i++) {
+		struct compat_iovec v;
+		if (get_user(v.iov_base, &iov32[i].iov_base) ||
+		    get_user(v.iov_len, &iov32[i].iov_len) ||
+		    put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
+		    put_user(v.iov_len, &iov[i].iov_len))
+			return -EFAULT;
+	}
+	return sys_vmsplice(fd, iov, nr_segs, flags);
+}
+#endif
+
 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
 		int, fd_out, loff_t __user *, off_out,
 		size_t, len, unsigned int, flags)
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index b381305c9a4..f7f527bf8c1 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -100,9 +100,9 @@ static int get_dir_index_using_offset(struct super_block *sb,
 }
 
 
-static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int squashfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
 	u64 block = squashfs_i(inode)->start + msblk->directory_table;
 	int offset = squashfs_i(inode)->offset, length, dir_count, size,
@@ -127,11 +127,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 	 * It also means that the external f_pos is offset by 3 from the
 	 * on-disk directory f_pos.
 	 */
-	while (file->f_pos < 3) {
+	while (ctx->pos < 3) {
 		char *name;
 		int i_ino;
 
-		if (file->f_pos == 0) {
+		if (ctx->pos == 0) {
 			name = ".";
 			size = 1;
 			i_ino = inode->i_ino;
@@ -141,24 +141,18 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 			i_ino = squashfs_i(inode)->parent;
 		}
 
-		TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n",
-				dirent, name, size, file->f_pos, i_ino,
-				squashfs_filetype_table[1]);
-
-		if (filldir(dirent, name, size, file->f_pos, i_ino,
-				squashfs_filetype_table[1]) < 0) {
-				TRACE("Filldir returned less than 0\n");
+		if (!dir_emit(ctx, name, size, i_ino,
+				squashfs_filetype_table[1]))
 			goto finish;
-		}
 
-		file->f_pos += size;
+		ctx->pos += size;
 	}
 
 	length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
 				squashfs_i(inode)->dir_idx_start,
 				squashfs_i(inode)->dir_idx_offset,
 				squashfs_i(inode)->dir_idx_cnt,
-				file->f_pos);
+				ctx->pos);
 
 	while (length < i_size_read(inode)) {
 		/*
@@ -198,7 +192,7 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 
 			length += sizeof(*dire) + size;
 
-			if (file->f_pos >= length)
+			if (ctx->pos >= length)
 				continue;
 
 			dire->name[size] = '\0';
@@ -206,22 +200,12 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 				((short) le16_to_cpu(dire->inode_number));
 			type = le16_to_cpu(dire->type);
 
-			TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)"
-					"\n", dirent, dire->name, size,
-					file->f_pos,
-					le32_to_cpu(dirh.start_block),
-					le16_to_cpu(dire->offset),
-					inode_number,
-					squashfs_filetype_table[type]);
-
-			if (filldir(dirent, dire->name, size, file->f_pos,
+			if (!dir_emit(ctx, dire->name, size,
 					inode_number,
-					squashfs_filetype_table[type]) < 0) {
-				TRACE("Filldir returned less than 0\n");
+					squashfs_filetype_table[type]))
 				goto finish;
-			}
 
-			file->f_pos = length;
+			ctx->pos = length;
 		}
 	}
 
@@ -238,6 +222,6 @@ failed_read:
 
 const struct file_operations squashfs_dir_ops = {
 	.read = generic_read_dir,
-	.readdir = squashfs_readdir,
+	.iterate = squashfs_readdir,
 	.llseek = default_llseek,
 };
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 260e3928d4f..60553a9053c 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -489,6 +489,7 @@ static struct file_system_type squashfs_fs_type = {
 	.kill_sb = kill_block_super,
 	.fs_flags = FS_REQUIRES_DEV
 };
+MODULE_ALIAS_FS("squashfs");
 
 static const struct super_operations squashfs_super_ops = {
 	.alloc_inode = squashfs_alloc_inode,
diff --git a/fs/stat.c b/fs/stat.c
index 14f45459c83..04ce1ac20d2 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -37,17 +37,17 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
 
 EXPORT_SYMBOL(generic_fillattr);
 
-int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int vfs_getattr(struct path *path, struct kstat *stat)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = path->dentry->d_inode;
 	int retval;
 
-	retval = security_inode_getattr(mnt, dentry);
+	retval = security_inode_getattr(path->mnt, path->dentry);
 	if (retval)
 		return retval;
 
 	if (inode->i_op->getattr)
-		return inode->i_op->getattr(mnt, dentry, stat);
+		return inode->i_op->getattr(path->mnt, path->dentry, stat);
 
 	generic_fillattr(inode, stat);
 	return 0;
@@ -61,8 +61,7 @@ int vfs_fstat(unsigned int fd, struct kstat *stat)
 	int error = -EBADF;
 
 	if (f.file) {
-		error = vfs_getattr(f.file->f_path.mnt, f.file->f_path.dentry,
-				    stat);
+		error = vfs_getattr(&f.file->f_path, stat);
 		fdput(f);
 	}
 	return error;
@@ -89,7 +88,7 @@ retry:
 	if (error)
 		goto out;
 
-	error = vfs_getattr(path.mnt, path.dentry, stat);
+	error = vfs_getattr(&path, stat);
 	path_put(&path);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
diff --git a/fs/super.c b/fs/super.c
index 12f12371216..7465d436420 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -447,14 +447,13 @@ struct super_block *sget(struct file_system_type *type,
 			void *data)
 {
 	struct super_block *s = NULL;
-	struct hlist_node *node;
 	struct super_block *old;
 	int err;
 
 retry:
 	spin_lock(&sb_lock);
 	if (test) {
-		hlist_for_each_entry(old, node, &type->fs_supers, s_instances) {
+		hlist_for_each_entry(old, &type->fs_supers, s_instances) {
 			if (!test(old, data))
 				continue;
 			if (!grab_super(old))
@@ -554,10 +553,9 @@ void iterate_supers_type(struct file_system_type *type,
 	void (*f)(struct super_block *, void *), void *arg)
 {
 	struct super_block *sb, *p = NULL;
-	struct hlist_node *node;
 
 	spin_lock(&sb_lock);
-	hlist_for_each_entry(sb, node, &type->fs_supers, s_instances) {
+	hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 
@@ -842,7 +840,7 @@ int get_anon_bdev(dev_t *p)
 	else if (error)
 		return -EAGAIN;
 
-	if ((dev & MAX_IDR_MASK) == (1 << MINORBITS)) {
+	if (dev == (1 << MINORBITS)) {
 		spin_lock(&unnamed_dev_lock);
 		ida_remove(&unnamed_dev_ida, dev);
 		if (unnamed_dev_start > dev)
diff --git a/fs/sync.c b/fs/sync.c
index 14eefeb4463..905f3f6b3d8 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -283,8 +283,8 @@ EXPORT_SYMBOL(generic_write_sync);
  * already-instantiated disk blocks, there are no guarantees here that the data
  * will be available after a crash.
  */
-SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
-				unsigned int flags)
+SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
+				unsigned int, flags)
 {
 	int ret;
 	struct fd f;
@@ -332,7 +332,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
 	if (!f.file)
 		goto out;
 
-	i_mode = f.file->f_path.dentry->d_inode->i_mode;
+	i_mode = file_inode(f.file)->i_mode;
 	ret = -ESPIPE;
 	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
 			!S_ISLNK(i_mode))
@@ -365,29 +365,11 @@ out_put:
 out:
 	return ret;
 }
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_sync_file_range(long fd, loff_t offset, loff_t nbytes,
-				    long flags)
-{
-	return SYSC_sync_file_range((int) fd, offset, nbytes,
-				    (unsigned int) flags);
-}
-SYSCALL_ALIAS(sys_sync_file_range, SyS_sync_file_range);
-#endif
 
 /* It would be nice if people remember that not all the world's an i386
    when they introduce new system calls */
-SYSCALL_DEFINE(sync_file_range2)(int fd, unsigned int flags,
-				 loff_t offset, loff_t nbytes)
+SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
+				 loff_t, offset, loff_t, nbytes)
 {
 	return sys_sync_file_range(fd, offset, nbytes, flags);
 }
-#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
-asmlinkage long SyS_sync_file_range2(long fd, long flags,
-				     loff_t offset, loff_t nbytes)
-{
-	return SYSC_sync_file_range2((int) fd, (unsigned int) flags,
-				     offset, nbytes);
-}
-SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2);
-#endif
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 614b2b54488..15c68f9489a 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -70,7 +70,7 @@ static ssize_t
 read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 {
 	struct bin_buffer *bb = file->private_data;
-	int size = file->f_path.dentry->d_inode->i_size;
+	int size = file_inode(file)->i_size;
 	loff_t offs = *off;
 	int count = min_t(size_t, bytes, PAGE_SIZE);
 	char *temp;
@@ -140,7 +140,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 		     size_t bytes, loff_t *off)
 {
 	struct bin_buffer *bb = file->private_data;
-	int size = file->f_path.dentry->d_inode->i_size;
+	int size = file_inode(file)->i_size;
 	loff_t offs = *off;
 	int count = min_t(size_t, bytes, PAGE_SIZE);
 	char *temp;
@@ -461,15 +461,14 @@ const struct file_operations bin_fops = {
 void unmap_bin_file(struct sysfs_dirent *attr_sd)
 {
 	struct bin_buffer *bb;
-	struct hlist_node *tmp;
 
 	if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR)
 		return;
 
 	mutex_lock(&sysfs_bin_lock);
 
-	hlist_for_each_entry(bb, tmp, &attr_sd->s_bin_attr.buffers, list) {
-		struct inode *inode = bb->file->f_path.dentry->d_inode;
+	hlist_for_each_entry(bb, &attr_sd->s_bin_attr.buffers, list) {
+		struct inode *inode = file_inode(bb->file);
 
 		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
 	}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 2fbdff6be25..e068e744dbd 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -74,7 +74,7 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left,
 }
 
 /**
- *	sysfs_link_subling - link sysfs_dirent into sibling rbtree
+ *	sysfs_link_sibling - link sysfs_dirent into sibling rbtree
  *	@sd: sysfs_dirent of interest
  *
  *	Link @sd into its sibling rbtree which starts from
@@ -165,21 +165,8 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
 	if (unlikely(!sd))
 		return NULL;
 
-	while (1) {
-		int v, t;
-
-		v = atomic_read(&sd->s_active);
-		if (unlikely(v < 0))
-			return NULL;
-
-		t = atomic_cmpxchg(&sd->s_active, v, v + 1);
-		if (likely(t == v))
-			break;
-		if (t < 0)
-			return NULL;
-
-		cpu_relax();
-	}
+	if (!atomic_inc_unless_negative(&sd->s_active))
+		return NULL;
 
 	if (likely(!ignore_lockdep(sd)))
 		rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
@@ -281,6 +268,10 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
 	 */
 	parent_sd = sd->s_parent;
 
+	WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED),
+		"sysfs: free using entry: %s/%s\n",
+		parent_sd ? parent_sd->s_name : "", sd->s_name);
+
 	if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
 		sysfs_put(sd->s_symlink.target_sd);
 	if (sysfs_type(sd) & SYSFS_COPY_NAME)
@@ -399,7 +390,7 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
 
 	sd->s_name = name;
 	sd->s_mode = mode;
-	sd->s_flags = type;
+	sd->s_flags = type | SYSFS_FLAG_REMOVED;
 
 	return sd;
 
@@ -479,6 +470,9 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
 	}
 
+	/* Mark the entry added into directory tree */
+	sd->s_flags &= ~SYSFS_FLAG_REMOVED;
+
 	return 0;
 }
 
@@ -1004,64 +998,56 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
 	return pos;
 }
 
-static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int sysfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
+	struct dentry *dentry = file->f_path.dentry;
 	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
-	struct sysfs_dirent *pos = filp->private_data;
+	struct sysfs_dirent *pos = file->private_data;
 	enum kobj_ns_type type;
 	const void *ns;
-	ino_t ino;
 
 	type = sysfs_ns_type(parent_sd);
 	ns = sysfs_info(dentry->d_sb)->ns[type];
 
-	if (filp->f_pos == 0) {
-		ino = parent_sd->s_ino;
-		if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
-			filp->f_pos++;
-	}
-	if (filp->f_pos == 1) {
-		if (parent_sd->s_parent)
-			ino = parent_sd->s_parent->s_ino;
-		else
-			ino = parent_sd->s_ino;
-		if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
-			filp->f_pos++;
-	}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
 	mutex_lock(&sysfs_mutex);
-	for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
+	for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
 	     pos;
-	     pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
-		const char * name;
-		unsigned int type;
-		int len, ret;
-
-		name = pos->s_name;
-		len = strlen(name);
-		ino = pos->s_ino;
-		type = dt_type(pos);
-		filp->f_pos = pos->s_hash;
-		filp->private_data = sysfs_get(pos);
+	     pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
+		const char *name = pos->s_name;
+		unsigned int type = dt_type(pos);
+		int len = strlen(name);
+		ino_t ino = pos->s_ino;
+		ctx->pos = pos->s_hash;
+		file->private_data = sysfs_get(pos);
 
 		mutex_unlock(&sysfs_mutex);
-		ret = filldir(dirent, name, len, filp->f_pos, ino, type);
+		if (!dir_emit(ctx, name, len, ino, type))
+			return 0;
 		mutex_lock(&sysfs_mutex);
-		if (ret < 0)
-			break;
 	}
 	mutex_unlock(&sysfs_mutex);
-	if ((filp->f_pos > 1) && !pos) { /* EOF */
-		filp->f_pos = INT_MAX;
-		filp->private_data = NULL;
-	}
+	file->private_data = NULL;
+	ctx->pos = INT_MAX;
 	return 0;
 }
 
+static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file_inode(file);
+	loff_t ret;
+
+	mutex_lock(&inode->i_mutex);
+	ret = generic_file_llseek(file, offset, whence);
+	mutex_unlock(&inode->i_mutex);
+
+	return ret;
+}
 
 const struct file_operations sysfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= sysfs_readdir,
+	.iterate	= sysfs_readdir,
 	.release	= sysfs_dir_release,
-	.llseek		= generic_file_llseek,
+	.llseek		= sysfs_dir_llseek,
 };
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 602f56db044..d2bb7ed8fa7 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -449,10 +449,12 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd)
 
 	spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
 
-	od = sd->s_attr.open;
-	if (od) {
-		atomic_inc(&od->event);
-		wake_up_interruptible(&od->poll);
+	if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
+		od = sd->s_attr.open;
+		if (od) {
+			atomic_inc(&od->event);
+			wake_up_interruptible(&od->poll);
+		}
 	}
 
 	spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 2df555c66d5..aec3d5c98c9 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -205,6 +205,48 @@ void sysfs_unmerge_group(struct kobject *kobj,
 }
 EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
 
+/**
+ * sysfs_add_link_to_group - add a symlink to an attribute group.
+ * @kobj:	The kobject containing the group.
+ * @group_name:	The name of the group.
+ * @target:	The target kobject of the symlink to create.
+ * @link_name:	The name of the symlink to create.
+ */
+int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
+			    struct kobject *target, const char *link_name)
+{
+	struct sysfs_dirent *dir_sd;
+	int error = 0;
+
+	dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name);
+	if (!dir_sd)
+		return -ENOENT;
+
+	error = sysfs_create_link_sd(dir_sd, target, link_name);
+	sysfs_put(dir_sd);
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_add_link_to_group);
+
+/**
+ * sysfs_remove_link_from_group - remove a symlink from an attribute group.
+ * @kobj:	The kobject containing the group.
+ * @group_name:	The name of the group.
+ * @link_name:	The name of the symlink to remove.
+ */
+void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
+				  const char *link_name)
+{
+	struct sysfs_dirent *dir_sd;
+
+	dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name);
+	if (dir_sd) {
+		sysfs_hash_and_remove(dir_sd, NULL, link_name);
+		sysfs_put(dir_sd);
+	}
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
 
 EXPORT_SYMBOL_GPL(sysfs_create_group);
 EXPORT_SYMBOL_GPL(sysfs_update_group);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 0ce3ccf7f40..3e2837a633e 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -24,8 +24,6 @@
 #include <linux/security.h>
 #include "sysfs.h"
 
-extern struct super_block * sysfs_sb;
-
 static const struct address_space_operations sysfs_aops = {
 	.readpage	= simple_readpage,
 	.write_begin	= simple_write_begin,
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index db940a9be04..afd83273e6c 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -10,7 +10,7 @@
  * Please see Documentation/filesystems/sysfs.txt for more information.
  */
 
-#define DEBUG 
+#define DEBUG
 
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/user_namespace.h>
 
 #include "sysfs.h"
 
@@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	struct super_block *sb;
 	int error;
 
+	if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs)
+		return ERR_PTR(-EPERM);
+
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
 		return ERR_PTR(-ENOMEM);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 3c9eb5624f5..8c940df97a5 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -21,26 +21,17 @@
 
 #include "sysfs.h"
 
-static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
-				const char *name, int warn)
+static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
+				   struct kobject *target,
+				   const char *name, int warn)
 {
-	struct sysfs_dirent *parent_sd = NULL;
 	struct sysfs_dirent *target_sd = NULL;
 	struct sysfs_dirent *sd = NULL;
 	struct sysfs_addrm_cxt acxt;
 	enum kobj_ns_type ns_type;
 	int error;
 
-	BUG_ON(!name);
-
-	if (!kobj)
-		parent_sd = &sysfs_root;
-	else
-		parent_sd = kobj->sd;
-
-	error = -EFAULT;
-	if (!parent_sd)
-		goto out_put;
+	BUG_ON(!name || !parent_sd);
 
 	/* target->sd can go away beneath us but is protected with
 	 * sysfs_assoc_lock.  Fetch target_sd from it.
@@ -96,6 +87,34 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
 }
 
 /**
+ *	sysfs_create_link_sd - create symlink to a given object.
+ *	@sd:		directory we're creating the link in.
+ *	@target:	object we're pointing to.
+ *	@name:		name of the symlink.
+ */
+int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target,
+			 const char *name)
+{
+	return sysfs_do_create_link_sd(sd, target, name, 1);
+}
+
+static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
+				const char *name, int warn)
+{
+	struct sysfs_dirent *parent_sd = NULL;
+
+	if (!kobj)
+		parent_sd = &sysfs_root;
+	else
+		parent_sd = kobj->sd;
+
+	if (!parent_sd)
+		return -EFAULT;
+
+	return sysfs_do_create_link_sd(parent_sd, target, name, warn);
+}
+
+/**
  *	sysfs_create_link - create symlink between two objects.
  *	@kobj:	object whose directory we're creating the link in.
  *	@target:	object we're pointing to.
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index d73c0932bbd..d1e4043eb0c 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -240,3 +240,5 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd);
  * symlink.c
  */
 extern const struct inode_operations sysfs_symlink_inode_operations;
+int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target,
+			 const char *name);
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index a77c4215762..d42291d0821 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -18,12 +18,12 @@
 #include <linux/swap.h>
 #include "sysv.h"
 
-static int sysv_readdir(struct file *, void *, filldir_t);
+static int sysv_readdir(struct file *, struct dir_context *);
 
 const struct file_operations sysv_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= sysv_readdir,
+	.iterate	= sysv_readdir,
 	.fsync		= generic_file_fsync,
 };
 
@@ -65,18 +65,21 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
 	return page;
 }
 
-static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int sysv_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned long pos = filp->f_pos;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	unsigned long pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	unsigned offset = pos & ~PAGE_CACHE_MASK;
-	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
+	unsigned offset;
+	unsigned long n;
 
-	pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
+	ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
 	if (pos >= inode->i_size)
-		goto done;
+		return 0;
+
+	offset = pos & ~PAGE_CACHE_MASK;
+	n = pos >> PAGE_CACHE_SHIFT;
 
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
@@ -88,29 +91,21 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
 		kaddr = (char *)page_address(page);
 		de = (struct sysv_dir_entry *)(kaddr+offset);
 		limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE;
-		for ( ;(char*)de <= limit; de++) {
+		for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) {
 			char *name = de->name;
-			int over;
 
 			if (!de->inode)
 				continue;
 
-			offset = (char *)de - kaddr;
-
-			over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN),
-					((loff_t)n<<PAGE_CACHE_SHIFT) | offset,
+			if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN),
 					fs16_to_cpu(SYSV_SB(sb), de->inode),
-					DT_UNKNOWN);
-			if (over) {
+					DT_UNKNOWN)) {
 				dir_put_page(page);
-				goto done;
+				return 0;
 			}
 		}
 		dir_put_page(page);
 	}
-
-done:
-	filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset;
 	return 0;
 }
 
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 1c0d5f26476..731b2bbcaab 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -27,8 +27,7 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
 	return err;
 }
 
-static int sysv_hash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+static int sysv_hash(const struct dentry *dentry, struct qstr *qstr)
 {
 	/* Truncate the name in place, avoids having to define a compare
 	   function. */
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index a38e87bdd78..d0c6a007ce8 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -545,6 +545,7 @@ static struct file_system_type sysv_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("sysv");
 
 static struct file_system_type v7_fs_type = {
 	.owner		= THIS_MODULE,
@@ -553,6 +554,8 @@ static struct file_system_type v7_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("v7");
+MODULE_ALIAS("v7");
 
 static int __init init_sysv_fs(void)
 {
@@ -586,5 +589,4 @@ static void __exit exit_sysv_fs(void)
 
 module_init(init_sysv_fs)
 module_exit(exit_sysv_fs)
-MODULE_ALIAS("v7");
 MODULE_LICENSE("GPL");
diff --git a/fs/timerfd.c b/fs/timerfd.c
index d03822bbf19..929312180dd 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -8,6 +8,7 @@
  *
  */
 
+#include <linux/alarmtimer.h>
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/init.h>
@@ -22,10 +23,14 @@
 #include <linux/anon_inodes.h>
 #include <linux/timerfd.h>
 #include <linux/syscalls.h>
+#include <linux/compat.h>
 #include <linux/rcupdate.h>
 
 struct timerfd_ctx {
-	struct hrtimer tmr;
+	union {
+		struct hrtimer tmr;
+		struct alarm alarm;
+	} t;
 	ktime_t tintv;
 	ktime_t moffs;
 	wait_queue_head_t wqh;
@@ -40,14 +45,19 @@ struct timerfd_ctx {
 static LIST_HEAD(cancel_list);
 static DEFINE_SPINLOCK(cancel_lock);
 
+static inline bool isalarm(struct timerfd_ctx *ctx)
+{
+	return ctx->clockid == CLOCK_REALTIME_ALARM ||
+		ctx->clockid == CLOCK_BOOTTIME_ALARM;
+}
+
 /*
  * This gets called when the timer event triggers. We set the "expired"
  * flag, but we do not re-arm the timer (in case it's necessary,
  * tintv.tv64 != 0) until the timer is accessed.
  */
-static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
+static void timerfd_triggered(struct timerfd_ctx *ctx)
 {
-	struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr);
 	unsigned long flags;
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
@@ -55,10 +65,25 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
 	ctx->ticks++;
 	wake_up_locked(&ctx->wqh);
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+}
 
+static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
+{
+	struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx,
+					       t.tmr);
+	timerfd_triggered(ctx);
 	return HRTIMER_NORESTART;
 }
 
+static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm,
+	ktime_t now)
+{
+	struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx,
+					       t.alarm);
+	timerfd_triggered(ctx);
+	return ALARMTIMER_NORESTART;
+}
+
 /*
  * Called when the clock was set to cancel the timers in the cancel
  * list. This will wake up processes waiting on these timers. The
@@ -106,8 +131,9 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx)
 
 static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
 {
-	if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) &&
-	    (flags & TFD_TIMER_CANCEL_ON_SET)) {
+	if ((ctx->clockid == CLOCK_REALTIME ||
+	     ctx->clockid == CLOCK_REALTIME_ALARM) &&
+	    (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) {
 		if (!ctx->might_cancel) {
 			ctx->might_cancel = true;
 			spin_lock(&cancel_lock);
@@ -123,7 +149,11 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
 {
 	ktime_t remaining;
 
-	remaining = hrtimer_expires_remaining(&ctx->tmr);
+	if (isalarm(ctx))
+		remaining = alarm_expires_remaining(&ctx->t.alarm);
+	else
+		remaining = hrtimer_expires_remaining(&ctx->t.tmr);
+
 	return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
 }
 
@@ -141,11 +171,28 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
 	ctx->expired = 0;
 	ctx->ticks = 0;
 	ctx->tintv = timespec_to_ktime(ktmr->it_interval);
-	hrtimer_init(&ctx->tmr, clockid, htmode);
-	hrtimer_set_expires(&ctx->tmr, texp);
-	ctx->tmr.function = timerfd_tmrproc;
+
+	if (isalarm(ctx)) {
+		alarm_init(&ctx->t.alarm,
+			   ctx->clockid == CLOCK_REALTIME_ALARM ?
+			   ALARM_REALTIME : ALARM_BOOTTIME,
+			   timerfd_alarmproc);
+	} else {
+		hrtimer_init(&ctx->t.tmr, clockid, htmode);
+		hrtimer_set_expires(&ctx->t.tmr, texp);
+		ctx->t.tmr.function = timerfd_tmrproc;
+	}
+
 	if (texp.tv64 != 0) {
-		hrtimer_start(&ctx->tmr, texp, htmode);
+		if (isalarm(ctx)) {
+			if (flags & TFD_TIMER_ABSTIME)
+				alarm_start(&ctx->t.alarm, texp);
+			else
+				alarm_start_relative(&ctx->t.alarm, texp);
+		} else {
+			hrtimer_start(&ctx->t.tmr, texp, htmode);
+		}
+
 		if (timerfd_canceled(ctx))
 			return -ECANCELED;
 	}
@@ -157,7 +204,11 @@ static int timerfd_release(struct inode *inode, struct file *file)
 	struct timerfd_ctx *ctx = file->private_data;
 
 	timerfd_remove_cancel(ctx);
-	hrtimer_cancel(&ctx->tmr);
+
+	if (isalarm(ctx))
+		alarm_cancel(&ctx->t.alarm);
+	else
+		hrtimer_cancel(&ctx->t.tmr);
 	kfree_rcu(ctx, rcu);
 	return 0;
 }
@@ -214,9 +265,15 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
 			 * callback to avoid DoS attacks specifying a very
 			 * short timer period.
 			 */
-			ticks += hrtimer_forward_now(&ctx->tmr,
-						     ctx->tintv) - 1;
-			hrtimer_restart(&ctx->tmr);
+			if (isalarm(ctx)) {
+				ticks += alarm_forward_now(
+					&ctx->t.alarm, ctx->tintv) - 1;
+				alarm_restart(&ctx->t.alarm);
+			} else {
+				ticks += hrtimer_forward_now(&ctx->t.tmr,
+							     ctx->tintv) - 1;
+				hrtimer_restart(&ctx->t.tmr);
+			}
 		}
 		ctx->expired = 0;
 		ctx->ticks = 0;
@@ -258,7 +315,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 
 	if ((flags & ~TFD_CREATE_FLAGS) ||
 	    (clockid != CLOCK_MONOTONIC &&
-	     clockid != CLOCK_REALTIME))
+	     clockid != CLOCK_REALTIME &&
+	     clockid != CLOCK_REALTIME_ALARM &&
+	     clockid != CLOCK_BOOTTIME_ALARM))
 		return -EINVAL;
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -267,7 +326,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 
 	init_waitqueue_head(&ctx->wqh);
 	ctx->clockid = clockid;
-	hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
+
+	if (isalarm(ctx))
+		alarm_init(&ctx->t.alarm,
+			   ctx->clockid == CLOCK_REALTIME_ALARM ?
+			   ALARM_REALTIME : ALARM_BOOTTIME,
+			   timerfd_alarmproc);
+	else
+		hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
+
 	ctx->moffs = ktime_get_monotonic_offset();
 
 	ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
@@ -278,21 +345,17 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 	return ufd;
 }
 
-SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
-		const struct itimerspec __user *, utmr,
-		struct itimerspec __user *, otmr)
+static int do_timerfd_settime(int ufd, int flags, 
+		const struct itimerspec *new,
+		struct itimerspec *old)
 {
 	struct fd f;
 	struct timerfd_ctx *ctx;
-	struct itimerspec ktmr, kotmr;
 	int ret;
 
-	if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
-		return -EFAULT;
-
 	if ((flags & ~TFD_SETTIME_FLAGS) ||
-	    !timespec_valid(&ktmr.it_value) ||
-	    !timespec_valid(&ktmr.it_interval))
+	    !timespec_valid(&new->it_value) ||
+	    !timespec_valid(&new->it_interval))
 		return -EINVAL;
 
 	ret = timerfd_fget(ufd, &f);
@@ -308,8 +371,14 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	 */
 	for (;;) {
 		spin_lock_irq(&ctx->wqh.lock);
-		if (hrtimer_try_to_cancel(&ctx->tmr) >= 0)
-			break;
+
+		if (isalarm(ctx)) {
+			if (alarm_try_to_cancel(&ctx->t.alarm) >= 0)
+				break;
+		} else {
+			if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0)
+				break;
+		}
 		spin_unlock_irq(&ctx->wqh.lock);
 		cpu_relax();
 	}
@@ -320,30 +389,30 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	 * We do not update "ticks" and "expired" since the timer will be
 	 * re-programmed again in the following timerfd_setup() call.
 	 */
-	if (ctx->expired && ctx->tintv.tv64)
-		hrtimer_forward_now(&ctx->tmr, ctx->tintv);
+	if (ctx->expired && ctx->tintv.tv64) {
+		if (isalarm(ctx))
+			alarm_forward_now(&ctx->t.alarm, ctx->tintv);
+		else
+			hrtimer_forward_now(&ctx->t.tmr, ctx->tintv);
+	}
 
-	kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
-	kotmr.it_interval = ktime_to_timespec(ctx->tintv);
+	old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
+	old->it_interval = ktime_to_timespec(ctx->tintv);
 
 	/*
 	 * Re-program the timer to the new value ...
 	 */
-	ret = timerfd_setup(ctx, flags, &ktmr);
+	ret = timerfd_setup(ctx, flags, new);
 
 	spin_unlock_irq(&ctx->wqh.lock);
 	fdput(f);
-	if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr)))
-		return -EFAULT;
-
 	return ret;
 }
 
-SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
+static int do_timerfd_gettime(int ufd, struct itimerspec *t)
 {
 	struct fd f;
 	struct timerfd_ctx *ctx;
-	struct itimerspec kotmr;
 	int ret = timerfd_fget(ufd, &f);
 	if (ret)
 		return ret;
@@ -352,15 +421,78 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
 	spin_lock_irq(&ctx->wqh.lock);
 	if (ctx->expired && ctx->tintv.tv64) {
 		ctx->expired = 0;
-		ctx->ticks +=
-			hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1;
-		hrtimer_restart(&ctx->tmr);
+
+		if (isalarm(ctx)) {
+			ctx->ticks +=
+				alarm_forward_now(
+					&ctx->t.alarm, ctx->tintv) - 1;
+			alarm_restart(&ctx->t.alarm);
+		} else {
+			ctx->ticks +=
+				hrtimer_forward_now(&ctx->t.tmr, ctx->tintv)
+				- 1;
+			hrtimer_restart(&ctx->t.tmr);
+		}
 	}
-	kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
-	kotmr.it_interval = ktime_to_timespec(ctx->tintv);
+	t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
+	t->it_interval = ktime_to_timespec(ctx->tintv);
 	spin_unlock_irq(&ctx->wqh.lock);
 	fdput(f);
+	return 0;
+}
+
+SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
+		const struct itimerspec __user *, utmr,
+		struct itimerspec __user *, otmr)
+{
+	struct itimerspec new, old;
+	int ret;
+
+	if (copy_from_user(&new, utmr, sizeof(new)))
+		return -EFAULT;
+	ret = do_timerfd_settime(ufd, flags, &new, &old);
+	if (ret)
+		return ret;
+	if (otmr && copy_to_user(otmr, &old, sizeof(old)))
+		return -EFAULT;
 
+	return ret;
+}
+
+SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
+{
+	struct itimerspec kotmr;
+	int ret = do_timerfd_gettime(ufd, &kotmr);
+	if (ret)
+		return ret;
 	return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0;
 }
 
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
+		const struct compat_itimerspec __user *, utmr,
+		struct compat_itimerspec __user *, otmr)
+{
+	struct itimerspec new, old;
+	int ret;
+
+	if (get_compat_itimerspec(&new, utmr))
+		return -EFAULT;
+	ret = do_timerfd_settime(ufd, flags, &new, &old);
+	if (ret)
+		return ret;
+	if (otmr && put_compat_itimerspec(otmr, &old))
+		return -EFAULT;
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE2(timerfd_gettime, int, ufd,
+		struct compat_itimerspec __user *, otmr)
+{
+	struct itimerspec kotmr;
+	int ret = do_timerfd_gettime(ufd, &kotmr);
+	if (ret)
+		return ret;
+	return put_compat_itimerspec(otmr, &kotmr) ? -EFAULT: 0;
+}
+#endif
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 12817ffc734..7f60e900edf 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2459,7 +2459,7 @@ error_dump:
 
 static inline int chance(unsigned int n, unsigned int out_of)
 {
-	return !!((random32() % out_of) + 1 <= n);
+	return !!((prandom_u32() % out_of) + 1 <= n);
 
 }
 
@@ -2477,13 +2477,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
 			if (chance(1, 2)) {
 				d->pc_delay = 1;
 				/* Fail withing 1 minute */
-				delay = random32() % 60000;
+				delay = prandom_u32() % 60000;
 				d->pc_timeout = jiffies;
 				d->pc_timeout += msecs_to_jiffies(delay);
 				ubifs_warn("failing after %lums", delay);
 			} else {
 				d->pc_delay = 2;
-				delay = random32() % 10000;
+				delay = prandom_u32() % 10000;
 				/* Fail within 10000 operations */
 				d->pc_cnt_max = delay;
 				ubifs_warn("failing after %lu calls", delay);
@@ -2563,7 +2563,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,
 	unsigned int from, to, ffs = chance(1, 2);
 	unsigned char *p = (void *)buf;
 
-	from = random32() % (len + 1);
+	from = prandom_u32() % (len + 1);
 	/* Corruption may only span one max. write unit */
 	to = min(len, ALIGN(from, c->max_write_size));
 
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 8a574776a49..6b4947f75af 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -346,38 +346,46 @@ static unsigned int vfs_dent_type(uint8_t type)
  * This means that UBIFS cannot support NFS which requires full
  * 'seekdir()'/'telldir()' support.
  */
-static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int ubifs_readdir(struct file *file, struct dir_context *ctx)
 {
-	int err, over = 0;
+	int err;
 	struct qstr nm;
 	union ubifs_key key;
 	struct ubifs_dent_node *dent;
-	struct inode *dir = file->f_path.dentry->d_inode;
+	struct inode *dir = file_inode(file);
 	struct ubifs_info *c = dir->i_sb->s_fs_info;
 
-	dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);
+	dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
 
-	if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2)
+	if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2)
 		/*
 		 * The directory was seek'ed to a senseless position or there
 		 * are no more entries.
 		 */
 		return 0;
 
-	/* File positions 0 and 1 correspond to "." and ".." */
-	if (file->f_pos == 0) {
-		ubifs_assert(!file->private_data);
-		over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
-		if (over)
-			return 0;
-		file->f_pos = 1;
+	if (file->f_version == 0) {
+		/*
+		 * The file was seek'ed, which means that @file->private_data
+		 * is now invalid. This may also be just the first
+		 * 'ubifs_readdir()' invocation, in which case
+		 * @file->private_data is NULL, and the below code is
+		 * basically a no-op.
+		 */
+		kfree(file->private_data);
+		file->private_data = NULL;
 	}
 
-	if (file->f_pos == 1) {
+	/*
+	 * 'generic_file_llseek()' unconditionally sets @file->f_version to
+	 * zero, and we use this for detecting whether the file was seek'ed.
+	 */
+	file->f_version = 1;
+
+	/* File positions 0 and 1 correspond to "." and ".." */
+	if (ctx->pos < 2) {
 		ubifs_assert(!file->private_data);
-		over = filldir(dirent, "..", 2, 1,
-			       parent_ino(file->f_path.dentry), DT_DIR);
-		if (over)
+		if (!dir_emit_dots(file, ctx))
 			return 0;
 
 		/* Find the first entry in TNC and save it */
@@ -389,7 +397,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
 			goto out;
 		}
 
-		file->f_pos = key_hash_flash(c, &dent->key);
+		ctx->pos = key_hash_flash(c, &dent->key);
 		file->private_data = dent;
 	}
 
@@ -397,17 +405,16 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
 	if (!dent) {
 		/*
 		 * The directory was seek'ed to and is now readdir'ed.
-		 * Find the entry corresponding to @file->f_pos or the
-		 * closest one.
+		 * Find the entry corresponding to @ctx->pos or the closest one.
 		 */
-		dent_key_init_hash(c, &key, dir->i_ino, file->f_pos);
+		dent_key_init_hash(c, &key, dir->i_ino, ctx->pos);
 		nm.name = NULL;
 		dent = ubifs_tnc_next_ent(c, &key, &nm);
 		if (IS_ERR(dent)) {
 			err = PTR_ERR(dent);
 			goto out;
 		}
-		file->f_pos = key_hash_flash(c, &dent->key);
+		ctx->pos = key_hash_flash(c, &dent->key);
 		file->private_data = dent;
 	}
 
@@ -419,10 +426,9 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
 			     ubifs_inode(dir)->creat_sqnum);
 
 		nm.len = le16_to_cpu(dent->nlen);
-		over = filldir(dirent, dent->name, nm.len, file->f_pos,
+		if (!dir_emit(ctx, dent->name, nm.len,
 			       le64_to_cpu(dent->inum),
-			       vfs_dent_type(dent->type));
-		if (over)
+			       vfs_dent_type(dent->type)))
 			return 0;
 
 		/* Switch to the next entry */
@@ -435,7 +441,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
 		}
 
 		kfree(file->private_data);
-		file->f_pos = key_hash_flash(c, &dent->key);
+		ctx->pos = key_hash_flash(c, &dent->key);
 		file->private_data = dent;
 		cond_resched();
 	}
@@ -448,18 +454,11 @@ out:
 
 	kfree(file->private_data);
 	file->private_data = NULL;
-	file->f_pos = 2;
+	/* 2 is a special value indicating that there are no more direntries */
+	ctx->pos = 2;
 	return 0;
 }
 
-/* If a directory is seeked, we have to free saved readdir() state */
-static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence)
-{
-	kfree(file->private_data);
-	file->private_data = NULL;
-	return generic_file_llseek(file, offset, whence);
-}
-
 /* Free saved readdir() state when the directory is closed */
 static int ubifs_dir_release(struct inode *dir, struct file *file)
 {
@@ -1177,10 +1176,10 @@ const struct inode_operations ubifs_dir_inode_operations = {
 };
 
 const struct file_operations ubifs_dir_operations = {
-	.llseek         = ubifs_dir_llseek,
+	.llseek         = generic_file_llseek,
 	.release        = ubifs_dir_release,
 	.read           = generic_read_dir,
-	.readdir        = ubifs_readdir,
+	.iterate        = ubifs_readdir,
 	.fsync          = ubifs_fsync,
 	.unlocked_ioctl = ubifs_ioctl,
 #ifdef CONFIG_COMPAT
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5bc77817f38..123c79b7261 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -50,6 +50,7 @@
  */
 
 #include "ubifs.h"
+#include <linux/aio.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/slab.h>
@@ -1276,13 +1277,14 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
 	return err;
 }
 
-static void ubifs_invalidatepage(struct page *page, unsigned long offset)
+static void ubifs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 
 	ubifs_assert(PagePrivate(page));
-	if (offset)
+	if (offset || length < PAGE_CACHE_SIZE)
 		/* Partial page remains dirty */
 		return;
 
@@ -1444,7 +1446,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
 				 struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(vma->vm_file);
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	struct timespec now = ubifs_current_time(inode);
 	struct ubifs_budget_req req = { .new_page = 1 };
@@ -1522,6 +1524,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
 			ubifs_release_dirty_inode_budget(c, ui);
 	}
 
+	wait_for_stable_page(page);
 	unlock_page(page);
 	return 0;
 
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 1a7e2d8bdbe..648b143606c 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -147,7 +147,7 @@ out_unlock:
 long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	int flags, err;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 
 	switch (cmd) {
 	case FS_IOC_GETFLAGS:
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 9daaeef675d..4b826abb152 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -2007,28 +2007,28 @@ static int dbg_populate_lsave(struct ubifs_info *c)
 
 	if (!dbg_is_chk_gen(c))
 		return 0;
-	if (random32() & 3)
+	if (prandom_u32() & 3)
 		return 0;
 
 	for (i = 0; i < c->lsave_cnt; i++)
 		c->lsave[i] = c->main_first;
 
 	list_for_each_entry(lprops, &c->empty_list, list)
-		c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+		c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
 	list_for_each_entry(lprops, &c->freeable_list, list)
-		c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+		c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
 	list_for_each_entry(lprops, &c->frdi_idx_list, list)
-		c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+		c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
 
 	heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
 	for (i = 0; i < heap->cnt; i++)
-		c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+		c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
 	heap = &c->lpt_heap[LPROPS_DIRTY - 1];
 	for (i = 0; i < heap->cnt; i++)
-		c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+		c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
 	heap = &c->lpt_heap[LPROPS_FREE - 1];
 	for (i = 0; i < heap->cnt; i++)
-		c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+		c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
 
 	return 1;
 }
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 769701ccb5c..ba32da3fe08 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -126,13 +126,14 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
 		else if (inum > o->inum)
 			p = p->rb_right;
 		else {
-			if (o->dnext) {
+			if (o->del) {
 				spin_unlock(&c->orphan_lock);
 				dbg_gen("deleted twice ino %lu",
 					(unsigned long)inum);
 				return;
 			}
-			if (o->cnext) {
+			if (o->cmt) {
+				o->del = 1;
 				o->dnext = c->orph_dnext;
 				c->orph_dnext = o;
 				spin_unlock(&c->orphan_lock);
@@ -172,7 +173,9 @@ int ubifs_orphan_start_commit(struct ubifs_info *c)
 	last = &c->orph_cnext;
 	list_for_each_entry(orphan, &c->orph_new, new_list) {
 		ubifs_assert(orphan->new);
+		ubifs_assert(!orphan->cmt);
 		orphan->new = 0;
+		orphan->cmt = 1;
 		*last = orphan;
 		last = &orphan->cnext;
 	}
@@ -299,7 +302,9 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
 	cnext = c->orph_cnext;
 	for (i = 0; i < cnt; i++) {
 		orphan = cnext;
+		ubifs_assert(orphan->cmt);
 		orph->inos[i] = cpu_to_le64(orphan->inum);
+		orphan->cmt = 0;
 		cnext = orphan->cnext;
 		orphan->cnext = NULL;
 	}
@@ -378,6 +383,7 @@ static int consolidate(struct ubifs_info *c)
 		list_for_each_entry(orphan, &c->orph_list, list) {
 			if (orphan->new)
 				continue;
+			orphan->cmt = 1;
 			*last = orphan;
 			last = &orphan->cnext;
 			cnt += 1;
@@ -442,6 +448,7 @@ static void erase_deleted(struct ubifs_info *c)
 		orphan = dnext;
 		dnext = orphan->dnext;
 		ubifs_assert(!orphan->new);
+		ubifs_assert(orphan->del);
 		rb_erase(&orphan->rb, &c->orph_tree);
 		list_del(&orphan->list);
 		c->tot_orphans -= 1;
@@ -531,6 +538,7 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
 	rb_link_node(&orphan->rb, parent, p);
 	rb_insert_color(&orphan->rb, &c->orph_tree);
 	list_add_tail(&orphan->list, &c->orph_list);
+	orphan->del = 1;
 	orphan->dnext = c->orph_dnext;
 	c->orph_dnext = orphan;
 	dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum,
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index ddc0f6ae65e..879b9976c12 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1412,7 +1412,7 @@ static int mount_ubifs(struct ubifs_info *c)
 
 	ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s",
 		  c->vi.ubi_num, c->vi.vol_id, c->vi.name,
-		  c->ro_mount ? ", R/O mode" : NULL);
+		  c->ro_mount ? ", R/O mode" : "");
 	x = (long long)c->main_lebs * c->leb_size;
 	y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
 	ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes",
@@ -1568,6 +1568,12 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	c->remounting_rw = 1;
 	c->ro_mount = 0;
 
+	if (c->space_fixup) {
+		err = ubifs_fixup_free_space(c);
+		if (err)
+			return err;
+	}
+
 	err = check_free_space(c);
 	if (err)
 		goto out;
@@ -1684,12 +1690,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 		err = dbg_check_space_info(c);
 	}
 
-	if (c->space_fixup) {
-		err = ubifs_fixup_free_space(c);
-		if (err)
-			goto out;
-	}
-
 	mutex_unlock(&c->umount_mutex);
 	return err;
 
@@ -2174,6 +2174,7 @@ static struct file_system_type ubifs_fs_type = {
 	.mount   = ubifs_mount,
 	.kill_sb = kill_ubifs_super,
 };
+MODULE_ALIAS_FS("ubifs");
 
 /*
  * Inode slab cache constructor.
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 523bbad69c0..52a6559275c 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -683,7 +683,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt)
 		c->ilebs[c->ileb_cnt++] = lnum;
 		dbg_cmt("LEB %d", lnum);
 	}
-	if (dbg_is_chk_index(c) && !(random32() & 7))
+	if (dbg_is_chk_index(c) && !(prandom_u32() & 7))
 		return -ENOSPC;
 	return 0;
 }
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index d133c276fe0..b2babce4d70 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -904,6 +904,8 @@ struct ubifs_budget_req {
  * @dnext: next orphan to delete
  * @inum: inode number
  * @new: %1 => added since the last commit, otherwise %0
+ * @cmt: %1 => commit pending, otherwise %0
+ * @del: %1 => delete pending, otherwise %0
  */
 struct ubifs_orphan {
 	struct rb_node rb;
@@ -912,7 +914,9 @@ struct ubifs_orphan {
 	struct ubifs_orphan *cnext;
 	struct ubifs_orphan *dnext;
 	ino_t inum;
-	int new;
+	unsigned new:1;
+	unsigned cmt:1;
+	unsigned del:1;
 };
 
 /**
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index eb8bfe2b89a..a012c51caff 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -35,14 +35,16 @@
 #include "udf_i.h"
 #include "udf_sb.h"
 
-static int do_udf_readdir(struct inode *dir, struct file *filp,
-			  filldir_t filldir, void *dirent)
+
+static int udf_readdir(struct file *file, struct dir_context *ctx)
 {
+	struct inode *dir = file_inode(file);
+	struct udf_inode_info *iinfo = UDF_I(dir);
 	struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL};
 	struct fileIdentDesc *fi = NULL;
 	struct fileIdentDesc cfi;
 	int block, iblock;
-	loff_t nf_pos = (filp->f_pos - 1) << 2;
+	loff_t nf_pos;
 	int flen;
 	unsigned char *fname = NULL;
 	unsigned char *nameptr;
@@ -54,10 +56,14 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 	uint32_t elen;
 	sector_t offset;
 	int i, num, ret = 0;
-	unsigned int dt_type;
 	struct extent_position epos = { NULL, 0, {0, 0} };
-	struct udf_inode_info *iinfo;
 
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(file, ctx))
+			return 0;
+		ctx->pos = 1;
+	}
+	nf_pos = (ctx->pos - 1) << 2;
 	if (nf_pos >= size)
 		goto out;
 
@@ -71,7 +77,6 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 		nf_pos = udf_ext0_offset(dir);
 
 	fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1);
-	iinfo = UDF_I(dir);
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
 		if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits,
 		    &epos, &eloc, &elen, &offset)
@@ -116,7 +121,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 	}
 
 	while (nf_pos < size) {
-		filp->f_pos = (nf_pos >> 2) + 1;
+		struct kernel_lb_addr tloc;
+
+		ctx->pos = (nf_pos >> 2) + 1;
 
 		fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc,
 					&elen, &offset);
@@ -155,24 +162,22 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 		}
 
 		if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) {
-			iblock = parent_ino(filp->f_path.dentry);
-			flen = 2;
-			memcpy(fname, "..", flen);
-			dt_type = DT_DIR;
-		} else {
-			struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
-
-			iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
-			flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
-			dt_type = DT_UNKNOWN;
+			if (!dir_emit_dotdot(file, ctx))
+				goto out;
+			continue;
 		}
 
-		if (flen && filldir(dirent, fname, flen, filp->f_pos,
-				    iblock, dt_type) < 0)
+		flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
+		if (!flen)
+			continue;
+
+		tloc = lelb_to_cpu(cfi.icb.extLocation);
+		iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
+		if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN))
 			goto out;
 	} /* end while */
 
-	filp->f_pos = (nf_pos >> 2) + 1;
+	ctx->pos = (nf_pos >> 2) + 1;
 
 out:
 	if (fibh.sbh != fibh.ebh)
@@ -184,27 +189,11 @@ out:
 	return ret;
 }
 
-static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
-	struct inode *dir = filp->f_path.dentry->d_inode;
-	int result;
-
-	if (filp->f_pos == 0) {
-		if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
-			return 0;
-		}
-		filp->f_pos++;
-	}
-
-	result = do_udf_readdir(dir, filp, filldir, dirent);
- 	return result;
-}
-
 /* readdir and lookup functions */
 const struct file_operations udf_dir_operations = {
 	.llseek			= generic_file_llseek,
 	.read			= generic_read_dir,
-	.readdir		= udf_readdir,
+	.iterate		= udf_readdir,
 	.unlocked_ioctl		= udf_ioctl,
 	.fsync			= generic_file_fsync,
 };
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 77b5953eaac..29569dd0816 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -139,7 +139,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	ssize_t retval;
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	int err, pos;
 	size_t count = iocb->ki_left;
 	struct udf_inode_info *iinfo = UDF_I(inode);
@@ -178,7 +178,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct inode *inode = file_inode(filp);
 	long old_block, new_block;
 	int result = -EINVAL;
 
@@ -204,7 +204,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		goto out;
 	case UDF_RELOCATE_BLOCKS:
 		if (!capable(CAP_SYS_ADMIN)) {
-			result = -EACCES;
+			result = -EPERM;
 			goto out;
 		}
 		if (get_user(old_block, (long __user *)arg)) {
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index cbae1ed0b7c..b6d15d34981 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -38,6 +38,7 @@
 #include <linux/slab.h>
 #include <linux/crc-itu-t.h>
 #include <linux/mpage.h>
+#include <linux/aio.h>
 
 #include "udf_i.h"
 #include "udf_sb.h"
@@ -67,6 +68,74 @@ static void udf_update_extents(struct inode *,
 			       struct extent_position *);
 static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
+static void __udf_clear_extent_cache(struct inode *inode)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	if (iinfo->cached_extent.lstart != -1) {
+		brelse(iinfo->cached_extent.epos.bh);
+		iinfo->cached_extent.lstart = -1;
+	}
+}
+
+/* Invalidate extent cache */
+static void udf_clear_extent_cache(struct inode *inode)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	spin_lock(&iinfo->i_extent_cache_lock);
+	__udf_clear_extent_cache(inode);
+	spin_unlock(&iinfo->i_extent_cache_lock);
+}
+
+/* Return contents of extent cache */
+static int udf_read_extent_cache(struct inode *inode, loff_t bcount,
+				 loff_t *lbcount, struct extent_position *pos)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	int ret = 0;
+
+	spin_lock(&iinfo->i_extent_cache_lock);
+	if ((iinfo->cached_extent.lstart <= bcount) &&
+	    (iinfo->cached_extent.lstart != -1)) {
+		/* Cache hit */
+		*lbcount = iinfo->cached_extent.lstart;
+		memcpy(pos, &iinfo->cached_extent.epos,
+		       sizeof(struct extent_position));
+		if (pos->bh)
+			get_bh(pos->bh);
+		ret = 1;
+	}
+	spin_unlock(&iinfo->i_extent_cache_lock);
+	return ret;
+}
+
+/* Add extent to extent cache */
+static void udf_update_extent_cache(struct inode *inode, loff_t estart,
+				    struct extent_position *pos, int next_epos)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+
+	spin_lock(&iinfo->i_extent_cache_lock);
+	/* Invalidate previously cached extent */
+	__udf_clear_extent_cache(inode);
+	if (pos->bh)
+		get_bh(pos->bh);
+	memcpy(&iinfo->cached_extent.epos, pos,
+	       sizeof(struct extent_position));
+	iinfo->cached_extent.lstart = estart;
+	if (next_epos)
+		switch (iinfo->i_alloc_type) {
+		case ICBTAG_FLAG_AD_SHORT:
+			iinfo->cached_extent.epos.offset -=
+			sizeof(struct short_ad);
+			break;
+		case ICBTAG_FLAG_AD_LONG:
+			iinfo->cached_extent.epos.offset -=
+			sizeof(struct long_ad);
+		}
+	spin_unlock(&iinfo->i_extent_cache_lock);
+}
 
 void udf_evict_inode(struct inode *inode)
 {
@@ -90,6 +159,7 @@ void udf_evict_inode(struct inode *inode)
 	}
 	kfree(iinfo->i_ext.i_data);
 	iinfo->i_ext.i_data = NULL;
+	udf_clear_extent_cache(inode);
 	if (want_delete) {
 		udf_free_inode(inode);
 	}
@@ -105,6 +175,7 @@ static void udf_write_failed(struct address_space *mapping, loff_t to)
 		truncate_pagecache(inode, to, isize);
 		if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
 			down_write(&iinfo->i_data_sem);
+			udf_clear_extent_cache(inode);
 			udf_truncate_extents(inode);
 			up_write(&iinfo->i_data_sem);
 		}
@@ -372,7 +443,7 @@ static int udf_get_block(struct inode *inode, sector_t block,
 		iinfo->i_next_alloc_goal++;
 	}
 
-
+	udf_clear_extent_cache(inode);
 	phys = inode_getblk(inode, block, &err, &new);
 	if (!phys)
 		goto abort;
@@ -1171,6 +1242,7 @@ set_size:
 	} else {
 		if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
 			down_write(&iinfo->i_data_sem);
+			udf_clear_extent_cache(inode);
 			memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize,
 			       0x00, bsize - newsize -
 			       udf_file_entry_alloc_offset(inode));
@@ -1184,6 +1256,7 @@ set_size:
 		if (err)
 			return err;
 		down_write(&iinfo->i_data_sem);
+		udf_clear_extent_cache(inode);
 		truncate_setsize(inode, newsize);
 		udf_truncate_extents(inode);
 		up_write(&iinfo->i_data_sem);
@@ -2156,11 +2229,12 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
 	struct udf_inode_info *iinfo;
 
 	iinfo = UDF_I(inode);
-	pos->offset = 0;
-	pos->block = iinfo->i_location;
-	pos->bh = NULL;
+	if (!udf_read_extent_cache(inode, bcount, &lbcount, pos)) {
+		pos->offset = 0;
+		pos->block = iinfo->i_location;
+		pos->bh = NULL;
+	}
 	*elen = 0;
-
 	do {
 		etype = udf_next_aext(inode, pos, eloc, elen, 1);
 		if (etype == -1) {
@@ -2170,7 +2244,8 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
 		}
 		lbcount += *elen;
 	} while (lbcount <= bcount);
-
+	/* update extent cache */
+	udf_update_extent_cache(inode, lbcount - *elen, pos, 1);
 	*offset = (bcount + *elen - lbcount) >> blocksize_bits;
 
 	return etype;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 95fee278ab9..5f6fc17d6bc 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -594,6 +594,29 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	return 0;
 }
 
+static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode;
+	struct udf_inode_info *iinfo;
+	int err;
+
+	inode = udf_new_inode(dir, mode, &err);
+	if (!inode)
+		return err;
+
+	iinfo = UDF_I(inode);
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		inode->i_data.a_ops = &udf_adinicb_aops;
+	else
+		inode->i_data.a_ops = &udf_aops;
+	inode->i_op = &udf_file_inode_operations;
+	inode->i_fop = &udf_file_operations;
+	mark_inode_dirty(inode);
+
+	d_tmpfile(dentry, inode);
+	return 0;
+}
+
 static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 		     dev_t rdev)
 {
@@ -1270,10 +1293,10 @@ static int udf_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
 
 	if (parent && (len < 5)) {
 		*lenp = 5;
-		return 255;
+		return FILEID_INVALID;
 	} else if (len < 3) {
 		*lenp = 3;
-		return 255;
+		return FILEID_INVALID;
 	}
 
 	*lenp = 3;
@@ -1311,6 +1334,7 @@ const struct inode_operations udf_dir_inode_operations = {
 	.rmdir				= udf_rmdir,
 	.mknod				= udf_mknod,
 	.rename				= udf_rename,
+	.tmpfile			= udf_tmpfile,
 };
 const struct inode_operations udf_symlink_inode_operations = {
 	.readlink	= generic_readlink,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e9be396a558..9ac4057a86c 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -118,6 +118,7 @@ static struct file_system_type udf_fstype = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("udf");
 
 static struct kmem_cache *udf_inode_cachep;
 
@@ -134,6 +135,8 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
 	ei->i_next_alloc_goal = 0;
 	ei->i_strat4096 = 0;
 	init_rwsem(&ei->i_data_sem);
+	ei->cached_extent.lstart = -1;
+	spin_lock_init(&ei->i_extent_cache_lock);
 
 	return &ei->vfs_inode;
 }
@@ -1021,7 +1024,6 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
 	if (bitmap == NULL)
 		return NULL;
 
-	bitmap->s_block_bitmap = (struct buffer_head **)(bitmap + 1);
 	bitmap->s_nr_groups = nr_groups;
 	return bitmap;
 }
@@ -1079,8 +1081,6 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 		if (!bitmap)
 			return 1;
 		map->s_uspace.s_bitmap = bitmap;
-		bitmap->s_extLength = le32_to_cpu(
-				phd->unallocSpaceBitmap.extLength);
 		bitmap->s_extPosition = le32_to_cpu(
 				phd->unallocSpaceBitmap.extPosition);
 		map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
@@ -1115,8 +1115,6 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 		if (!bitmap)
 			return 1;
 		map->s_fspace.s_bitmap = bitmap;
-		bitmap->s_extLength = le32_to_cpu(
-				phd->freedSpaceBitmap.extLength);
 		bitmap->s_extPosition = le32_to_cpu(
 				phd->freedSpaceBitmap.extPosition);
 		map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
@@ -1866,6 +1864,8 @@ static void udf_open_lvid(struct super_block *sb)
 	mark_buffer_dirty(bh);
 	sbi->s_lvid_dirty = 0;
 	mutex_unlock(&sbi->s_alloc_mutex);
+	/* Make opening of filesystem visible on the media immediately */
+	sync_dirty_buffer(bh);
 }
 
 static void udf_close_lvid(struct super_block *sb)
@@ -1906,6 +1906,8 @@ static void udf_close_lvid(struct super_block *sb)
 	mark_buffer_dirty(bh);
 	sbi->s_lvid_dirty = 0;
 	mutex_unlock(&sbi->s_alloc_mutex);
+	/* Make closing of filesystem visible on the media immediately */
+	sync_dirty_buffer(bh);
 }
 
 u64 lvid_get_unique_id(struct super_block *sb)
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index bb8309dcd5c..b5cd8ed2aa1 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -1,6 +1,19 @@
 #ifndef _UDF_I_H
 #define _UDF_I_H
 
+struct extent_position {
+	struct buffer_head *bh;
+	uint32_t offset;
+	struct kernel_lb_addr block;
+};
+
+struct udf_ext_cache {
+	/* Extent position */
+	struct extent_position epos;
+	/* Start logical offset in bytes */
+	loff_t lstart;
+};
+
 /*
  * The i_data_sem and i_mutex serve for protection of allocation information
  * of a regular files and symlinks. This includes all extents belonging to
@@ -35,6 +48,9 @@ struct udf_inode_info {
 		__u8		*i_data;
 	} i_ext;
 	struct rw_semaphore	i_data_sem;
+	struct udf_ext_cache cached_extent;
+	/* Spinlock for protecting extent cache */
+	spinlock_t i_extent_cache_lock;
 	struct inode vfs_inode;
 };
 
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 5f027227f08..ed401e94aa8 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -80,10 +80,9 @@ struct udf_virtual_data {
 };
 
 struct udf_bitmap {
-	__u32			s_extLength;
 	__u32			s_extPosition;
-	__u16			s_nr_groups;
-	struct buffer_head 	**s_block_bitmap;
+	int			s_nr_groups;
+	struct buffer_head 	*s_block_bitmap[0];
 };
 
 struct udf_part_map {
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index de038da6f6b..be7dabbbcb4 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -113,11 +113,6 @@ struct ustr {
 	uint8_t u_len;
 };
 
-struct extent_position {
-	struct buffer_head *bh;
-	uint32_t offset;
-	struct kernel_lb_addr block;
-};
 
 /* super.c */
 
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index e4f10a40768..0bf6e16f8d7 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -29,7 +29,7 @@ config UFS_FS
 
 config UFS_FS_WRITE
 	bool "UFS file system write support (DANGEROUS)"
-	depends on UFS_FS && EXPERIMENTAL
+	depends on UFS_FS
 	help
 	  Say Y here if you want to try writing to UFS partitions. This is
 	  experimental, so you should back up your UFS partitions beforehand.
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index dbc90994715..0ecc2cebed8 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -430,16 +430,16 @@ ufs_validate_entry(struct super_block *sb, char *base,
  * This is blatantly stolen from ext2fs
  */
 static int
-ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+ufs_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = ufs_dir_pages(inode);
 	unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
-	int need_revalidate = filp->f_version != inode->i_version;
+	int need_revalidate = file->f_version != inode->i_version;
 	unsigned flags = UFS_SB(sb)->s_flags;
 
 	UFSD("BEGIN\n");
@@ -457,16 +457,16 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			ufs_error(sb, __func__,
 				  "bad page in #%lu",
 				  inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
 			return -EIO;
 		}
 		kaddr = page_address(page);
 		if (unlikely(need_revalidate)) {
 			if (offset) {
 				offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
-				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
 			}
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 			need_revalidate = 0;
 		}
 		de = (struct ufs_dir_entry *)(kaddr+offset);
@@ -479,11 +479,8 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				return -EIO;
 			}
 			if (de->d_ino) {
-				int over;
 				unsigned char d_type = DT_UNKNOWN;
 
-				offset = (char *)de - kaddr;
-
 				UFSD("filldir(%s,%u)\n", de->d_name,
 				      fs32_to_cpu(sb, de->d_ino));
 				UFSD("namlen %u\n", ufs_get_de_namlen(sb, de));
@@ -491,16 +488,15 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
 					d_type = de->d_u.d_44.d_type;
 
-				over = filldir(dirent, de->d_name,
+				if (!dir_emit(ctx, de->d_name,
 					       ufs_get_de_namlen(sb, de),
-						(n<<PAGE_CACHE_SHIFT) | offset,
-					       fs32_to_cpu(sb, de->d_ino), d_type);
-				if (over) {
+					       fs32_to_cpu(sb, de->d_ino),
+					       d_type)) {
 					ufs_put_page(page);
 					return 0;
 				}
 			}
-			filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
+			ctx->pos += fs16_to_cpu(sb, de->d_reclen);
 		}
 		ufs_put_page(page);
 	}
@@ -660,7 +656,7 @@ not_empty:
 
 const struct file_operations ufs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= ufs_readdir,
+	.iterate	= ufs_readdir,
 	.fsync		= generic_file_fsync,
 	.llseek		= generic_file_llseek,
 };
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index dc8e3a861d0..329f2f53b7e 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1500,6 +1500,7 @@ static struct file_system_type ufs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("ufs");
 
 static int __init init_ufs_fs(void)
 {
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 95425b59ce0..b6c2f94e041 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -26,8 +26,7 @@ struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
 	count = size >> uspi->s_fshift;
 	if (count > UFS_MAXFRAG)
 		return NULL;
-	ubh = (struct ufs_buffer_head *)
-		kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS);
+	ubh = kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS);
 	if (!ubh)
 		return NULL;
 	ubh->fragment = fragment;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 5a7ffe54f5d..399e8cec6e6 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -69,9 +69,22 @@ config XFS_RT
 
 	  If unsure, say N.
 
+config XFS_WARN
+	bool "XFS Verbose Warnings"
+	depends on XFS_FS && !XFS_DEBUG
+	help
+	  Say Y here to get an XFS build with many additional warnings.
+	  It converts ASSERT checks to WARN, so will log any out-of-bounds
+	  conditions that occur that would otherwise be missed. It is much
+	  lighter weight than XFS_DEBUG and does not modify algorithms and will
+	  not cause the kernel to panic on non-fatal errors.
+
+	  However, similar to XFS_DEBUG, it is only advisable to use this if you
+	  are debugging a particular problem.
+
 config XFS_DEBUG
-	bool "XFS Debugging support (EXPERIMENTAL)"
-	depends on XFS_FS && EXPERIMENTAL
+	bool "XFS Debugging support"
+	depends on XFS_FS
 	help
 	  Say Y here to get an XFS build with many debugging features,
 	  including ASSERT checks, function wrappers around macros,
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d02201df855..4a4508023a3 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -45,11 +45,11 @@ xfs-y				+= xfs_aops.o \
 				   xfs_itable.o \
 				   xfs_message.o \
 				   xfs_mru_cache.o \
-				   xfs_super.o \
-				   xfs_xattr.o \
 				   xfs_rename.o \
+				   xfs_super.o \
 				   xfs_utils.o \
 				   xfs_vnodeops.o \
+				   xfs_xattr.o \
 				   kmem.o \
 				   uuid.o
 
@@ -58,6 +58,7 @@ xfs-y				+= xfs_alloc.o \
 				   xfs_alloc_btree.o \
 				   xfs_attr.o \
 				   xfs_attr_leaf.o \
+				   xfs_attr_remote.o \
 				   xfs_bmap.o \
 				   xfs_bmap_btree.o \
 				   xfs_btree.o \
@@ -70,9 +71,11 @@ xfs-y				+= xfs_alloc.o \
 				   xfs_dir2_sf.o \
 				   xfs_ialloc.o \
 				   xfs_ialloc_btree.o \
+				   xfs_icreate_item.o \
 				   xfs_inode.o \
 				   xfs_log_recover.o \
 				   xfs_mount.o \
+				   xfs_symlink.o \
 				   xfs_trans.o
 
 # low-level transaction/log code
diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h
index ff6a19873e5..e3c92d19e54 100644
--- a/fs/xfs/mrlock.h
+++ b/fs/xfs/mrlock.h
@@ -22,12 +22,12 @@
 
 typedef struct {
 	struct rw_semaphore	mr_lock;
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 	int			mr_writer;
 #endif
 } mrlock_t;
 
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 #define mrinit(mrp, name)	\
 	do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
 #else
@@ -46,7 +46,7 @@ static inline void mraccess_nested(mrlock_t *mrp, int subclass)
 static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
 {
 	down_write_nested(&mrp->mr_lock, subclass);
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 	mrp->mr_writer = 1;
 #endif
 }
@@ -60,7 +60,7 @@ static inline int mrtryupdate(mrlock_t *mrp)
 {
 	if (!down_write_trylock(&mrp->mr_lock))
 		return 0;
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 	mrp->mr_writer = 1;
 #endif
 	return 1;
@@ -68,7 +68,7 @@ static inline int mrtryupdate(mrlock_t *mrp)
 
 static inline void mrunlock_excl(mrlock_t *mrp)
 {
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 	mrp->mr_writer = 0;
 #endif
 	up_write(&mrp->mr_lock);
@@ -81,7 +81,7 @@ static inline void mrunlock_shared(mrlock_t *mrp)
 
 static inline void mrdemote(mrlock_t *mrp)
 {
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 	mrp->mr_writer = 0;
 #endif
 	downgrade_write(&mrp->mr_lock);
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index d8b11b7f94a..a742c47f7d5 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -24,6 +24,11 @@
 #define XFS_BUF_LOCK_TRACKING 1
 #endif
 
+#ifdef CONFIG_XFS_WARN
+#define XFS_WARN 1
+#endif
+
+
 #include "xfs_linux.h"
 
 #endif	/* __XFS_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 1d32f1d5276..306d883d89b 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -21,6 +21,8 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_vnodeops.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
 #include "xfs_trace.h"
 #include <linux/slab.h>
 #include <linux/xattr.h>
@@ -34,7 +36,9 @@
  */
 
 STATIC struct posix_acl *
-xfs_acl_from_disk(struct xfs_acl *aclp)
+xfs_acl_from_disk(
+	struct xfs_acl	*aclp,
+	int		max_entries)
 {
 	struct posix_acl_entry *acl_e;
 	struct posix_acl *acl;
@@ -42,7 +46,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp)
 	unsigned int count, i;
 
 	count = be32_to_cpu(aclp->acl_cnt);
-	if (count > XFS_ACL_MAX_ENTRIES)
+	if (count > max_entries)
 		return ERR_PTR(-EFSCORRUPTED);
 
 	acl = posix_acl_alloc(count, GFP_KERNEL);
@@ -108,9 +112,9 @@ xfs_get_acl(struct inode *inode, int type)
 	struct xfs_inode *ip = XFS_I(inode);
 	struct posix_acl *acl;
 	struct xfs_acl *xfs_acl;
-	int len = sizeof(struct xfs_acl);
 	unsigned char *ea_name;
 	int error;
+	int len;
 
 	acl = get_cached_acl(inode, type);
 	if (acl != ACL_NOT_CACHED)
@@ -133,8 +137,8 @@ xfs_get_acl(struct inode *inode, int type)
 	 * If we have a cached ACLs value just return it, not need to
 	 * go out to the disk.
 	 */
-
-	xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+	len = XFS_ACL_MAX_SIZE(ip->i_mount);
+	xfs_acl = kzalloc(len, GFP_KERNEL);
 	if (!xfs_acl)
 		return ERR_PTR(-ENOMEM);
 
@@ -153,7 +157,7 @@ xfs_get_acl(struct inode *inode, int type)
 		goto out;
 	}
 
-	acl = xfs_acl_from_disk(xfs_acl);
+	acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
 	if (IS_ERR(acl))
 		goto out;
 
@@ -189,16 +193,17 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 
 	if (acl) {
 		struct xfs_acl *xfs_acl;
-		int len;
+		int len = XFS_ACL_MAX_SIZE(ip->i_mount);
 
-		xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+		xfs_acl = kzalloc(len, GFP_KERNEL);
 		if (!xfs_acl)
 			return -ENOMEM;
 
 		xfs_acl_to_disk(xfs_acl, acl);
-		len = sizeof(struct xfs_acl) -
-			(sizeof(struct xfs_acl_entry) *
-			 (XFS_ACL_MAX_ENTRIES - acl->a_count));
+
+		/* subtract away the unused acl entries */
+		len -= sizeof(struct xfs_acl_entry) *
+			 (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
 
 		error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
 				len, ATTR_ROOT);
@@ -243,7 +248,7 @@ xfs_set_mode(struct inode *inode, umode_t mode)
 static int
 xfs_acl_exists(struct inode *inode, unsigned char *name)
 {
-	int len = sizeof(struct xfs_acl);
+	int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
 
 	return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
 			    ATTR_ROOT|ATTR_KERNOVAL) == 0);
@@ -379,7 +384,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
 		goto out_release;
 
 	error = -EINVAL;
-	if (acl->a_count > XFS_ACL_MAX_ENTRIES)
+	if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
 		goto out_release;
 
 	if (type == ACL_TYPE_ACCESS) {
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 39632d94135..4016a567b83 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -22,19 +22,36 @@ struct inode;
 struct posix_acl;
 struct xfs_inode;
 
-#define XFS_ACL_MAX_ENTRIES 25
 #define XFS_ACL_NOT_PRESENT (-1)
 
 /* On-disk XFS access control list structure */
+struct xfs_acl_entry {
+	__be32	ae_tag;
+	__be32	ae_id;
+	__be16	ae_perm;
+	__be16	ae_pad;		/* fill the implicit hole in the structure */
+};
+
 struct xfs_acl {
-	__be32		acl_cnt;
-	struct xfs_acl_entry {
-		__be32	ae_tag;
-		__be32	ae_id;
-		__be16	ae_perm;
-	} acl_entry[XFS_ACL_MAX_ENTRIES];
+	__be32			acl_cnt;
+	struct xfs_acl_entry	acl_entry[0];
 };
 
+/*
+ * The number of ACL entries allowed is defined by the on-disk format.
+ * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
+ * limited only by the maximum size of the xattr that stores the information.
+ */
+#define XFS_ACL_MAX_ENTRIES(mp)	\
+	(xfs_sb_version_hascrc(&mp->m_sb) \
+		?  (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+						sizeof(struct xfs_acl_entry) \
+		: 25)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+	(sizeof(struct xfs_acl) + \
+		sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
+
 /* On-disk XFS extended attribute names */
 #define SGI_ACL_FILE		(unsigned char *)"SGI_ACL_FILE"
 #define SGI_ACL_DEFAULT		(unsigned char *)"SGI_ACL_DEFAULT"
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f2aeedb6a57..317aa86d96e 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -30,6 +30,7 @@ struct xfs_trans;
 
 #define	XFS_AGF_MAGIC	0x58414746	/* 'XAGF' */
 #define	XFS_AGI_MAGIC	0x58414749	/* 'XAGI' */
+#define	XFS_AGFL_MAGIC	0x5841464c	/* 'XAFL' */
 #define	XFS_AGF_VERSION	1
 #define	XFS_AGI_VERSION	1
 
@@ -63,12 +64,29 @@ typedef struct xfs_agf {
 	__be32		agf_spare0;	/* spare field */
 	__be32		agf_levels[XFS_BTNUM_AGF];	/* btree levels */
 	__be32		agf_spare1;	/* spare field */
+
 	__be32		agf_flfirst;	/* first freelist block's index */
 	__be32		agf_fllast;	/* last freelist block's index */
 	__be32		agf_flcount;	/* count of blocks in freelist */
 	__be32		agf_freeblks;	/* total free blocks */
+
 	__be32		agf_longest;	/* longest free space */
 	__be32		agf_btreeblks;	/* # of blocks held in AGF btrees */
+	uuid_t		agf_uuid;	/* uuid of filesystem */
+
+	/*
+	 * reserve some contiguous space for future logged fields before we add
+	 * the unlogged fields. This makes the range logging via flags and
+	 * structure offsets much simpler.
+	 */
+	__be64		agf_spare64[16];
+
+	/* unlogged fields, written during buffer writeback. */
+	__be64		agf_lsn;	/* last write sequence */
+	__be32		agf_crc;	/* crc of agf sector */
+	__be32		agf_spare2;
+
+	/* structure must be padded to 64 bit alignment */
 } xfs_agf_t;
 
 #define	XFS_AGF_MAGICNUM	0x00000001
@@ -83,7 +101,8 @@ typedef struct xfs_agf {
 #define	XFS_AGF_FREEBLKS	0x00000200
 #define	XFS_AGF_LONGEST		0x00000400
 #define	XFS_AGF_BTREEBLKS	0x00000800
-#define	XFS_AGF_NUM_BITS	12
+#define	XFS_AGF_UUID		0x00001000
+#define	XFS_AGF_NUM_BITS	13
 #define	XFS_AGF_ALL_BITS	((1 << XFS_AGF_NUM_BITS) - 1)
 
 #define XFS_AGF_FLAGS \
@@ -98,7 +117,8 @@ typedef struct xfs_agf {
 	{ XFS_AGF_FLCOUNT,	"FLCOUNT" }, \
 	{ XFS_AGF_FREEBLKS,	"FREEBLKS" }, \
 	{ XFS_AGF_LONGEST,	"LONGEST" }, \
-	{ XFS_AGF_BTREEBLKS,	"BTREEBLKS" }
+	{ XFS_AGF_BTREEBLKS,	"BTREEBLKS" }, \
+	{ XFS_AGF_UUID,		"UUID" }
 
 /* disk block (xfs_daddr_t) in the AG */
 #define XFS_AGF_DADDR(mp)	((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
@@ -132,6 +152,7 @@ typedef struct xfs_agi {
 	__be32		agi_root;	/* root of inode btree */
 	__be32		agi_level;	/* levels in inode btree */
 	__be32		agi_freecount;	/* number of free inodes */
+
 	__be32		agi_newino;	/* new inode just allocated */
 	__be32		agi_dirino;	/* last directory inode chunk */
 	/*
@@ -139,6 +160,13 @@ typedef struct xfs_agi {
 	 * still being referenced.
 	 */
 	__be32		agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+
+	uuid_t		agi_uuid;	/* uuid of filesystem */
+	__be32		agi_crc;	/* crc of agi sector */
+	__be32		agi_pad32;
+	__be64		agi_lsn;	/* last write sequence */
+
+	/* structure must be padded to 64 bit alignment */
 } xfs_agi_t;
 
 #define	XFS_AGI_MAGICNUM	0x00000001
@@ -171,11 +199,31 @@ extern const struct xfs_buf_ops xfs_agi_buf_ops;
  */
 #define XFS_AGFL_DADDR(mp)	((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
 #define	XFS_AGFL_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
-#define XFS_AGFL_SIZE(mp)	((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t))
 #define	XFS_BUF_TO_AGFL(bp)	((xfs_agfl_t *)((bp)->b_addr))
 
+#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
+	(xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+		&(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
+		(__be32 *)(bp)->b_addr)
+
+/*
+ * Size of the AGFL.  For CRC-enabled filesystes we steal a couple of
+ * slots in the beginning of the block for a proper header with the
+ * location information and CRC.
+ */
+#define XFS_AGFL_SIZE(mp) \
+	(((mp)->m_sb.sb_sectsize - \
+	 (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+		sizeof(struct xfs_agfl) : 0)) / \
+	  sizeof(xfs_agblock_t))
+
 typedef struct xfs_agfl {
-	__be32		agfl_bno[1];	/* actually XFS_AGFL_SIZE(mp) */
+	__be32		agfl_magicnum;
+	__be32		agfl_seqno;
+	uuid_t		agfl_uuid;
+	__be64		agfl_lsn;
+	__be32		agfl_crc;
+	__be32		agfl_bno[];	/* actually XFS_AGFL_SIZE(mp) */
 } xfs_agfl_t;
 
 /*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 393055fe3ae..71596e57283 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -33,7 +33,9 @@
 #include "xfs_alloc.h"
 #include "xfs_extent_busy.h"
 #include "xfs_error.h"
+#include "xfs_cksum.h"
 #include "xfs_trace.h"
+#include "xfs_buf_item.h"
 
 struct workqueue_struct *xfs_alloc_wq;
 
@@ -173,6 +175,7 @@ xfs_alloc_compute_diff(
 	xfs_agblock_t	wantbno,	/* target starting block */
 	xfs_extlen_t	wantlen,	/* target length */
 	xfs_extlen_t	alignment,	/* target alignment */
+	char		userdata,	/* are we allocating data? */
 	xfs_agblock_t	freebno,	/* freespace's starting block */
 	xfs_extlen_t	freelen,	/* freespace's length */
 	xfs_agblock_t	*newbnop)	/* result: best start block from free */
@@ -187,7 +190,14 @@ xfs_alloc_compute_diff(
 	ASSERT(freelen >= wantlen);
 	freeend = freebno + freelen;
 	wantend = wantbno + wantlen;
-	if (freebno >= wantbno) {
+	/*
+	 * We want to allocate from the start of a free extent if it is past
+	 * the desired block or if we are allocating user data and the free
+	 * extent is before desired block. The second case is there to allow
+	 * for contiguous allocation from the remaining free space if the file
+	 * grows in the short term.
+	 */
+	if (freebno >= wantbno || (userdata && freeend < wantend)) {
 		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
 			newbno1 = NULLAGBLOCK;
 	} else if (freeend >= wantend && alignment > 1) {
@@ -430,53 +440,84 @@ xfs_alloc_fixup_trees(
 	return 0;
 }
 
-static void
+static bool
 xfs_agfl_verify(
 	struct xfs_buf	*bp)
 {
-#ifdef WHEN_CRCS_COME_ALONG
-	/*
-	 * we cannot actually do any verification of the AGFL because mkfs does
-	 * not initialise the AGFL to zero or NULL. Hence the only valid part of
-	 * the AGFL is what the AGF says is active. We can't get to the AGF, so
-	 * we can't verify just those entries are valid.
-	 *
-	 * This problem goes away when the CRC format change comes along as that
-	 * requires the AGFL to be initialised by mkfs. At that point, we can
-	 * verify the blocks in the agfl -active or not- lie within the bounds
-	 * of the AG. Until then, just leave this check ifdef'd out.
-	 */
 	struct xfs_mount *mp = bp->b_target->bt_mount;
 	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
-	int		agfl_ok = 1;
-
 	int		i;
 
+	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+		return false;
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+		return false;
+
 	for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
-		if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK ||
+		if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
 		    be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
-			agfl_ok = 0;
+			return false;
 	}
+	return true;
+}
+
+static void
+xfs_agfl_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	int		agfl_ok = 1;
+
+	/*
+	 * There is no verification of non-crc AGFLs because mkfs does not
+	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
+	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
+	 * can't verify just those entries are valid.
+	 */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	agfl_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+				   offsetof(struct xfs_agfl, agfl_crc));
+
+	agfl_ok = agfl_ok && xfs_agfl_verify(bp);
 
 	if (!agfl_ok) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl);
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
 	}
-#endif
 }
 
 static void
 xfs_agfl_write_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_agfl_verify(bp);
-}
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
 
-static void
-xfs_agfl_read_verify(
-	struct xfs_buf	*bp)
-{
-	xfs_agfl_verify(bp);
+	/* no verification of non-crc AGFLs */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_agfl_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
+
+	if (bip)
+		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+			 offsetof(struct xfs_agfl, agfl_crc));
 }
 
 const struct xfs_buf_ops xfs_agfl_buf_ops = {
@@ -772,7 +813,8 @@ xfs_alloc_find_best_extent(
 			xfs_alloc_fix_len(args);
 
 			sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-						       args->alignment, *sbnoa,
+						       args->alignment,
+						       args->userdata, *sbnoa,
 						       *slena, &new);
 
 			/*
@@ -842,7 +884,7 @@ xfs_alloc_ag_vextent_near(
 	 */
 	int		dofirst;	/* set to do first algorithm */
 
-	dofirst = random32() & 1;
+	dofirst = prandom_u32() & 1;
 #endif
 
 restart:
@@ -943,7 +985,8 @@ restart:
 			if (args->len < blen)
 				continue;
 			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, ltbnoa, ltlena, &ltnew);
+				args->alignment, args->userdata, ltbnoa,
+				ltlena, &ltnew);
 			if (ltnew != NULLAGBLOCK &&
 			    (args->len > blen || ltdiff < bdiff)) {
 				bdiff = ltdiff;
@@ -1095,7 +1138,8 @@ restart:
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 			xfs_alloc_fix_len(args);
 			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, ltbnoa, ltlena, &ltnew);
+				args->alignment, args->userdata, ltbnoa,
+				ltlena, &ltnew);
 
 			error = xfs_alloc_find_best_extent(args,
 						&bno_cur_lt, &bno_cur_gt,
@@ -1111,7 +1155,8 @@ restart:
 			args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
 			xfs_alloc_fix_len(args);
 			gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, gtbnoa, gtlena, &gtnew);
+				args->alignment, args->userdata, gtbnoa,
+				gtlena, &gtnew);
 
 			error = xfs_alloc_find_best_extent(args,
 						&bno_cur_gt, &bno_cur_lt,
@@ -1170,7 +1215,7 @@ restart:
 	}
 	rlen = args->len;
 	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
-				     ltbnoa, ltlena, &ltnew);
+				     args->userdata, ltbnoa, ltlena, &ltnew);
 	ASSERT(ltnew >= ltbno);
 	ASSERT(ltnew + rlen <= ltbnoa + ltlena);
 	ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
@@ -1925,8 +1970,6 @@ xfs_alloc_fix_freelist(
 	targs.mp = mp;
 	targs.agbp = agbp;
 	targs.agno = args->agno;
-	targs.mod = targs.minleft = targs.wasdel = targs.userdata =
-		targs.minalignslop = 0;
 	targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
 	targs.type = XFS_ALLOCTYPE_THIS_AG;
 	targs.pag = pag;
@@ -1984,18 +2027,18 @@ xfs_alloc_get_freelist(
 	int		btreeblk) /* destination is a AGF btree */
 {
 	xfs_agf_t	*agf;	/* a.g. freespace structure */
-	xfs_agfl_t	*agfl;	/* a.g. freelist structure */
 	xfs_buf_t	*agflbp;/* buffer for a.g. freelist structure */
 	xfs_agblock_t	bno;	/* block number returned */
+	__be32		*agfl_bno;
 	int		error;
 	int		logflags;
-	xfs_mount_t	*mp;	/* mount structure */
+	xfs_mount_t	*mp = tp->t_mountp;
 	xfs_perag_t	*pag;	/* per allocation group data */
 
-	agf = XFS_BUF_TO_AGF(agbp);
 	/*
 	 * Freelist is empty, give up.
 	 */
+	agf = XFS_BUF_TO_AGF(agbp);
 	if (!agf->agf_flcount) {
 		*bnop = NULLAGBLOCK;
 		return 0;
@@ -2003,15 +2046,17 @@ xfs_alloc_get_freelist(
 	/*
 	 * Read the array of free blocks.
 	 */
-	mp = tp->t_mountp;
-	if ((error = xfs_alloc_read_agfl(mp, tp,
-			be32_to_cpu(agf->agf_seqno), &agflbp)))
+	error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
+				    &agflbp);
+	if (error)
 		return error;
-	agfl = XFS_BUF_TO_AGFL(agflbp);
+
+
 	/*
 	 * Get the block number and update the data structures.
 	 */
-	bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
 	be32_add_cpu(&agf->agf_flfirst, 1);
 	xfs_trans_brelse(tp, agflbp);
 	if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
@@ -2060,11 +2105,14 @@ xfs_alloc_log_agf(
 		offsetof(xfs_agf_t, agf_freeblks),
 		offsetof(xfs_agf_t, agf_longest),
 		offsetof(xfs_agf_t, agf_btreeblks),
+		offsetof(xfs_agf_t, agf_uuid),
 		sizeof(xfs_agf_t)
 	};
 
 	trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
 
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
+
 	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
 	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
 }
@@ -2101,12 +2149,13 @@ xfs_alloc_put_freelist(
 	int			btreeblk) /* block came from a AGF btree */
 {
 	xfs_agf_t		*agf;	/* a.g. freespace structure */
-	xfs_agfl_t		*agfl;	/* a.g. free block array */
 	__be32			*blockp;/* pointer to array entry */
 	int			error;
 	int			logflags;
 	xfs_mount_t		*mp;	/* mount structure */
 	xfs_perag_t		*pag;	/* per allocation group data */
+	__be32			*agfl_bno;
+	int			startoff;
 
 	agf = XFS_BUF_TO_AGF(agbp);
 	mp = tp->t_mountp;
@@ -2114,7 +2163,6 @@ xfs_alloc_put_freelist(
 	if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
 			be32_to_cpu(agf->agf_seqno), &agflbp)))
 		return error;
-	agfl = XFS_BUF_TO_AGFL(agflbp);
 	be32_add_cpu(&agf->agf_fllast, 1);
 	if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
 		agf->agf_fllast = 0;
@@ -2135,32 +2183,38 @@ xfs_alloc_put_freelist(
 	xfs_alloc_log_agf(tp, agbp, logflags);
 
 	ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
-	blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)];
+
+	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
 	*blockp = cpu_to_be32(bno);
+	startoff = (char *)blockp - (char *)agflbp->b_addr;
+
 	xfs_alloc_log_agf(tp, agbp, logflags);
-	xfs_trans_log_buf(tp, agflbp,
-		(int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
-		(int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl +
-			sizeof(xfs_agblock_t) - 1));
+
+	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
+	xfs_trans_log_buf(tp, agflbp, startoff,
+			  startoff + sizeof(xfs_agblock_t) - 1);
 	return 0;
 }
 
-static void
+static bool
 xfs_agf_verify(
+	struct xfs_mount *mp,
 	struct xfs_buf	*bp)
  {
-	struct xfs_mount *mp = bp->b_target->bt_mount;
-	struct xfs_agf	*agf;
-	int		agf_ok;
+	struct xfs_agf	*agf = XFS_BUF_TO_AGF(bp);
 
-	agf = XFS_BUF_TO_AGF(bp);
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
+			return false;
 
-	agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
-		XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
-		be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
-		be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
-		be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
-		be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
+	if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+	      XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+	      be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+	      be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+	      be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+	      be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
+		return false;
 
 	/*
 	 * during growfs operations, the perag is not fully initialised,
@@ -2168,33 +2222,58 @@ xfs_agf_verify(
 	 * use it by using uncached buffers that don't have the perag attached
 	 * so we can detect and avoid this problem.
 	 */
-	if (bp->b_pag)
-		agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) ==
-						bp->b_pag->pag_agno;
+	if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
+		return false;
 
-	if (xfs_sb_version_haslazysbcount(&mp->m_sb))
-		agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
-						be32_to_cpu(agf->agf_length);
+	if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
+	    be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
+		return false;
+
+	return true;;
 
-	if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
-			XFS_RANDOM_ALLOC_READ_AGF))) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
 }
 
 static void
 xfs_agf_read_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_agf_verify(bp);
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	int		agf_ok = 1;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		agf_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+					  offsetof(struct xfs_agf, agf_crc));
+
+	agf_ok = agf_ok && xfs_agf_verify(mp, bp);
+
+	if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
+			XFS_RANDOM_ALLOC_READ_AGF))) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
 }
 
 static void
 xfs_agf_write_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_agf_verify(bp);
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!xfs_agf_verify(mp, bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+			 offsetof(struct xfs_agf, agf_crc));
 }
 
 const struct xfs_buf_ops xfs_agf_buf_ops = {
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b1ddef6b268..cafc90251d1 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -33,6 +33,7 @@
 #include "xfs_extent_busy.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
+#include "xfs_cksum.h"
 
 
 STATIC struct xfs_btree_cur *
@@ -272,7 +273,7 @@ xfs_allocbt_key_diff(
 	return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
 }
 
-static void
+static bool
 xfs_allocbt_verify(
 	struct xfs_buf		*bp)
 {
@@ -280,66 +281,103 @@ xfs_allocbt_verify(
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 	struct xfs_perag	*pag = bp->b_pag;
 	unsigned int		level;
-	int			sblock_ok; /* block passes checks */
 
 	/*
 	 * magic number and level verification
 	 *
-	 * During growfs operations, we can't verify the exact level as the
-	 * perag is not fully initialised and hence not attached to the buffer.
-	 * In this case, check against the maximum tree depth.
+	 * During growfs operations, we can't verify the exact level or owner as
+	 * the perag is not fully initialised and hence not attached to the
+	 * buffer.  In this case, check against the maximum tree depth.
+	 *
+	 * Similarly, during log recovery we will have a perag structure
+	 * attached, but the agf information will not yet have been initialised
+	 * from the on disk AGF. Again, we can only check against maximum limits
+	 * in this case.
 	 */
 	level = be16_to_cpu(block->bb_level);
 	switch (block->bb_magic) {
+	case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			return false;
+		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+			return false;
+		if (pag &&
+		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+			return false;
+		/* fall through */
 	case cpu_to_be32(XFS_ABTB_MAGIC):
-		if (pag)
-			sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi];
-		else
-			sblock_ok = level < mp->m_ag_maxlevels;
+		if (pag && pag->pagf_init) {
+			if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
+				return false;
+		} else if (level >= mp->m_ag_maxlevels)
+			return false;
 		break;
+	case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			return false;
+		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+			return false;
+		if (pag &&
+		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+			return false;
+		/* fall through */
 	case cpu_to_be32(XFS_ABTC_MAGIC):
-		if (pag)
-			sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi];
-		else
-			sblock_ok = level < mp->m_ag_maxlevels;
+		if (pag && pag->pagf_init) {
+			if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
+				return false;
+		} else if (level >= mp->m_ag_maxlevels)
+			return false;
 		break;
 	default:
-		sblock_ok = 0;
-		break;
+		return false;
 	}
 
 	/* numrecs verification */
-	sblock_ok = sblock_ok &&
-		be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0];
+	if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
+		return false;
 
 	/* sibling pointer verification */
-	sblock_ok = sblock_ok &&
-		(block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
-		 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
-		block->bb_u.s.bb_leftsib &&
-		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
-		 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
-		block->bb_u.s.bb_rightsib;
-
-	if (!sblock_ok) {
-		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
+	if (!block->bb_u.s.bb_leftsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+	if (!block->bb_u.s.bb_rightsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+
+	return true;
 }
 
 static void
 xfs_allocbt_read_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_allocbt_verify(bp);
+	if (!(xfs_btree_sblock_verify_crc(bp) &&
+	      xfs_allocbt_verify(bp))) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+				     bp->b_target->bt_mount, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
 }
 
 static void
 xfs_allocbt_write_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_allocbt_verify(bp);
+	if (!xfs_allocbt_verify(bp)) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+				     bp->b_target->bt_mount, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+	xfs_btree_sblock_calc_crc(bp);
+
 }
 
 const struct xfs_buf_ops xfs_allocbt_buf_ops = {
@@ -348,7 +386,7 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = {
 };
 
 
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 STATIC int
 xfs_allocbt_keys_inorder(
 	struct xfs_btree_cur	*cur,
@@ -404,7 +442,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
 	.init_ptr_from_cur	= xfs_allocbt_init_ptr_from_cur,
 	.key_diff		= xfs_allocbt_key_diff,
 	.buf_ops		= &xfs_allocbt_buf_ops,
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 	.keys_inorder		= xfs_allocbt_keys_inorder,
 	.recs_inorder		= xfs_allocbt_recs_inorder,
 #endif
@@ -444,6 +482,9 @@ xfs_allocbt_init_cursor(
 	cur->bc_private.a.agbp = agbp;
 	cur->bc_private.a.agno = agno;
 
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
 	return cur;
 }
 
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 7e89a2b429d..e3a3f742419 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -31,8 +31,10 @@ struct xfs_mount;
  * by blockcount and blockno.  All blocks look the same to make the code
  * simpler; if we have time later, we'll make the optimizations.
  */
-#define	XFS_ABTB_MAGIC	0x41425442	/* 'ABTB' for bno tree */
-#define	XFS_ABTC_MAGIC	0x41425443	/* 'ABTC' for cnt tree */
+#define	XFS_ABTB_MAGIC		0x41425442	/* 'ABTB' for bno tree */
+#define	XFS_ABTB_CRC_MAGIC	0x41423342	/* 'AB3B' */
+#define	XFS_ABTC_MAGIC		0x41425443	/* 'ABTC' for cnt tree */
+#define	XFS_ABTC_CRC_MAGIC	0x41423343	/* 'AB3C' */
 
 /*
  * Data record/key structure
@@ -59,10 +61,10 @@ typedef __be32 xfs_alloc_ptr_t;
 
 /*
  * Btree block header size depends on a superblock flag.
- *
- * (not quite yet, but soon)
  */
-#define XFS_ALLOC_BLOCK_LEN(mp)	XFS_BTREE_SBLOCK_LEN
+#define XFS_ALLOC_BLOCK_LEN(mp) \
+	(xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+		XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
 
 /*
  * Record, key, and pointer address macros for btree blocks.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 5f707e53717..596ec71da00 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,6 +31,7 @@
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 #include "xfs_bmap.h"
+#include <linux/aio.h>
 #include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/pagevec.h>
@@ -724,6 +725,25 @@ xfs_convert_page(
 			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
 			i_size_read(inode));
 
+	/*
+	 * If the current map does not span the entire page we are about to try
+	 * to write, then give up. The only way we can write a page that spans
+	 * multiple mappings in a single writeback iteration is via the
+	 * xfs_vm_writepage() function. Data integrity writeback requires the
+	 * entire page to be written in a single attempt, otherwise the part of
+	 * the page we don't write here doesn't get written as part of the data
+	 * integrity sync.
+	 *
+	 * For normal writeback, we also don't attempt to write partial pages
+	 * here as it simply means that write_cache_pages() will see it under
+	 * writeback and ignore the page until some point in the future, at
+	 * which time this will be the only page in the file that needs
+	 * writeback.  Hence for more optimal IO patterns, we should always
+	 * avoid partial page writeback due to multiple mappings on a page here.
+	 */
+	if (!xfs_imap_valid(inode, imap, end_offset))
+		goto fail_unlock_page;
+
 	len = 1 << inode->i_blkbits;
 	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
 					PAGE_CACHE_SIZE);
@@ -823,10 +843,12 @@ xfs_cluster_write(
 STATIC void
 xfs_vm_invalidatepage(
 	struct page		*page,
-	unsigned long		offset)
+	unsigned int		offset,
+	unsigned int		length)
 {
-	trace_xfs_invalidatepage(page->mapping->host, page, offset);
-	block_invalidatepage(page, offset);
+	trace_xfs_invalidatepage(page->mapping->host, page, offset,
+				 length);
+	block_invalidatepage(page, offset, length);
 }
 
 /*
@@ -890,7 +912,7 @@ next_buffer:
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out_invalidate:
-	xfs_vm_invalidatepage(page, 0);
+	xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 	return;
 }
 
@@ -920,7 +942,7 @@ xfs_vm_writepage(
 	int			count = 0;
 	int			nonblocking = 0;
 
-	trace_xfs_writepage(inode, page, 0);
+	trace_xfs_writepage(inode, page, 0, 0);
 
 	ASSERT(page_has_buffers(page));
 
@@ -953,13 +975,13 @@ xfs_vm_writepage(
 		unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
 
 		/*
-		 * Just skip the page if it is fully outside i_size, e.g. due
-		 * to a truncate operation that is in progress.
+		 * Skip the page if it is fully outside i_size, e.g. due to a
+		 * truncate operation that is in progress. We must redirty the
+		 * page so that reclaim stops reclaiming it. Otherwise
+		 * xfs_vm_releasepage() is called on it and gets confused.
 		 */
-		if (page->index >= end_index + 1 || offset_into_page == 0) {
-			unlock_page(page);
-			return 0;
-		}
+		if (page->index >= end_index + 1 || offset_into_page == 0)
+			goto redirty;
 
 		/*
 		 * The page straddles i_size.  It must be zeroed out on each
@@ -1151,7 +1173,7 @@ xfs_vm_releasepage(
 {
 	int			delalloc, unwritten;
 
-	trace_xfs_releasepage(page->mapping->host, page, 0);
+	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
 
 	xfs_count_page_state(page, &delalloc, &unwritten);
 
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index aaf472532b3..20fe3fe9d34 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -15,7 +15,6 @@
  * along with this program; if not, write the Free Software Foundation,
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
-
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_types.h"
@@ -35,6 +34,7 @@
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_trans_space.h"
@@ -74,13 +74,6 @@ STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context);
 STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
 STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
 
-/*
- * Routines to manipulate out-of-line attribute values.
- */
-STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args);
-STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
-
-#define ATTR_RMTVALUE_MAPSIZE	1	/* # of map entries at once */
 
 STATIC int
 xfs_attr_name_to_xname(
@@ -300,9 +293,12 @@ xfs_attr_set_int(
 	if (rsvd)
 		args.trans->t_flags |= XFS_TRANS_RESERVE;
 
-	if ((error = xfs_trans_reserve(args.trans, args.total,
-			XFS_ATTRSET_LOG_RES(mp, args.total), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) {
+	error = xfs_trans_reserve(args.trans, args.total,
+				  XFS_ATTRSETM_LOG_RES(mp) +
+				  XFS_ATTRSETRT_LOG_RES(mp) * args.total,
+				  0, XFS_TRANS_PERM_LOG_RES,
+				  XFS_ATTRSET_LOG_COUNT);
+	if (error) {
 		xfs_trans_cancel(args.trans, 0);
 		return(error);
 	}
@@ -817,7 +813,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
 		error = 0;
 		goto out;
 	}
-	error = xfs_attr_root_inactive(&trans, dp);
+	error = xfs_attr3_root_inactive(&trans, dp);
 	if (error)
 		goto out;
 
@@ -903,7 +899,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 	 */
 	dp = args->dp;
 	args->blkno = 0;
-	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
 	if (error)
 		return error;
 
@@ -911,14 +907,14 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 	 * Look up the given attribute in the leaf block.  Figure out if
 	 * the given flags produce an error or call for an atomic rename.
 	 */
-	retval = xfs_attr_leaf_lookup_int(bp, args);
+	retval = xfs_attr3_leaf_lookup_int(bp, args);
 	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
 		xfs_trans_brelse(args->trans, bp);
-		return(retval);
+		return retval;
 	} else if (retval == EEXIST) {
 		if (args->flags & ATTR_CREATE) {	/* pure create op */
 			xfs_trans_brelse(args->trans, bp);
-			return(retval);
+			return retval;
 		}
 
 		trace_xfs_attr_leaf_replace(args);
@@ -934,7 +930,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 	 * Add the attribute to the leaf block, transitioning to a Btree
 	 * if required.
 	 */
-	retval = xfs_attr_leaf_add(bp, args);
+	retval = xfs_attr3_leaf_add(bp, args);
 	if (retval == ENOSPC) {
 		/*
 		 * Promote the attribute list to the Btree format, then
@@ -942,7 +938,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * can manage its own transactions.
 		 */
 		xfs_bmap_init(args->flist, args->firstblock);
-		error = xfs_attr_leaf_to_node(args);
+		error = xfs_attr3_leaf_to_node(args);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
 						&committed);
@@ -1007,7 +1003,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * In a separate transaction, set the incomplete flag on the
 		 * "old" attr and clear the incomplete flag on the "new" attr.
 		 */
-		error = xfs_attr_leaf_flipflags(args);
+		error = xfs_attr3_leaf_flipflags(args);
 		if (error)
 			return(error);
 
@@ -1029,19 +1025,19 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * Read in the block containing the "old" attr, then
 		 * remove the "old" attr from that block (neat, huh!)
 		 */
-		error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno,
+		error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
 					   -1, &bp);
 		if (error)
 			return error;
 
-		xfs_attr_leaf_remove(bp, args);
+		xfs_attr3_leaf_remove(bp, args);
 
 		/*
 		 * If the result is small enough, shrink it all into the inode.
 		 */
 		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
 			xfs_bmap_init(args->flist, args->firstblock);
-			error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+			error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
 			/* bp is gone due to xfs_da_shrink_inode */
 			if (!error) {
 				error = xfs_bmap_finish(&args->trans,
@@ -1073,9 +1069,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		/*
 		 * Added a "remote" value, just clear the incomplete flag.
 		 */
-		error = xfs_attr_leaf_clearflag(args);
+		error = xfs_attr3_leaf_clearflag(args);
 	}
-	return(error);
+	return error;
 }
 
 /*
@@ -1098,24 +1094,24 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 	 */
 	dp = args->dp;
 	args->blkno = 0;
-	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
 	if (error)
 		return error;
 
-	error = xfs_attr_leaf_lookup_int(bp, args);
+	error = xfs_attr3_leaf_lookup_int(bp, args);
 	if (error == ENOATTR) {
 		xfs_trans_brelse(args->trans, bp);
-		return(error);
+		return error;
 	}
 
-	xfs_attr_leaf_remove(bp, args);
+	xfs_attr3_leaf_remove(bp, args);
 
 	/*
 	 * If the result is small enough, shrink it all into the inode.
 	 */
 	if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
 		xfs_bmap_init(args->flist, args->firstblock);
-		error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+		error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
 		/* bp is gone due to xfs_da_shrink_inode */
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1125,7 +1121,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 			ASSERT(committed);
 			args->trans = NULL;
 			xfs_bmap_cancel(args->flist);
-			return(error);
+			return error;
 		}
 
 		/*
@@ -1135,7 +1131,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 		if (committed)
 			xfs_trans_ijoin(args->trans, dp, 0);
 	}
-	return(0);
+	return 0;
 }
 
 /*
@@ -1153,21 +1149,21 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
 	trace_xfs_attr_leaf_get(args);
 
 	args->blkno = 0;
-	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
 	if (error)
 		return error;
 
-	error = xfs_attr_leaf_lookup_int(bp, args);
+	error = xfs_attr3_leaf_lookup_int(bp, args);
 	if (error != EEXIST)  {
 		xfs_trans_brelse(args->trans, bp);
-		return(error);
+		return error;
 	}
-	error = xfs_attr_leaf_getvalue(bp, args);
+	error = xfs_attr3_leaf_getvalue(bp, args);
 	xfs_trans_brelse(args->trans, bp);
 	if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
 		error = xfs_attr_rmtval_get(args);
 	}
-	return(error);
+	return error;
 }
 
 /*
@@ -1182,11 +1178,11 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
 	trace_xfs_attr_leaf_list(context);
 
 	context->cursor->blkno = 0;
-	error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp);
+	error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
 	if (error)
 		return XFS_ERROR(error);
 
-	error = xfs_attr_leaf_list_int(bp, context);
+	error = xfs_attr3_leaf_list_int(bp, context);
 	xfs_trans_brelse(NULL, bp);
 	return XFS_ERROR(error);
 }
@@ -1233,7 +1229,7 @@ restart:
 	 * Search to see if name already exists, and get back a pointer
 	 * to where it should go.
 	 */
-	error = xfs_da_node_lookup_int(state, &retval);
+	error = xfs_da3_node_lookup_int(state, &retval);
 	if (error)
 		goto out;
 	blk = &state->path.blk[ state->path.active-1 ];
@@ -1255,7 +1251,7 @@ restart:
 		args->rmtblkcnt = 0;
 	}
 
-	retval = xfs_attr_leaf_add(blk->bp, state->args);
+	retval = xfs_attr3_leaf_add(blk->bp, state->args);
 	if (retval == ENOSPC) {
 		if (state->path.active == 1) {
 			/*
@@ -1265,7 +1261,7 @@ restart:
 			 */
 			xfs_da_state_free(state);
 			xfs_bmap_init(args->flist, args->firstblock);
-			error = xfs_attr_leaf_to_node(args);
+			error = xfs_attr3_leaf_to_node(args);
 			if (!error) {
 				error = xfs_bmap_finish(&args->trans,
 							args->flist,
@@ -1304,7 +1300,7 @@ restart:
 		 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
 		 */
 		xfs_bmap_init(args->flist, args->firstblock);
-		error = xfs_da_split(state);
+		error = xfs_da3_split(state);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
 						&committed);
@@ -1326,7 +1322,7 @@ restart:
 		/*
 		 * Addition succeeded, update Btree hashvals.
 		 */
-		xfs_da_fixhashpath(state, &state->path);
+		xfs_da3_fixhashpath(state, &state->path);
 	}
 
 	/*
@@ -1367,7 +1363,7 @@ restart:
 		 * In a separate transaction, set the incomplete flag on the
 		 * "old" attr and clear the incomplete flag on the "new" attr.
 		 */
-		error = xfs_attr_leaf_flipflags(args);
+		error = xfs_attr3_leaf_flipflags(args);
 		if (error)
 			goto out;
 
@@ -1397,7 +1393,7 @@ restart:
 		state->blocksize = state->mp->m_sb.sb_blocksize;
 		state->node_ents = state->mp->m_attr_node_ents;
 		state->inleaf = 0;
-		error = xfs_da_node_lookup_int(state, &retval);
+		error = xfs_da3_node_lookup_int(state, &retval);
 		if (error)
 			goto out;
 
@@ -1406,15 +1402,15 @@ restart:
 		 */
 		blk = &state->path.blk[ state->path.active-1 ];
 		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-		error = xfs_attr_leaf_remove(blk->bp, args);
-		xfs_da_fixhashpath(state, &state->path);
+		error = xfs_attr3_leaf_remove(blk->bp, args);
+		xfs_da3_fixhashpath(state, &state->path);
 
 		/*
 		 * Check to see if the tree needs to be collapsed.
 		 */
 		if (retval && (state->path.active > 1)) {
 			xfs_bmap_init(args->flist, args->firstblock);
-			error = xfs_da_join(state);
+			error = xfs_da3_join(state);
 			if (!error) {
 				error = xfs_bmap_finish(&args->trans,
 							args->flist,
@@ -1447,7 +1443,7 @@ restart:
 		/*
 		 * Added a "remote" value, just clear the incomplete flag.
 		 */
-		error = xfs_attr_leaf_clearflag(args);
+		error = xfs_attr3_leaf_clearflag(args);
 		if (error)
 			goto out;
 	}
@@ -1492,7 +1488,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 	/*
 	 * Search to see if name exists, and get back a pointer to it.
 	 */
-	error = xfs_da_node_lookup_int(state, &retval);
+	error = xfs_da3_node_lookup_int(state, &retval);
 	if (error || (retval != EEXIST)) {
 		if (error == 0)
 			error = retval;
@@ -1521,7 +1517,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 		 * Mark the attribute as INCOMPLETE, then bunmapi() the
 		 * remote value.
 		 */
-		error = xfs_attr_leaf_setflag(args);
+		error = xfs_attr3_leaf_setflag(args);
 		if (error)
 			goto out;
 		error = xfs_attr_rmtval_remove(args);
@@ -1542,15 +1538,15 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 	 */
 	blk = &state->path.blk[ state->path.active-1 ];
 	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-	retval = xfs_attr_leaf_remove(blk->bp, args);
-	xfs_da_fixhashpath(state, &state->path);
+	retval = xfs_attr3_leaf_remove(blk->bp, args);
+	xfs_da3_fixhashpath(state, &state->path);
 
 	/*
 	 * Check to see if the tree needs to be collapsed.
 	 */
 	if (retval && (state->path.active > 1)) {
 		xfs_bmap_init(args->flist, args->firstblock);
-		error = xfs_da_join(state);
+		error = xfs_da3_join(state);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
 						&committed);
@@ -1588,13 +1584,13 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 		ASSERT(state->path.blk[0].bp);
 		state->path.blk[0].bp = NULL;
 
-		error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp);
+		error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp);
 		if (error)
 			goto out;
 
 		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
 			xfs_bmap_init(args->flist, args->firstblock);
-			error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
+			error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
 			/* bp is gone due to xfs_da_shrink_inode */
 			if (!error) {
 				error = xfs_bmap_finish(&args->trans,
@@ -1696,7 +1692,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
 	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
 	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
 		if (blk->disk_blkno) {
-			error = xfs_da_node_read(state->args->trans,
+			error = xfs_da3_node_read(state->args->trans,
 						state->args->dp,
 						blk->blkno, blk->disk_blkno,
 						&blk->bp, XFS_ATTR_FORK);
@@ -1715,7 +1711,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
 	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
 	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
 		if (blk->disk_blkno) {
-			error = xfs_da_node_read(state->args->trans,
+			error = xfs_da3_node_read(state->args->trans,
 						state->args->dp,
 						blk->blkno, blk->disk_blkno,
 						&blk->bp, XFS_ATTR_FORK);
@@ -1755,7 +1751,7 @@ xfs_attr_node_get(xfs_da_args_t *args)
 	/*
 	 * Search to see if name exists, and get back a pointer to it.
 	 */
-	error = xfs_da_node_lookup_int(state, &retval);
+	error = xfs_da3_node_lookup_int(state, &retval);
 	if (error) {
 		retval = error;
 	} else if (retval == EEXIST) {
@@ -1766,7 +1762,7 @@ xfs_attr_node_get(xfs_da_args_t *args)
 		/*
 		 * Get the value, local or "remote"
 		 */
-		retval = xfs_attr_leaf_getvalue(blk->bp, args);
+		retval = xfs_attr3_leaf_getvalue(blk->bp, args);
 		if (!retval && (args->rmtblkno > 0)
 		    && !(args->flags & ATTR_KERNOVAL)) {
 			retval = xfs_attr_rmtval_get(args);
@@ -1791,7 +1787,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	attrlist_cursor_kern_t *cursor;
 	xfs_attr_leafblock_t *leaf;
 	xfs_da_intnode_t *node;
-	xfs_da_node_entry_t *btree;
+	struct xfs_attr3_icleaf_hdr leafhdr;
+	struct xfs_da3_icnode_hdr nodehdr;
+	struct xfs_da_node_entry *btree;
 	int error, i;
 	struct xfs_buf *bp;
 
@@ -1807,27 +1805,33 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	 */
 	bp = NULL;
 	if (cursor->blkno > 0) {
-		error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1,
+		error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1,
 					      &bp, XFS_ATTR_FORK);
 		if ((error != 0) && (error != EFSCORRUPTED))
 			return(error);
 		if (bp) {
+			struct xfs_attr_leaf_entry *entries;
+
 			node = bp->b_addr;
 			switch (be16_to_cpu(node->hdr.info.magic)) {
 			case XFS_DA_NODE_MAGIC:
+			case XFS_DA3_NODE_MAGIC:
 				trace_xfs_attr_list_wrong_blk(context);
 				xfs_trans_brelse(NULL, bp);
 				bp = NULL;
 				break;
 			case XFS_ATTR_LEAF_MAGIC:
+			case XFS_ATTR3_LEAF_MAGIC:
 				leaf = bp->b_addr;
-				if (cursor->hashval > be32_to_cpu(leaf->entries[
-				    be16_to_cpu(leaf->hdr.count)-1].hashval)) {
+				xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+				entries = xfs_attr3_leaf_entryp(leaf);
+				if (cursor->hashval > be32_to_cpu(
+						entries[leafhdr.count - 1].hashval)) {
 					trace_xfs_attr_list_wrong_blk(context);
 					xfs_trans_brelse(NULL, bp);
 					bp = NULL;
-				} else if (cursor->hashval <=
-					     be32_to_cpu(leaf->entries[0].hashval)) {
+				} else if (cursor->hashval <= be32_to_cpu(
+						entries[0].hashval)) {
 					trace_xfs_attr_list_wrong_blk(context);
 					xfs_trans_brelse(NULL, bp);
 					bp = NULL;
@@ -1849,27 +1853,31 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	if (bp == NULL) {
 		cursor->blkno = 0;
 		for (;;) {
-			error = xfs_da_node_read(NULL, context->dp,
+			__uint16_t magic;
+
+			error = xfs_da3_node_read(NULL, context->dp,
 						      cursor->blkno, -1, &bp,
 						      XFS_ATTR_FORK);
 			if (error)
 				return(error);
 			node = bp->b_addr;
-			if (node->hdr.info.magic ==
-			    cpu_to_be16(XFS_ATTR_LEAF_MAGIC))
+			magic = be16_to_cpu(node->hdr.info.magic);
+			if (magic == XFS_ATTR_LEAF_MAGIC ||
+			    magic == XFS_ATTR3_LEAF_MAGIC)
 				break;
-			if (unlikely(node->hdr.info.magic !=
-				     cpu_to_be16(XFS_DA_NODE_MAGIC))) {
+			if (magic != XFS_DA_NODE_MAGIC &&
+			    magic != XFS_DA3_NODE_MAGIC) {
 				XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
 						     XFS_ERRLEVEL_LOW,
 						     context->dp->i_mount,
 						     node);
 				xfs_trans_brelse(NULL, bp);
-				return(XFS_ERROR(EFSCORRUPTED));
+				return XFS_ERROR(EFSCORRUPTED);
 			}
-			btree = node->btree;
-			for (i = 0; i < be16_to_cpu(node->hdr.count);
-								btree++, i++) {
+
+			xfs_da3_node_hdr_from_disk(&nodehdr, node);
+			btree = xfs_da3_node_tree_p(node);
+			for (i = 0; i < nodehdr.count; btree++, i++) {
 				if (cursor->hashval
 						<= be32_to_cpu(btree->hashval)) {
 					cursor->blkno = be32_to_cpu(btree->before);
@@ -1878,9 +1886,9 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 					break;
 				}
 			}
-			if (i == be16_to_cpu(node->hdr.count)) {
+			if (i == nodehdr.count) {
 				xfs_trans_brelse(NULL, bp);
-				return(0);
+				return 0;
 			}
 			xfs_trans_brelse(NULL, bp);
 		}
@@ -1894,310 +1902,21 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	 */
 	for (;;) {
 		leaf = bp->b_addr;
-		error = xfs_attr_leaf_list_int(bp, context);
+		error = xfs_attr3_leaf_list_int(bp, context);
 		if (error) {
 			xfs_trans_brelse(NULL, bp);
 			return error;
 		}
-		if (context->seen_enough || leaf->hdr.info.forw == 0)
+		xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+		if (context->seen_enough || leafhdr.forw == 0)
 			break;
-		cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
+		cursor->blkno = leafhdr.forw;
 		xfs_trans_brelse(NULL, bp);
-		error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1,
+		error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1,
 					   &bp);
 		if (error)
 			return error;
 	}
 	xfs_trans_brelse(NULL, bp);
-	return(0);
-}
-
-
-/*========================================================================
- * External routines for manipulating out-of-line attribute values.
- *========================================================================*/
-
-/*
- * Read the value associated with an attribute from the out-of-line buffer
- * that we stored it in.
- */
-int
-xfs_attr_rmtval_get(xfs_da_args_t *args)
-{
-	xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
-	xfs_mount_t *mp;
-	xfs_daddr_t dblkno;
-	void *dst;
-	xfs_buf_t *bp;
-	int nmap, error, tmp, valuelen, blkcnt, i;
-	xfs_dablk_t lblkno;
-
-	trace_xfs_attr_rmtval_get(args);
-
-	ASSERT(!(args->flags & ATTR_KERNOVAL));
-
-	mp = args->dp->i_mount;
-	dst = args->value;
-	valuelen = args->valuelen;
-	lblkno = args->rmtblkno;
-	while (valuelen > 0) {
-		nmap = ATTR_RMTVALUE_MAPSIZE;
-		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
-				       args->rmtblkcnt, map, &nmap,
-				       XFS_BMAPI_ATTRFORK);
-		if (error)
-			return(error);
-		ASSERT(nmap >= 1);
-
-		for (i = 0; (i < nmap) && (valuelen > 0); i++) {
-			ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
-			       (map[i].br_startblock != HOLESTARTBLOCK));
-			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
-			blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
-			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
-						   dblkno, blkcnt, 0, &bp, NULL);
-			if (error)
-				return(error);
-
-			tmp = min_t(int, valuelen, BBTOB(bp->b_length));
-			xfs_buf_iomove(bp, 0, tmp, dst, XBRW_READ);
-			xfs_buf_relse(bp);
-			dst += tmp;
-			valuelen -= tmp;
-
-			lblkno += map[i].br_blockcount;
-		}
-	}
-	ASSERT(valuelen == 0);
-	return(0);
-}
-
-/*
- * Write the value associated with an attribute into the out-of-line buffer
- * that we have defined for it.
- */
-STATIC int
-xfs_attr_rmtval_set(xfs_da_args_t *args)
-{
-	xfs_mount_t *mp;
-	xfs_fileoff_t lfileoff;
-	xfs_inode_t *dp;
-	xfs_bmbt_irec_t map;
-	xfs_daddr_t dblkno;
-	void *src;
-	xfs_buf_t *bp;
-	xfs_dablk_t lblkno;
-	int blkcnt, valuelen, nmap, error, tmp, committed;
-
-	trace_xfs_attr_rmtval_set(args);
-
-	dp = args->dp;
-	mp = dp->i_mount;
-	src = args->value;
-
-	/*
-	 * Find a "hole" in the attribute address space large enough for
-	 * us to drop the new attribute's value into.
-	 */
-	blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
-	lfileoff = 0;
-	error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
-						   XFS_ATTR_FORK);
-	if (error) {
-		return(error);
-	}
-	args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
-	args->rmtblkcnt = blkcnt;
-
-	/*
-	 * Roll through the "value", allocating blocks on disk as required.
-	 */
-	while (blkcnt > 0) {
-		/*
-		 * Allocate a single extent, up to the size of the value.
-		 */
-		xfs_bmap_init(args->flist, args->firstblock);
-		nmap = 1;
-		error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
-				  blkcnt,
-				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				  args->firstblock, args->total, &map, &nmap,
-				  args->flist);
-		if (!error) {
-			error = xfs_bmap_finish(&args->trans, args->flist,
-						&committed);
-		}
-		if (error) {
-			ASSERT(committed);
-			args->trans = NULL;
-			xfs_bmap_cancel(args->flist);
-			return(error);
-		}
-
-		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args->trans, dp, 0);
-
-		ASSERT(nmap == 1);
-		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
-		       (map.br_startblock != HOLESTARTBLOCK));
-		lblkno += map.br_blockcount;
-		blkcnt -= map.br_blockcount;
-
-		/*
-		 * Start the next trans in the chain.
-		 */
-		error = xfs_trans_roll(&args->trans, dp);
-		if (error)
-			return (error);
-	}
-
-	/*
-	 * Roll through the "value", copying the attribute value to the
-	 * already-allocated blocks.  Blocks are written synchronously
-	 * so that we can know they are all on disk before we turn off
-	 * the INCOMPLETE flag.
-	 */
-	lblkno = args->rmtblkno;
-	valuelen = args->valuelen;
-	while (valuelen > 0) {
-		int buflen;
-
-		/*
-		 * Try to remember where we decided to put the value.
-		 */
-		xfs_bmap_init(args->flist, args->firstblock);
-		nmap = 1;
-		error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
-				       args->rmtblkcnt, &map, &nmap,
-				       XFS_BMAPI_ATTRFORK);
-		if (error)
-			return(error);
-		ASSERT(nmap == 1);
-		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
-		       (map.br_startblock != HOLESTARTBLOCK));
-
-		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
-		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
-		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0);
-		if (!bp)
-			return ENOMEM;
-
-		buflen = BBTOB(bp->b_length);
-		tmp = min_t(int, valuelen, buflen);
-		xfs_buf_iomove(bp, 0, tmp, src, XBRW_WRITE);
-		if (tmp < buflen)
-			xfs_buf_zero(bp, tmp, buflen - tmp);
-
-		error = xfs_bwrite(bp);	/* GROT: NOTE: synchronous write */
-		xfs_buf_relse(bp);
-		if (error)
-			return error;
-		src += tmp;
-		valuelen -= tmp;
-
-		lblkno += map.br_blockcount;
-	}
-	ASSERT(valuelen == 0);
-	return(0);
-}
-
-/*
- * Remove the value associated with an attribute by deleting the
- * out-of-line buffer that it is stored on.
- */
-STATIC int
-xfs_attr_rmtval_remove(xfs_da_args_t *args)
-{
-	xfs_mount_t *mp;
-	xfs_bmbt_irec_t map;
-	xfs_buf_t *bp;
-	xfs_daddr_t dblkno;
-	xfs_dablk_t lblkno;
-	int valuelen, blkcnt, nmap, error, done, committed;
-
-	trace_xfs_attr_rmtval_remove(args);
-
-	mp = args->dp->i_mount;
-
-	/*
-	 * Roll through the "value", invalidating the attribute value's
-	 * blocks.
-	 */
-	lblkno = args->rmtblkno;
-	valuelen = args->rmtblkcnt;
-	while (valuelen > 0) {
-		/*
-		 * Try to remember where we decided to put the value.
-		 */
-		nmap = 1;
-		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
-				       args->rmtblkcnt, &map, &nmap,
-				       XFS_BMAPI_ATTRFORK);
-		if (error)
-			return(error);
-		ASSERT(nmap == 1);
-		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
-		       (map.br_startblock != HOLESTARTBLOCK));
-
-		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
-		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
-		/*
-		 * If the "remote" value is in the cache, remove it.
-		 */
-		bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
-		if (bp) {
-			xfs_buf_stale(bp);
-			xfs_buf_relse(bp);
-			bp = NULL;
-		}
-
-		valuelen -= map.br_blockcount;
-
-		lblkno += map.br_blockcount;
-	}
-
-	/*
-	 * Keep de-allocating extents until the remote-value region is gone.
-	 */
-	lblkno = args->rmtblkno;
-	blkcnt = args->rmtblkcnt;
-	done = 0;
-	while (!done) {
-		xfs_bmap_init(args->flist, args->firstblock);
-		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
-				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				    1, args->firstblock, args->flist,
-				    &done);
-		if (!error) {
-			error = xfs_bmap_finish(&args->trans, args->flist,
-						&committed);
-		}
-		if (error) {
-			ASSERT(committed);
-			args->trans = NULL;
-			xfs_bmap_cancel(args->flist);
-			return(error);
-		}
-
-		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args->trans, args->dp, 0);
-
-		/*
-		 * Close out trans and start the next one in the chain.
-		 */
-		error = xfs_trans_roll(&args->trans, args->dp);
-		if (error)
-			return (error);
-	}
-	return(0);
+	return 0;
 }
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index e920d68ef50..de8dd58da46 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -140,7 +140,6 @@ typedef struct xfs_attr_list_context {
  * Overall external interface routines.
  */
 int xfs_attr_inactive(struct xfs_inode *dp);
-int xfs_attr_rmtval_get(struct xfs_da_args *args);
 int xfs_attr_list_int(struct xfs_attr_list_context *);
 
 #endif	/* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index ee24993c7d1..31d3cd12926 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -31,6 +32,7 @@
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
 #include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -39,6 +41,9 @@
 #include "xfs_attr_leaf.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
+
 
 /*
  * xfs_attr_leaf.c
@@ -53,85 +58,226 @@
 /*
  * Routines used for growing the Btree.
  */
-STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block,
-				struct xfs_buf **bpp);
-STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer,
-				  xfs_da_args_t *args, int freemap_index);
-STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args,
-				  struct xfs_buf *leaf_buffer);
-STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
+STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args,
+				 xfs_dablk_t which_block, struct xfs_buf **bpp);
+STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer,
+				   struct xfs_attr3_icleaf_hdr *ichdr,
+				   struct xfs_da_args *args, int freemap_index);
+STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args,
+				   struct xfs_attr3_icleaf_hdr *ichdr,
+				   struct xfs_buf *leaf_buffer);
+STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state,
 						   xfs_da_state_blk_t *blk1,
 						   xfs_da_state_blk_t *blk2);
-STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
-					   xfs_da_state_blk_t *leaf_blk_1,
-					   xfs_da_state_blk_t *leaf_blk_2,
-					   int *number_entries_in_blk1,
-					   int *number_usedbytes_in_blk1);
+STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
+			xfs_da_state_blk_t *leaf_blk_1,
+			struct xfs_attr3_icleaf_hdr *ichdr1,
+			xfs_da_state_blk_t *leaf_blk_2,
+			struct xfs_attr3_icleaf_hdr *ichdr2,
+			int *number_entries_in_blk1,
+			int *number_usedbytes_in_blk1);
 
 /*
  * Routines used for shrinking the Btree.
  */
-STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
+STATIC int xfs_attr3_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
 				  struct xfs_buf *bp, int level);
-STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
+STATIC int xfs_attr3_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
 				  struct xfs_buf *bp);
-STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
+STATIC int xfs_attr3_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
 				   xfs_dablk_t blkno, int blkcnt);
 
 /*
  * Utility routines.
  */
-STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
-					 int src_start,
-					 xfs_attr_leafblock_t *dst_leaf,
-					 int dst_start, int move_count,
-					 xfs_mount_t *mp);
+STATIC void xfs_attr3_leaf_moveents(struct xfs_attr_leafblock *src_leaf,
+			struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
+			struct xfs_attr_leafblock *dst_leaf,
+			struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
+			int move_count, struct xfs_mount *mp);
 STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
 
-static void
-xfs_attr_leaf_verify(
+void
+xfs_attr3_leaf_hdr_from_disk(
+	struct xfs_attr3_icleaf_hdr	*to,
+	struct xfs_attr_leafblock	*from)
+{
+	int	i;
+
+	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+	       from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+	if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+		struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from;
+
+		to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+		to->back = be32_to_cpu(hdr3->info.hdr.back);
+		to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+		to->count = be16_to_cpu(hdr3->count);
+		to->usedbytes = be16_to_cpu(hdr3->usedbytes);
+		to->firstused = be16_to_cpu(hdr3->firstused);
+		to->holes = hdr3->holes;
+
+		for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+			to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base);
+			to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size);
+		}
+		return;
+	}
+	to->forw = be32_to_cpu(from->hdr.info.forw);
+	to->back = be32_to_cpu(from->hdr.info.back);
+	to->magic = be16_to_cpu(from->hdr.info.magic);
+	to->count = be16_to_cpu(from->hdr.count);
+	to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
+	to->firstused = be16_to_cpu(from->hdr.firstused);
+	to->holes = from->hdr.holes;
+
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+		to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base);
+		to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size);
+	}
+}
+
+void
+xfs_attr3_leaf_hdr_to_disk(
+	struct xfs_attr_leafblock	*to,
+	struct xfs_attr3_icleaf_hdr	*from)
+{
+	int	i;
+
+	ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
+	       from->magic == XFS_ATTR3_LEAF_MAGIC);
+
+	if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+		struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to;
+
+		hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+		hdr3->info.hdr.back = cpu_to_be32(from->back);
+		hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+		hdr3->count = cpu_to_be16(from->count);
+		hdr3->usedbytes = cpu_to_be16(from->usedbytes);
+		hdr3->firstused = cpu_to_be16(from->firstused);
+		hdr3->holes = from->holes;
+		hdr3->pad1 = 0;
+
+		for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+			hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base);
+			hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size);
+		}
+		return;
+	}
+	to->hdr.info.forw = cpu_to_be32(from->forw);
+	to->hdr.info.back = cpu_to_be32(from->back);
+	to->hdr.info.magic = cpu_to_be16(from->magic);
+	to->hdr.count = cpu_to_be16(from->count);
+	to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
+	to->hdr.firstused = cpu_to_be16(from->firstused);
+	to->hdr.holes = from->holes;
+	to->hdr.pad1 = 0;
+
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+		to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base);
+		to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size);
+	}
+}
+
+static bool
+xfs_attr3_leaf_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_attr_leaf_hdr *hdr = bp->b_addr;
-	int			block_ok = 0;
+	struct xfs_attr_leafblock *leaf = bp->b_addr;
+	struct xfs_attr3_icleaf_hdr ichdr;
 
-	block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
-	if (!block_ok) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+		if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
+			return false;
+
+		if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
+			return false;
 	}
+	if (ichdr.count == 0)
+		return false;
+
+	/* XXX: need to range check rest of attr header values */
+	/* XXX: hash order check? */
+
+	return true;
 }
 
 static void
-xfs_attr_leaf_read_verify(
+xfs_attr3_leaf_write_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_attr_leaf_verify(bp);
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
+
+	if (!xfs_attr3_leaf_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_ATTR3_LEAF_CRC_OFF);
 }
 
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
 static void
-xfs_attr_leaf_write_verify(
-	struct xfs_buf	*bp)
+xfs_attr3_leaf_read_verify(
+	struct xfs_buf		*bp)
 {
-	xfs_attr_leaf_verify(bp);
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if ((xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+					  XFS_ATTR3_LEAF_CRC_OFF)) ||
+	    !xfs_attr3_leaf_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
 }
 
-const struct xfs_buf_ops xfs_attr_leaf_buf_ops = {
-	.verify_read = xfs_attr_leaf_read_verify,
-	.verify_write = xfs_attr_leaf_write_verify,
+const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+	.verify_read = xfs_attr3_leaf_read_verify,
+	.verify_write = xfs_attr3_leaf_write_verify,
 };
 
 int
-xfs_attr_leaf_read(
+xfs_attr3_leaf_read(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*dp,
 	xfs_dablk_t		bno,
 	xfs_daddr_t		mappedbno,
 	struct xfs_buf		**bpp)
 {
-	return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
-				XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops);
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+				XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
+	return err;
 }
 
 /*========================================================================
@@ -172,7 +318,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
 	int dsize;
 	xfs_mount_t *mp = dp->i_mount;
 
-	offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */
+	/* rounded down */
+	offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
 
 	switch (dp->i_d.di_format) {
 	case XFS_DINODE_FMT_DEV:
@@ -231,7 +378,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
 				return 0;
 			return dp->i_d.di_forkoff;
 		}
-		dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot);
+		dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
 		break;
 	}
 
@@ -243,7 +390,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
 	minforkoff = roundup(minforkoff, 8) >> 3;
 
 	/* attr fork btree root can have at least this many key/ptr pairs */
-	maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
+			XFS_BMDR_SPACE_CALC(MINABTPTRS);
 	maxforkoff = maxforkoff >> 3;	/* rounded down */
 
 	if (offset >= maxforkoff)
@@ -557,7 +705,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
 	}
 
 	ASSERT(blkno == 0);
-	error = xfs_attr_leaf_create(args, blkno, &bp);
+	error = xfs_attr3_leaf_create(args, blkno, &bp);
 	if (error) {
 		error = xfs_da_shrink_inode(args, 0, bp);
 		bp = NULL;
@@ -586,9 +734,9 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
 		nargs.hashval = xfs_da_hashname(sfe->nameval,
 						sfe->namelen);
 		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
-		error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
+		error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
 		ASSERT(error == ENOATTR);
-		error = xfs_attr_leaf_add(bp, &nargs);
+		error = xfs_attr3_leaf_add(bp, &nargs);
 		ASSERT(error != ENOSPC);
 		if (error)
 			goto out;
@@ -783,67 +931,74 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
  */
 int
 xfs_attr_shortform_allfit(
-	struct xfs_buf	*bp,
-	struct xfs_inode *dp)
+	struct xfs_buf		*bp,
+	struct xfs_inode	*dp)
 {
-	xfs_attr_leafblock_t *leaf;
-	xfs_attr_leaf_entry_t *entry;
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr_leaf_entry *entry;
 	xfs_attr_leaf_name_local_t *name_loc;
-	int bytes, i;
+	struct xfs_attr3_icleaf_hdr leafhdr;
+	int			bytes;
+	int			i;
 
 	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+	xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+	entry = xfs_attr3_leaf_entryp(leaf);
 
-	entry = &leaf->entries[0];
 	bytes = sizeof(struct xfs_attr_sf_hdr);
-	for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+	for (i = 0; i < leafhdr.count; entry++, i++) {
 		if (entry->flags & XFS_ATTR_INCOMPLETE)
 			continue;		/* don't copy partial entries */
 		if (!(entry->flags & XFS_ATTR_LOCAL))
 			return(0);
-		name_loc = xfs_attr_leaf_name_local(leaf, i);
+		name_loc = xfs_attr3_leaf_name_local(leaf, i);
 		if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
 			return(0);
 		if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
 			return(0);
-		bytes += sizeof(struct xfs_attr_sf_entry)-1
+		bytes += sizeof(struct xfs_attr_sf_entry) - 1
 				+ name_loc->namelen
 				+ be16_to_cpu(name_loc->valuelen);
 	}
 	if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
 	    (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
 	    (bytes == sizeof(struct xfs_attr_sf_hdr)))
-		return(-1);
-	return(xfs_attr_shortform_bytesfit(dp, bytes));
+		return -1;
+	return xfs_attr_shortform_bytesfit(dp, bytes);
 }
 
 /*
  * Convert a leaf attribute list to shortform attribute list
  */
 int
-xfs_attr_leaf_to_shortform(
-	struct xfs_buf	*bp,
-	xfs_da_args_t	*args,
-	int		forkoff)
+xfs_attr3_leaf_to_shortform(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args,
+	int			forkoff)
 {
-	xfs_attr_leafblock_t *leaf;
-	xfs_attr_leaf_entry_t *entry;
-	xfs_attr_leaf_name_local_t *name_loc;
-	xfs_da_args_t nargs;
-	xfs_inode_t *dp;
-	char *tmpbuffer;
-	int error, i;
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_local *name_loc;
+	struct xfs_da_args	nargs;
+	struct xfs_inode	*dp = args->dp;
+	char			*tmpbuffer;
+	int			error;
+	int			i;
 
 	trace_xfs_attr_leaf_to_sf(args);
 
-	dp = args->dp;
 	tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
-	ASSERT(tmpbuffer != NULL);
+	if (!tmpbuffer)
+		return ENOMEM;
 
-	ASSERT(bp != NULL);
 	memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount));
+
 	leaf = (xfs_attr_leafblock_t *)tmpbuffer;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	entry = xfs_attr3_leaf_entryp(leaf);
+
+	/* XXX (dgc): buffer is about to be marked stale - why zero it? */
 	memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount));
 
 	/*
@@ -873,14 +1028,14 @@ xfs_attr_leaf_to_shortform(
 	nargs.whichfork = XFS_ATTR_FORK;
 	nargs.trans = args->trans;
 	nargs.op_flags = XFS_DA_OP_OKNOENT;
-	entry = &leaf->entries[0];
-	for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+
+	for (i = 0; i < ichdr.count; entry++, i++) {
 		if (entry->flags & XFS_ATTR_INCOMPLETE)
 			continue;	/* don't copy partial entries */
 		if (!entry->nameidx)
 			continue;
 		ASSERT(entry->flags & XFS_ATTR_LOCAL);
-		name_loc = xfs_attr_leaf_name_local(leaf, i);
+		name_loc = xfs_attr3_leaf_name_local(leaf, i);
 		nargs.name = name_loc->nameval;
 		nargs.namelen = name_loc->namelen;
 		nargs.value = &name_loc->nameval[nargs.namelen];
@@ -893,61 +1048,75 @@ xfs_attr_leaf_to_shortform(
 
 out:
 	kmem_free(tmpbuffer);
-	return(error);
+	return error;
 }
 
 /*
  * Convert from using a single leaf to a root node and a leaf.
  */
 int
-xfs_attr_leaf_to_node(xfs_da_args_t *args)
+xfs_attr3_leaf_to_node(
+	struct xfs_da_args	*args)
 {
-	xfs_attr_leafblock_t *leaf;
-	xfs_da_intnode_t *node;
-	xfs_inode_t *dp;
-	struct xfs_buf *bp1, *bp2;
-	xfs_dablk_t blkno;
-	int error;
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr icleafhdr;
+	struct xfs_attr_leaf_entry *entries;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr icnodehdr;
+	struct xfs_da_intnode	*node;
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*bp1 = NULL;
+	struct xfs_buf		*bp2 = NULL;
+	xfs_dablk_t		blkno;
+	int			error;
 
 	trace_xfs_attr_leaf_to_node(args);
 
-	dp = args->dp;
-	bp1 = bp2 = NULL;
 	error = xfs_da_grow_inode(args, &blkno);
 	if (error)
 		goto out;
-	error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1);
+	error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1);
 	if (error)
 		goto out;
 
-	bp2 = NULL;
-	error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
-					    XFS_ATTR_FORK);
+	error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK);
 	if (error)
 		goto out;
+
+	/* copy leaf to new buffer, update identifiers */
+	xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
 	bp2->b_ops = bp1->b_ops;
-	memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount));
-	bp1 = NULL;
-	xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
+	memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(mp));
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
+		hdr3->blkno = cpu_to_be64(bp2->b_bn);
+	}
+	xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(mp) - 1);
 
 	/*
 	 * Set up the new root node.
 	 */
-	error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
+	error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
 	if (error)
 		goto out;
 	node = bp1->b_addr;
+	xfs_da3_node_hdr_from_disk(&icnodehdr, node);
+	btree = xfs_da3_node_tree_p(node);
+
 	leaf = bp2->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+	xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
+	entries = xfs_attr3_leaf_entryp(leaf);
+
 	/* both on-disk, don't endian-flip twice */
-	node->btree[0].hashval =
-		leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval;
-	node->btree[0].before = cpu_to_be32(blkno);
-	node->hdr.count = cpu_to_be16(1);
-	xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1);
+	btree[0].hashval = entries[icleafhdr.count - 1].hashval;
+	btree[0].before = cpu_to_be32(blkno);
+	icnodehdr.count = 1;
+	xfs_da3_node_hdr_to_disk(node, &icnodehdr);
+	xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(mp) - 1);
 	error = 0;
 out:
-	return(error);
+	return error;
 }
 
 
@@ -960,52 +1129,63 @@ out:
  * or a leaf in a node attribute list.
  */
 STATIC int
-xfs_attr_leaf_create(
-	xfs_da_args_t	*args,
-	xfs_dablk_t	blkno,
-	struct xfs_buf	**bpp)
+xfs_attr3_leaf_create(
+	struct xfs_da_args	*args,
+	xfs_dablk_t		blkno,
+	struct xfs_buf		**bpp)
 {
-	xfs_attr_leafblock_t *leaf;
-	xfs_attr_leaf_hdr_t *hdr;
-	xfs_inode_t *dp;
-	struct xfs_buf *bp;
-	int error;
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*bp;
+	int			error;
 
 	trace_xfs_attr_leaf_create(args);
 
-	dp = args->dp;
-	ASSERT(dp != NULL);
 	error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
 					    XFS_ATTR_FORK);
 	if (error)
-		return(error);
-	bp->b_ops = &xfs_attr_leaf_buf_ops;
+		return error;
+	bp->b_ops = &xfs_attr3_leaf_buf_ops;
+	xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
 	leaf = bp->b_addr;
-	memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
-	hdr = &leaf->hdr;
-	hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
-	hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount));
-	if (!hdr->firstused) {
-		hdr->firstused = cpu_to_be16(
-			XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN);
-	}
+	memset(leaf, 0, XFS_LBSIZE(mp));
+
+	memset(&ichdr, 0, sizeof(ichdr));
+	ichdr.firstused = XFS_LBSIZE(mp);
 
-	hdr->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t));
-	hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) -
-					   sizeof(xfs_attr_leaf_hdr_t));
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
 
-	xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
+		ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
+
+		hdr3->blkno = cpu_to_be64(bp->b_bn);
+		hdr3->owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+		ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
+	} else {
+		ichdr.magic = XFS_ATTR_LEAF_MAGIC;
+		ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr);
+	}
+	ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
+
+	xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+	xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(mp) - 1);
 
 	*bpp = bp;
-	return(0);
+	return 0;
 }
 
 /*
  * Split the leaf node, rebalance, then add the new entry.
  */
 int
-xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
-				   xfs_da_state_blk_t *newblk)
+xfs_attr3_leaf_split(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*oldblk,
+	struct xfs_da_state_blk	*newblk)
 {
 	xfs_dablk_t blkno;
 	int error;
@@ -1019,7 +1199,7 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	error = xfs_da_grow_inode(state->args, &blkno);
 	if (error)
 		return(error);
-	error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp);
+	error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
 	if (error)
 		return(error);
 	newblk->blkno = blkno;
@@ -1029,8 +1209,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	 * Rebalance the entries across the two leaves.
 	 * NOTE: rebalance() currently depends on the 2nd block being empty.
 	 */
-	xfs_attr_leaf_rebalance(state, oldblk, newblk);
-	error = xfs_da_blk_link(state, oldblk, newblk);
+	xfs_attr3_leaf_rebalance(state, oldblk, newblk);
+	error = xfs_da3_blk_link(state, oldblk, newblk);
 	if (error)
 		return(error);
 
@@ -1043,10 +1223,10 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	 */
 	if (state->inleaf) {
 		trace_xfs_attr_leaf_add_old(state->args);
-		error = xfs_attr_leaf_add(oldblk->bp, state->args);
+		error = xfs_attr3_leaf_add(oldblk->bp, state->args);
 	} else {
 		trace_xfs_attr_leaf_add_new(state->args);
-		error = xfs_attr_leaf_add(newblk->bp, state->args);
+		error = xfs_attr3_leaf_add(newblk->bp, state->args);
 	}
 
 	/*
@@ -1061,22 +1241,23 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
  * Add a name to the leaf attribute list structure.
  */
 int
-xfs_attr_leaf_add(
+xfs_attr3_leaf_add(
 	struct xfs_buf		*bp,
 	struct xfs_da_args	*args)
 {
-	xfs_attr_leafblock_t *leaf;
-	xfs_attr_leaf_hdr_t *hdr;
-	xfs_attr_leaf_map_t *map;
-	int tablesize, entsize, sum, tmp, i;
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	int			tablesize;
+	int			entsize;
+	int			sum;
+	int			tmp;
+	int			i;
 
 	trace_xfs_attr_leaf_add(args);
 
 	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-	ASSERT((args->index >= 0)
-		&& (args->index <= be16_to_cpu(leaf->hdr.count)));
-	hdr = &leaf->hdr;
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	ASSERT(args->index >= 0 && args->index <= ichdr.count);
 	entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
 			   args->trans->t_mountp->m_sb.sb_blocksize, NULL);
 
@@ -1084,25 +1265,23 @@ xfs_attr_leaf_add(
 	 * Search through freemap for first-fit on new name length.
 	 * (may need to figure in size of entry struct too)
 	 */
-	tablesize = (be16_to_cpu(hdr->count) + 1)
-					* sizeof(xfs_attr_leaf_entry_t)
-					+ sizeof(xfs_attr_leaf_hdr_t);
-	map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1];
-	for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
-		if (tablesize > be16_to_cpu(hdr->firstused)) {
-			sum += be16_to_cpu(map->size);
+	tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf);
+	for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) {
+		if (tablesize > ichdr.firstused) {
+			sum += ichdr.freemap[i].size;
 			continue;
 		}
-		if (!map->size)
+		if (!ichdr.freemap[i].size)
 			continue;	/* no space in this map */
 		tmp = entsize;
-		if (be16_to_cpu(map->base) < be16_to_cpu(hdr->firstused))
+		if (ichdr.freemap[i].base < ichdr.firstused)
 			tmp += sizeof(xfs_attr_leaf_entry_t);
-		if (be16_to_cpu(map->size) >= tmp) {
-			tmp = xfs_attr_leaf_add_work(bp, args, i);
-			return(tmp);
+		if (ichdr.freemap[i].size >= tmp) {
+			tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i);
+			goto out_log_hdr;
 		}
-		sum += be16_to_cpu(map->size);
+		sum += ichdr.freemap[i].size;
 	}
 
 	/*
@@ -1110,82 +1289,89 @@ xfs_attr_leaf_add(
 	 * and we don't have enough freespace, then compaction will do us
 	 * no good and we should just give up.
 	 */
-	if (!hdr->holes && (sum < entsize))
-		return(XFS_ERROR(ENOSPC));
+	if (!ichdr.holes && sum < entsize)
+		return XFS_ERROR(ENOSPC);
 
 	/*
 	 * Compact the entries to coalesce free space.
 	 * This may change the hdr->count via dropping INCOMPLETE entries.
 	 */
-	xfs_attr_leaf_compact(args, bp);
+	xfs_attr3_leaf_compact(args, &ichdr, bp);
 
 	/*
 	 * After compaction, the block is guaranteed to have only one
 	 * free region, in freemap[0].  If it is not big enough, give up.
 	 */
-	if (be16_to_cpu(hdr->freemap[0].size)
-				< (entsize + sizeof(xfs_attr_leaf_entry_t)))
-		return(XFS_ERROR(ENOSPC));
+	if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
+		tmp = ENOSPC;
+		goto out_log_hdr;
+	}
+
+	tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
 
-	return(xfs_attr_leaf_add_work(bp, args, 0));
+out_log_hdr:
+	xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+	xfs_trans_log_buf(args->trans, bp,
+		XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+				xfs_attr3_leaf_hdr_size(leaf)));
+	return tmp;
 }
 
 /*
  * Add a name to a leaf attribute list structure.
  */
 STATIC int
-xfs_attr_leaf_add_work(
-	struct xfs_buf	*bp,
-	xfs_da_args_t	*args,
-	int		mapindex)
+xfs_attr3_leaf_add_work(
+	struct xfs_buf		*bp,
+	struct xfs_attr3_icleaf_hdr *ichdr,
+	struct xfs_da_args	*args,
+	int			mapindex)
 {
-	xfs_attr_leafblock_t *leaf;
-	xfs_attr_leaf_hdr_t *hdr;
-	xfs_attr_leaf_entry_t *entry;
-	xfs_attr_leaf_name_local_t *name_loc;
-	xfs_attr_leaf_name_remote_t *name_rmt;
-	xfs_attr_leaf_map_t *map;
-	xfs_mount_t *mp;
-	int tmp, i;
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_local *name_loc;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_mount	*mp;
+	int			tmp;
+	int			i;
 
 	trace_xfs_attr_leaf_add_work(args);
 
 	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-	hdr = &leaf->hdr;
-	ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE));
-	ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(hdr->count)));
+	ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE);
+	ASSERT(args->index >= 0 && args->index <= ichdr->count);
 
 	/*
 	 * Force open some space in the entry array and fill it in.
 	 */
-	entry = &leaf->entries[args->index];
-	if (args->index < be16_to_cpu(hdr->count)) {
-		tmp  = be16_to_cpu(hdr->count) - args->index;
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+	if (args->index < ichdr->count) {
+		tmp  = ichdr->count - args->index;
 		tmp *= sizeof(xfs_attr_leaf_entry_t);
-		memmove((char *)(entry+1), (char *)entry, tmp);
+		memmove(entry + 1, entry, tmp);
 		xfs_trans_log_buf(args->trans, bp,
 		    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
 	}
-	be16_add_cpu(&hdr->count, 1);
+	ichdr->count++;
 
 	/*
 	 * Allocate space for the new string (at the end of the run).
 	 */
-	map = &hdr->freemap[mapindex];
 	mp = args->trans->t_mountp;
-	ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp));
-	ASSERT((be16_to_cpu(map->base) & 0x3) == 0);
-	ASSERT(be16_to_cpu(map->size) >=
+	ASSERT(ichdr->freemap[mapindex].base < XFS_LBSIZE(mp));
+	ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
+	ASSERT(ichdr->freemap[mapindex].size >=
 		xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
 					 mp->m_sb.sb_blocksize, NULL));
-	ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
-	ASSERT((be16_to_cpu(map->size) & 0x3) == 0);
-	be16_add_cpu(&map->size,
-		-xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
-					  mp->m_sb.sb_blocksize, &tmp));
-	entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) +
-				     be16_to_cpu(map->size));
+	ASSERT(ichdr->freemap[mapindex].size < XFS_LBSIZE(mp));
+	ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
+
+	ichdr->freemap[mapindex].size -=
+			xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
+						 mp->m_sb.sb_blocksize, &tmp);
+
+	entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
+				     ichdr->freemap[mapindex].size);
 	entry->hashval = cpu_to_be32(args->hashval);
 	entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
 	entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
@@ -1200,7 +1386,7 @@ xfs_attr_leaf_add_work(
 			  XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
 	ASSERT((args->index == 0) ||
 	       (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval)));
-	ASSERT((args->index == be16_to_cpu(hdr->count)-1) ||
+	ASSERT((args->index == ichdr->count - 1) ||
 	       (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval)));
 
 	/*
@@ -1211,14 +1397,14 @@ xfs_attr_leaf_add_work(
 	 * as part of this transaction (a split operation for example).
 	 */
 	if (entry->flags & XFS_ATTR_LOCAL) {
-		name_loc = xfs_attr_leaf_name_local(leaf, args->index);
+		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
 		name_loc->namelen = args->namelen;
 		name_loc->valuelen = cpu_to_be16(args->valuelen);
 		memcpy((char *)name_loc->nameval, args->name, args->namelen);
 		memcpy((char *)&name_loc->nameval[args->namelen], args->value,
 				   be16_to_cpu(name_loc->valuelen));
 	} else {
-		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
 		name_rmt->namelen = args->namelen;
 		memcpy((char *)name_rmt->name, args->name, args->namelen);
 		entry->flags |= XFS_ATTR_INCOMPLETE;
@@ -1226,47 +1412,45 @@ xfs_attr_leaf_add_work(
 		name_rmt->valuelen = 0;
 		name_rmt->valueblk = 0;
 		args->rmtblkno = 1;
-		args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+		args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
 	}
 	xfs_trans_log_buf(args->trans, bp,
-	     XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
+	     XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
 				   xfs_attr_leaf_entsize(leaf, args->index)));
 
 	/*
 	 * Update the control info for this leaf node
 	 */
-	if (be16_to_cpu(entry->nameidx) < be16_to_cpu(hdr->firstused)) {
-		/* both on-disk, don't endian-flip twice */
-		hdr->firstused = entry->nameidx;
-	}
-	ASSERT(be16_to_cpu(hdr->firstused) >=
-	       ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr)));
-	tmp = (be16_to_cpu(hdr->count)-1) * sizeof(xfs_attr_leaf_entry_t)
-					+ sizeof(xfs_attr_leaf_hdr_t);
-	map = &hdr->freemap[0];
-	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
-		if (be16_to_cpu(map->base) == tmp) {
-			be16_add_cpu(&map->base, sizeof(xfs_attr_leaf_entry_t));
-			be16_add_cpu(&map->size,
-				 -((int)sizeof(xfs_attr_leaf_entry_t)));
+	if (be16_to_cpu(entry->nameidx) < ichdr->firstused)
+		ichdr->firstused = be16_to_cpu(entry->nameidx);
+
+	ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf));
+	tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf);
+
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+		if (ichdr->freemap[i].base == tmp) {
+			ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
+			ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t);
 		}
 	}
-	be16_add_cpu(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index));
-	xfs_trans_log_buf(args->trans, bp,
-		XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
-	return(0);
+	ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
+	return 0;
 }
 
 /*
  * Garbage collect a leaf attribute list block by copying it to a new buffer.
  */
 STATIC void
-xfs_attr_leaf_compact(
+xfs_attr3_leaf_compact(
 	struct xfs_da_args	*args,
+	struct xfs_attr3_icleaf_hdr *ichdr_dst,
 	struct xfs_buf		*bp)
 {
-	xfs_attr_leafblock_t	*leaf_s, *leaf_d;
-	xfs_attr_leaf_hdr_t	*hdr_s, *hdr_d;
+	struct xfs_attr_leafblock *leaf_src;
+	struct xfs_attr_leafblock *leaf_dst;
+	struct xfs_attr3_icleaf_hdr ichdr_src;
 	struct xfs_trans	*trans = args->trans;
 	struct xfs_mount	*mp = trans->t_mountp;
 	char			*tmpbuffer;
@@ -1274,43 +1458,87 @@ xfs_attr_leaf_compact(
 	trace_xfs_attr_leaf_compact(args);
 
 	tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
-	ASSERT(tmpbuffer != NULL);
 	memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
 	memset(bp->b_addr, 0, XFS_LBSIZE(mp));
+	leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
+	leaf_dst = bp->b_addr;
 
 	/*
-	 * Copy basic information
+	 * Copy the on-disk header back into the destination buffer to ensure
+	 * all the information in the header that is not part of the incore
+	 * header structure is preserved.
 	 */
-	leaf_s = (xfs_attr_leafblock_t *)tmpbuffer;
-	leaf_d = bp->b_addr;
-	hdr_s = &leaf_s->hdr;
-	hdr_d = &leaf_d->hdr;
-	hdr_d->info = hdr_s->info;	/* struct copy */
-	hdr_d->firstused = cpu_to_be16(XFS_LBSIZE(mp));
-	/* handle truncation gracefully */
-	if (!hdr_d->firstused) {
-		hdr_d->firstused = cpu_to_be16(
-				XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN);
-	}
-	hdr_d->usedbytes = 0;
-	hdr_d->count = 0;
-	hdr_d->holes = 0;
-	hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t));
-	hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) -
-					     sizeof(xfs_attr_leaf_hdr_t));
+	memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src));
+
+	/* Initialise the incore headers */
+	ichdr_src = *ichdr_dst;	/* struct copy */
+	ichdr_dst->firstused = XFS_LBSIZE(mp);
+	ichdr_dst->usedbytes = 0;
+	ichdr_dst->count = 0;
+	ichdr_dst->holes = 0;
+	ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src);
+	ichdr_dst->freemap[0].size = ichdr_dst->firstused -
+						ichdr_dst->freemap[0].base;
+
+
+	/* write the header back to initialise the underlying buffer */
+	xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
 
 	/*
 	 * Copy all entry's in the same (sorted) order,
 	 * but allocate name/value pairs packed and in sequence.
 	 */
-	xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0,
-				be16_to_cpu(hdr_s->count), mp);
+	xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0,
+				ichdr_src.count, mp);
+	/*
+	 * this logs the entire buffer, but the caller must write the header
+	 * back to the buffer when it is finished modifying it.
+	 */
 	xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
 
 	kmem_free(tmpbuffer);
 }
 
 /*
+ * Compare two leaf blocks "order".
+ * Return 0 unless leaf2 should go before leaf1.
+ */
+static int
+xfs_attr3_leaf_order(
+	struct xfs_buf	*leaf1_bp,
+	struct xfs_attr3_icleaf_hdr *leaf1hdr,
+	struct xfs_buf	*leaf2_bp,
+	struct xfs_attr3_icleaf_hdr *leaf2hdr)
+{
+	struct xfs_attr_leaf_entry *entries1;
+	struct xfs_attr_leaf_entry *entries2;
+
+	entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr);
+	entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr);
+	if (leaf1hdr->count > 0 && leaf2hdr->count > 0 &&
+	    ((be32_to_cpu(entries2[0].hashval) <
+	      be32_to_cpu(entries1[0].hashval)) ||
+	     (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) <
+	      be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) {
+		return 1;
+	}
+	return 0;
+}
+
+int
+xfs_attr_leaf_order(
+	struct xfs_buf	*leaf1_bp,
+	struct xfs_buf	*leaf2_bp)
+{
+	struct xfs_attr3_icleaf_hdr ichdr1;
+	struct xfs_attr3_icleaf_hdr ichdr2;
+
+	xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr);
+	xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr);
+	return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
+}
+
+/*
  * Redistribute the attribute list entries between two leaf nodes,
  * taking into account the size of the new entry.
  *
@@ -1323,14 +1551,23 @@ xfs_attr_leaf_compact(
  * the "new" and "old" values can end up in different blocks.
  */
 STATIC void
-xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
-				       xfs_da_state_blk_t *blk2)
+xfs_attr3_leaf_rebalance(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*blk1,
+	struct xfs_da_state_blk	*blk2)
 {
-	xfs_da_args_t *args;
-	xfs_da_state_blk_t *tmp_blk;
-	xfs_attr_leafblock_t *leaf1, *leaf2;
-	xfs_attr_leaf_hdr_t *hdr1, *hdr2;
-	int count, totallen, max, space, swap;
+	struct xfs_da_args	*args;
+	struct xfs_attr_leafblock *leaf1;
+	struct xfs_attr_leafblock *leaf2;
+	struct xfs_attr3_icleaf_hdr ichdr1;
+	struct xfs_attr3_icleaf_hdr ichdr2;
+	struct xfs_attr_leaf_entry *entries1;
+	struct xfs_attr_leaf_entry *entries2;
+	int			count;
+	int			totallen;
+	int			max;
+	int			space;
+	int			swap;
 
 	/*
 	 * Set up environment.
@@ -1339,9 +1576,9 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
 	leaf1 = blk1->bp->b_addr;
 	leaf2 = blk2->bp->b_addr;
-	ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-	ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-	ASSERT(leaf2->hdr.count == 0);
+	xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+	xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+	ASSERT(ichdr2.count == 0);
 	args = state->args;
 
 	trace_xfs_attr_leaf_rebalance(args);
@@ -1353,16 +1590,23 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	 * second block, this code should never set "swap".
 	 */
 	swap = 0;
-	if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) {
+	if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) {
+		struct xfs_da_state_blk	*tmp_blk;
+		struct xfs_attr3_icleaf_hdr tmp_ichdr;
+
 		tmp_blk = blk1;
 		blk1 = blk2;
 		blk2 = tmp_blk;
+
+		/* struct copies to swap them rather than reconverting */
+		tmp_ichdr = ichdr1;
+		ichdr1 = ichdr2;
+		ichdr2 = tmp_ichdr;
+
 		leaf1 = blk1->bp->b_addr;
 		leaf2 = blk2->bp->b_addr;
 		swap = 1;
 	}
-	hdr1 = &leaf1->hdr;
-	hdr2 = &leaf2->hdr;
 
 	/*
 	 * Examine entries until we reduce the absolute difference in
@@ -1372,41 +1616,39 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	 * "inleaf" is true if the new entry should be inserted into blk1.
 	 * If "swap" is also true, then reverse the sense of "inleaf".
 	 */
-	state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2,
-							    &count, &totallen);
+	state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1,
+						      blk2, &ichdr2,
+						      &count, &totallen);
 	if (swap)
 		state->inleaf = !state->inleaf;
 
 	/*
 	 * Move any entries required from leaf to leaf:
 	 */
-	if (count < be16_to_cpu(hdr1->count)) {
+	if (count < ichdr1.count) {
 		/*
 		 * Figure the total bytes to be added to the destination leaf.
 		 */
 		/* number entries being moved */
-		count = be16_to_cpu(hdr1->count) - count;
-		space  = be16_to_cpu(hdr1->usedbytes) - totallen;
+		count = ichdr1.count - count;
+		space  = ichdr1.usedbytes - totallen;
 		space += count * sizeof(xfs_attr_leaf_entry_t);
 
 		/*
 		 * leaf2 is the destination, compact it if it looks tight.
 		 */
-		max  = be16_to_cpu(hdr2->firstused)
-						- sizeof(xfs_attr_leaf_hdr_t);
-		max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t);
+		max  = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+		max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t);
 		if (space > max)
-			xfs_attr_leaf_compact(args, blk2->bp);
+			xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp);
 
 		/*
 		 * Move high entries from leaf1 to low end of leaf2.
 		 */
-		xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count,
-				leaf2, 0, count, state->mp);
+		xfs_attr3_leaf_moveents(leaf1, &ichdr1, ichdr1.count - count,
+				leaf2, &ichdr2, 0, count, state->mp);
 
-		xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
-		xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
-	} else if (count > be16_to_cpu(hdr1->count)) {
+	} else if (count > ichdr1.count) {
 		/*
 		 * I assert that since all callers pass in an empty
 		 * second buffer, this code should never execute.
@@ -1417,36 +1659,37 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		 * Figure the total bytes to be added to the destination leaf.
 		 */
 		/* number entries being moved */
-		count -= be16_to_cpu(hdr1->count);
-		space  = totallen - be16_to_cpu(hdr1->usedbytes);
+		count -= ichdr1.count;
+		space  = totallen - ichdr1.usedbytes;
 		space += count * sizeof(xfs_attr_leaf_entry_t);
 
 		/*
 		 * leaf1 is the destination, compact it if it looks tight.
 		 */
-		max  = be16_to_cpu(hdr1->firstused)
-						- sizeof(xfs_attr_leaf_hdr_t);
-		max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t);
+		max  = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1);
+		max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t);
 		if (space > max)
-			xfs_attr_leaf_compact(args, blk1->bp);
+			xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp);
 
 		/*
 		 * Move low entries from leaf2 to high end of leaf1.
 		 */
-		xfs_attr_leaf_moveents(leaf2, 0, leaf1,
-				be16_to_cpu(hdr1->count), count, state->mp);
-
-		xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
-		xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+		xfs_attr3_leaf_moveents(leaf2, &ichdr2, 0, leaf1, &ichdr1,
+					ichdr1.count, count, state->mp);
 	}
 
+	xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
+	xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
+	xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
+	xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+
 	/*
 	 * Copy out last hashval in each block for B-tree code.
 	 */
-	blk1->hashval = be32_to_cpu(
-		leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval);
-	blk2->hashval = be32_to_cpu(
-		leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval);
+	entries1 = xfs_attr3_leaf_entryp(leaf1);
+	entries2 = xfs_attr3_leaf_entryp(leaf2);
+	blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval);
+	blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval);
 
 	/*
 	 * Adjust the expected index for insertion.
@@ -1460,12 +1703,12 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	 * inserting.  The index/blkno fields refer to the "old" entry,
 	 * while the index2/blkno2 fields refer to the "new" entry.
 	 */
-	if (blk1->index > be16_to_cpu(leaf1->hdr.count)) {
+	if (blk1->index > ichdr1.count) {
 		ASSERT(state->inleaf == 0);
-		blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count);
+		blk2->index = blk1->index - ichdr1.count;
 		args->index = args->index2 = blk2->index;
 		args->blkno = args->blkno2 = blk2->blkno;
-	} else if (blk1->index == be16_to_cpu(leaf1->hdr.count)) {
+	} else if (blk1->index == ichdr1.count) {
 		if (state->inleaf) {
 			args->index = blk1->index;
 			args->blkno = blk1->blkno;
@@ -1477,8 +1720,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 			 * is already stored in blkno2/index2, so don't
 			 * overwrite it overwise we corrupt the tree.
 			 */
-			blk2->index = blk1->index
-				    - be16_to_cpu(leaf1->hdr.count);
+			blk2->index = blk1->index - ichdr1.count;
 			args->index = blk2->index;
 			args->blkno = blk2->blkno;
 			if (!state->extravalid) {
@@ -1506,42 +1748,40 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
  * GROT: Do a double-split for this case?
  */
 STATIC int
-xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
-				    xfs_da_state_blk_t *blk1,
-				    xfs_da_state_blk_t *blk2,
-				    int *countarg, int *usedbytesarg)
+xfs_attr3_leaf_figure_balance(
+	struct xfs_da_state		*state,
+	struct xfs_da_state_blk		*blk1,
+	struct xfs_attr3_icleaf_hdr	*ichdr1,
+	struct xfs_da_state_blk		*blk2,
+	struct xfs_attr3_icleaf_hdr	*ichdr2,
+	int				*countarg,
+	int				*usedbytesarg)
 {
-	xfs_attr_leafblock_t *leaf1, *leaf2;
-	xfs_attr_leaf_hdr_t *hdr1, *hdr2;
-	xfs_attr_leaf_entry_t *entry;
-	int count, max, index, totallen, half;
-	int lastdelta, foundit, tmp;
-
-	/*
-	 * Set up environment.
-	 */
-	leaf1 = blk1->bp->b_addr;
-	leaf2 = blk2->bp->b_addr;
-	hdr1 = &leaf1->hdr;
-	hdr2 = &leaf2->hdr;
-	foundit = 0;
-	totallen = 0;
+	struct xfs_attr_leafblock	*leaf1 = blk1->bp->b_addr;
+	struct xfs_attr_leafblock	*leaf2 = blk2->bp->b_addr;
+	struct xfs_attr_leaf_entry	*entry;
+	int				count;
+	int				max;
+	int				index;
+	int				totallen = 0;
+	int				half;
+	int				lastdelta;
+	int				foundit = 0;
+	int				tmp;
 
 	/*
 	 * Examine entries until we reduce the absolute difference in
 	 * byte usage between the two blocks to a minimum.
 	 */
-	max = be16_to_cpu(hdr1->count) + be16_to_cpu(hdr2->count);
-	half  = (max+1) * sizeof(*entry);
-	half += be16_to_cpu(hdr1->usedbytes) +
-		be16_to_cpu(hdr2->usedbytes) +
-		xfs_attr_leaf_newentsize(
-				state->args->namelen,
-				state->args->valuelen,
-				state->blocksize, NULL);
+	max = ichdr1->count + ichdr2->count;
+	half = (max + 1) * sizeof(*entry);
+	half += ichdr1->usedbytes + ichdr2->usedbytes +
+			xfs_attr_leaf_newentsize(state->args->namelen,
+						 state->args->valuelen,
+						 state->blocksize, NULL);
 	half /= 2;
 	lastdelta = state->blocksize;
-	entry = &leaf1->entries[0];
+	entry = xfs_attr3_leaf_entryp(leaf1);
 	for (count = index = 0; count < max; entry++, index++, count++) {
 
 #define XFS_ATTR_ABS(A)	(((A) < 0) ? -(A) : (A))
@@ -1564,9 +1804,9 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
 		/*
 		 * Wrap around into the second block if necessary.
 		 */
-		if (count == be16_to_cpu(hdr1->count)) {
+		if (count == ichdr1->count) {
 			leaf1 = leaf2;
-			entry = &leaf1->entries[0];
+			entry = xfs_attr3_leaf_entryp(leaf1);
 			index = 0;
 		}
 
@@ -1597,7 +1837,7 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
 
 	*countarg = count;
 	*usedbytesarg = totallen;
-	return(foundit);
+	return foundit;
 }
 
 /*========================================================================
@@ -1616,14 +1856,20 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
  * GROT: allow for INCOMPLETE entries in calculation.
  */
 int
-xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
+xfs_attr3_leaf_toosmall(
+	struct xfs_da_state	*state,
+	int			*action)
 {
-	xfs_attr_leafblock_t *leaf;
-	xfs_da_state_blk_t *blk;
-	xfs_da_blkinfo_t *info;
-	int count, bytes, forward, error, retval, i;
-	xfs_dablk_t blkno;
-	struct xfs_buf *bp;
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_da_state_blk	*blk;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_buf		*bp;
+	xfs_dablk_t		blkno;
+	int			bytes;
+	int			forward;
+	int			error;
+	int			retval;
+	int			i;
 
 	trace_xfs_attr_leaf_toosmall(state->args);
 
@@ -1633,13 +1879,11 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
 	 * to coalesce with a sibling.
 	 */
 	blk = &state->path.blk[ state->path.active-1 ];
-	info = blk->bp->b_addr;
-	ASSERT(info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-	leaf = (xfs_attr_leafblock_t *)info;
-	count = be16_to_cpu(leaf->hdr.count);
-	bytes = sizeof(xfs_attr_leaf_hdr_t) +
-		count * sizeof(xfs_attr_leaf_entry_t) +
-		be16_to_cpu(leaf->hdr.usedbytes);
+	leaf = blk->bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	bytes = xfs_attr3_leaf_hdr_size(leaf) +
+		ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
+		ichdr.usedbytes;
 	if (bytes > (state->blocksize >> 1)) {
 		*action = 0;	/* blk over 50%, don't try to join */
 		return(0);
@@ -1651,14 +1895,14 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
 	 * coalesce it with a sibling block.  We choose (arbitrarily)
 	 * to merge with the forward block unless it is NULL.
 	 */
-	if (count == 0) {
+	if (ichdr.count == 0) {
 		/*
 		 * Make altpath point to the block we want to keep and
 		 * path point to the block we want to drop (this one).
 		 */
-		forward = (info->forw != 0);
+		forward = (ichdr.forw != 0);
 		memcpy(&state->altpath, &state->path, sizeof(state->path));
-		error = xfs_da_path_shift(state, &state->altpath, forward,
+		error = xfs_da3_path_shift(state, &state->altpath, forward,
 						 0, &retval);
 		if (error)
 			return(error);
@@ -1667,7 +1911,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
 		} else {
 			*action = 2;
 		}
-		return(0);
+		return 0;
 	}
 
 	/*
@@ -1678,28 +1922,28 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
 	 * to shrink an attribute list over time.
 	 */
 	/* start with smaller blk num */
-	forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back));
+	forward = ichdr.forw < ichdr.back;
 	for (i = 0; i < 2; forward = !forward, i++) {
+		struct xfs_attr3_icleaf_hdr ichdr2;
 		if (forward)
-			blkno = be32_to_cpu(info->forw);
+			blkno = ichdr.forw;
 		else
-			blkno = be32_to_cpu(info->back);
+			blkno = ichdr.back;
 		if (blkno == 0)
 			continue;
-		error = xfs_attr_leaf_read(state->args->trans, state->args->dp,
+		error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
 					blkno, -1, &bp);
 		if (error)
 			return(error);
 
-		leaf = (xfs_attr_leafblock_t *)info;
-		count  = be16_to_cpu(leaf->hdr.count);
-		bytes  = state->blocksize - (state->blocksize>>2);
-		bytes -= be16_to_cpu(leaf->hdr.usedbytes);
-		leaf = bp->b_addr;
-		count += be16_to_cpu(leaf->hdr.count);
-		bytes -= be16_to_cpu(leaf->hdr.usedbytes);
-		bytes -= count * sizeof(xfs_attr_leaf_entry_t);
-		bytes -= sizeof(xfs_attr_leaf_hdr_t);
+		xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
+
+		bytes = state->blocksize - (state->blocksize >> 2) -
+			ichdr.usedbytes - ichdr2.usedbytes -
+			((ichdr.count + ichdr2.count) *
+					sizeof(xfs_attr_leaf_entry_t)) -
+			xfs_attr3_leaf_hdr_size(leaf);
+
 		xfs_trans_brelse(state->args->trans, bp);
 		if (bytes >= 0)
 			break;	/* fits with at least 25% to spare */
@@ -1715,10 +1959,10 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
 	 */
 	memcpy(&state->altpath, &state->path, sizeof(state->path));
 	if (blkno < blk->blkno) {
-		error = xfs_da_path_shift(state, &state->altpath, forward,
+		error = xfs_da3_path_shift(state, &state->altpath, forward,
 						 0, &retval);
 	} else {
-		error = xfs_da_path_shift(state, &state->path, forward,
+		error = xfs_da3_path_shift(state, &state->path, forward,
 						 0, &retval);
 	}
 	if (error)
@@ -1738,32 +1982,35 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
  * If two leaves are 37% full, when combined they will leave 25% free.
  */
 int
-xfs_attr_leaf_remove(
-	struct xfs_buf	*bp,
-	xfs_da_args_t	*args)
+xfs_attr3_leaf_remove(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args)
 {
-	xfs_attr_leafblock_t *leaf;
-	xfs_attr_leaf_hdr_t *hdr;
-	xfs_attr_leaf_map_t *map;
-	xfs_attr_leaf_entry_t *entry;
-	int before, after, smallest, entsize;
-	int tablesize, tmp, i;
-	xfs_mount_t *mp;
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_mount	*mp = args->trans->t_mountp;
+	int			before;
+	int			after;
+	int			smallest;
+	int			entsize;
+	int			tablesize;
+	int			tmp;
+	int			i;
 
 	trace_xfs_attr_leaf_remove(args);
 
 	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-	hdr = &leaf->hdr;
-	mp = args->trans->t_mountp;
-	ASSERT((be16_to_cpu(hdr->count) > 0)
-		&& (be16_to_cpu(hdr->count) < (XFS_LBSIZE(mp)/8)));
-	ASSERT((args->index >= 0)
-		&& (args->index < be16_to_cpu(hdr->count)));
-	ASSERT(be16_to_cpu(hdr->firstused) >=
-	       ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr)));
-	entry = &leaf->entries[args->index];
-	ASSERT(be16_to_cpu(entry->nameidx) >= be16_to_cpu(hdr->firstused));
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+
+	ASSERT(ichdr.count > 0 && ichdr.count < XFS_LBSIZE(mp) / 8);
+	ASSERT(args->index >= 0 && args->index < ichdr.count);
+	ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
+					xfs_attr3_leaf_hdr_size(leaf));
+
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+	ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
 	ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
 
 	/*
@@ -1772,30 +2019,28 @@ xfs_attr_leaf_remove(
 	 *    find smallest free region in case we need to replace it,
 	 *    adjust any map that borders the entry table,
 	 */
-	tablesize = be16_to_cpu(hdr->count) * sizeof(xfs_attr_leaf_entry_t)
-					+ sizeof(xfs_attr_leaf_hdr_t);
-	map = &hdr->freemap[0];
-	tmp = be16_to_cpu(map->size);
+	tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf);
+	tmp = ichdr.freemap[0].size;
 	before = after = -1;
 	smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
 	entsize = xfs_attr_leaf_entsize(leaf, args->index);
-	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
-		ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp));
-		ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
-		if (be16_to_cpu(map->base) == tablesize) {
-			be16_add_cpu(&map->base,
-				 -((int)sizeof(xfs_attr_leaf_entry_t)));
-			be16_add_cpu(&map->size, sizeof(xfs_attr_leaf_entry_t));
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+		ASSERT(ichdr.freemap[i].base < XFS_LBSIZE(mp));
+		ASSERT(ichdr.freemap[i].size < XFS_LBSIZE(mp));
+		if (ichdr.freemap[i].base == tablesize) {
+			ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
+			ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
 		}
 
-		if ((be16_to_cpu(map->base) + be16_to_cpu(map->size))
-				== be16_to_cpu(entry->nameidx)) {
+		if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
+				be16_to_cpu(entry->nameidx)) {
 			before = i;
-		} else if (be16_to_cpu(map->base)
-			== (be16_to_cpu(entry->nameidx) + entsize)) {
+		} else if (ichdr.freemap[i].base ==
+				(be16_to_cpu(entry->nameidx) + entsize)) {
 			after = i;
-		} else if (be16_to_cpu(map->size) < tmp) {
-			tmp = be16_to_cpu(map->size);
+		} else if (ichdr.freemap[i].size < tmp) {
+			tmp = ichdr.freemap[i].size;
 			smallest = i;
 		}
 	}
@@ -1806,36 +2051,30 @@ xfs_attr_leaf_remove(
 	 */
 	if ((before >= 0) || (after >= 0)) {
 		if ((before >= 0) && (after >= 0)) {
-			map = &hdr->freemap[before];
-			be16_add_cpu(&map->size, entsize);
-			be16_add_cpu(&map->size,
-				 be16_to_cpu(hdr->freemap[after].size));
-			hdr->freemap[after].base = 0;
-			hdr->freemap[after].size = 0;
+			ichdr.freemap[before].size += entsize;
+			ichdr.freemap[before].size += ichdr.freemap[after].size;
+			ichdr.freemap[after].base = 0;
+			ichdr.freemap[after].size = 0;
 		} else if (before >= 0) {
-			map = &hdr->freemap[before];
-			be16_add_cpu(&map->size, entsize);
+			ichdr.freemap[before].size += entsize;
 		} else {
-			map = &hdr->freemap[after];
-			/* both on-disk, don't endian flip twice */
-			map->base = entry->nameidx;
-			be16_add_cpu(&map->size, entsize);
+			ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
+			ichdr.freemap[after].size += entsize;
 		}
 	} else {
 		/*
 		 * Replace smallest region (if it is smaller than free'd entry)
 		 */
-		map = &hdr->freemap[smallest];
-		if (be16_to_cpu(map->size) < entsize) {
-			map->base = cpu_to_be16(be16_to_cpu(entry->nameidx));
-			map->size = cpu_to_be16(entsize);
+		if (ichdr.freemap[smallest].size < entsize) {
+			ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
+			ichdr.freemap[smallest].size = entsize;
 		}
 	}
 
 	/*
 	 * Did we remove the first entry?
 	 */
-	if (be16_to_cpu(entry->nameidx) == be16_to_cpu(hdr->firstused))
+	if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
 		smallest = 1;
 	else
 		smallest = 0;
@@ -1843,20 +2082,20 @@ xfs_attr_leaf_remove(
 	/*
 	 * Compress the remaining entries and zero out the removed stuff.
 	 */
-	memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize);
-	be16_add_cpu(&hdr->usedbytes, -entsize);
+	memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
+	ichdr.usedbytes -= entsize;
 	xfs_trans_log_buf(args->trans, bp,
-	     XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
+	     XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
 				   entsize));
 
-	tmp = (be16_to_cpu(hdr->count) - args->index)
-					* sizeof(xfs_attr_leaf_entry_t);
-	memmove((char *)entry, (char *)(entry+1), tmp);
-	be16_add_cpu(&hdr->count, -1);
+	tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
+	memmove(entry, entry + 1, tmp);
+	ichdr.count--;
 	xfs_trans_log_buf(args->trans, bp,
-	    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
-	entry = &leaf->entries[be16_to_cpu(hdr->count)];
-	memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t));
+	    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));
+
+	entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
+	memset(entry, 0, sizeof(xfs_attr_leaf_entry_t));
 
 	/*
 	 * If we removed the first entry, re-find the first used byte
@@ -1866,130 +2105,140 @@ xfs_attr_leaf_remove(
 	 */
 	if (smallest) {
 		tmp = XFS_LBSIZE(mp);
-		entry = &leaf->entries[0];
-		for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) {
-			ASSERT(be16_to_cpu(entry->nameidx) >=
-			       be16_to_cpu(hdr->firstused));
+		entry = xfs_attr3_leaf_entryp(leaf);
+		for (i = ichdr.count - 1; i >= 0; entry++, i--) {
+			ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
 			ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
 
 			if (be16_to_cpu(entry->nameidx) < tmp)
 				tmp = be16_to_cpu(entry->nameidx);
 		}
-		hdr->firstused = cpu_to_be16(tmp);
-		if (!hdr->firstused) {
-			hdr->firstused = cpu_to_be16(
-					tmp - XFS_ATTR_LEAF_NAME_ALIGN);
-		}
+		ichdr.firstused = tmp;
+		if (!ichdr.firstused)
+			ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
 	} else {
-		hdr->holes = 1;		/* mark as needing compaction */
+		ichdr.holes = 1;	/* mark as needing compaction */
 	}
+	xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
 	xfs_trans_log_buf(args->trans, bp,
-			  XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+			  XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+					  xfs_attr3_leaf_hdr_size(leaf)));
 
 	/*
 	 * Check if leaf is less than 50% full, caller may want to
 	 * "join" the leaf with a sibling if so.
 	 */
-	tmp  = sizeof(xfs_attr_leaf_hdr_t);
-	tmp += be16_to_cpu(leaf->hdr.count) * sizeof(xfs_attr_leaf_entry_t);
-	tmp += be16_to_cpu(leaf->hdr.usedbytes);
-	return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */
+	tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
+	      ichdr.count * sizeof(xfs_attr_leaf_entry_t);
+
+	return tmp < mp->m_attr_magicpct; /* leaf is < 37% full */
 }
 
 /*
  * Move all the attribute list entries from drop_leaf into save_leaf.
  */
 void
-xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
-				       xfs_da_state_blk_t *save_blk)
+xfs_attr3_leaf_unbalance(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*drop_blk,
+	struct xfs_da_state_blk	*save_blk)
 {
-	xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
-	xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
-	xfs_mount_t *mp;
-	char *tmpbuffer;
+	struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
+	struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
+	struct xfs_attr3_icleaf_hdr drophdr;
+	struct xfs_attr3_icleaf_hdr savehdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_mount	*mp = state->mp;
 
 	trace_xfs_attr_leaf_unbalance(state->args);
 
-	/*
-	 * Set up environment.
-	 */
-	mp = state->mp;
-	ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC);
-	ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC);
 	drop_leaf = drop_blk->bp->b_addr;
 	save_leaf = save_blk->bp->b_addr;
-	ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-	ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-	drop_hdr = &drop_leaf->hdr;
-	save_hdr = &save_leaf->hdr;
+	xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf);
+	xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf);
+	entry = xfs_attr3_leaf_entryp(drop_leaf);
 
 	/*
 	 * Save last hashval from dying block for later Btree fixup.
 	 */
-	drop_blk->hashval = be32_to_cpu(
-		drop_leaf->entries[be16_to_cpu(drop_leaf->hdr.count)-1].hashval);
+	drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);
 
 	/*
 	 * Check if we need a temp buffer, or can we do it in place.
 	 * Note that we don't check "leaf" for holes because we will
 	 * always be dropping it, toosmall() decided that for us already.
 	 */
-	if (save_hdr->holes == 0) {
+	if (savehdr.holes == 0) {
 		/*
 		 * dest leaf has no holes, so we add there.  May need
 		 * to make some room in the entry array.
 		 */
-		if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
-			xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0,
-			     be16_to_cpu(drop_hdr->count), mp);
+		if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+					 drop_blk->bp, &drophdr)) {
+			xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+						save_leaf, &savehdr, 0,
+						drophdr.count, mp);
 		} else {
-			xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf,
-				  be16_to_cpu(save_hdr->count),
-				  be16_to_cpu(drop_hdr->count), mp);
+			xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+						save_leaf, &savehdr,
+						savehdr.count, drophdr.count, mp);
 		}
 	} else {
 		/*
 		 * Destination has holes, so we make a temporary copy
 		 * of the leaf and add them both to that.
 		 */
-		tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
-		ASSERT(tmpbuffer != NULL);
-		memset(tmpbuffer, 0, state->blocksize);
-		tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer;
-		tmp_hdr = &tmp_leaf->hdr;
-		tmp_hdr->info = save_hdr->info;	/* struct copy */
-		tmp_hdr->count = 0;
-		tmp_hdr->firstused = cpu_to_be16(state->blocksize);
-		if (!tmp_hdr->firstused) {
-			tmp_hdr->firstused = cpu_to_be16(
-				state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN);
-		}
-		tmp_hdr->usedbytes = 0;
-		if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
-			xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
-				be16_to_cpu(drop_hdr->count), mp);
-			xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf,
-				  be16_to_cpu(tmp_leaf->hdr.count),
-				  be16_to_cpu(save_hdr->count), mp);
+		struct xfs_attr_leafblock *tmp_leaf;
+		struct xfs_attr3_icleaf_hdr tmphdr;
+
+		tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP);
+
+		/*
+		 * Copy the header into the temp leaf so that all the stuff
+		 * not in the incore header is present and gets copied back in
+		 * once we've moved all the entries.
+		 */
+		memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
+
+		memset(&tmphdr, 0, sizeof(tmphdr));
+		tmphdr.magic = savehdr.magic;
+		tmphdr.forw = savehdr.forw;
+		tmphdr.back = savehdr.back;
+		tmphdr.firstused = state->blocksize;
+
+		/* write the header to the temp buffer to initialise it */
+		xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
+
+		if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+					 drop_blk->bp, &drophdr)) {
+			xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+						tmp_leaf, &tmphdr, 0,
+						drophdr.count, mp);
+			xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0,
+						tmp_leaf, &tmphdr, tmphdr.count,
+						savehdr.count, mp);
 		} else {
-			xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
-				be16_to_cpu(save_hdr->count), mp);
-			xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf,
-				be16_to_cpu(tmp_leaf->hdr.count),
-				be16_to_cpu(drop_hdr->count), mp);
+			xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0,
+						tmp_leaf, &tmphdr, 0,
+						savehdr.count, mp);
+			xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+						tmp_leaf, &tmphdr, tmphdr.count,
+						drophdr.count, mp);
 		}
-		memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize);
-		kmem_free(tmpbuffer);
+		memcpy(save_leaf, tmp_leaf, state->blocksize);
+		savehdr = tmphdr; /* struct copy */
+		kmem_free(tmp_leaf);
 	}
 
+	xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
 	xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
 					   state->blocksize - 1);
 
 	/*
 	 * Copy out last hashval in each block for B-tree code.
 	 */
-	save_blk->hashval = be32_to_cpu(
-		save_leaf->entries[be16_to_cpu(save_leaf->hdr.count)-1].hashval);
+	entry = xfs_attr3_leaf_entryp(save_leaf);
+	save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
 }
 
 /*========================================================================
@@ -2010,31 +2259,33 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
  * Don't change the args->value unless we find the attribute.
  */
 int
-xfs_attr_leaf_lookup_int(
-	struct xfs_buf	*bp,
-	xfs_da_args_t	*args)
+xfs_attr3_leaf_lookup_int(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args)
 {
-	xfs_attr_leafblock_t *leaf;
-	xfs_attr_leaf_entry_t *entry;
-	xfs_attr_leaf_name_local_t *name_loc;
-	xfs_attr_leaf_name_remote_t *name_rmt;
-	int probe, span;
-	xfs_dahash_t hashval;
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_entry *entries;
+	struct xfs_attr_leaf_name_local *name_loc;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	xfs_dahash_t		hashval;
+	int			probe;
+	int			span;
 
 	trace_xfs_attr_leaf_lookup(args);
 
 	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-	ASSERT(be16_to_cpu(leaf->hdr.count)
-					< (XFS_LBSIZE(args->dp->i_mount)/8));
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	entries = xfs_attr3_leaf_entryp(leaf);
+	ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8);
 
 	/*
 	 * Binary search.  (note: small blocks will skip this loop)
 	 */
 	hashval = args->hashval;
-	probe = span = be16_to_cpu(leaf->hdr.count) / 2;
-	for (entry = &leaf->entries[probe]; span > 4;
-		   entry = &leaf->entries[probe]) {
+	probe = span = ichdr.count / 2;
+	for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
 		span /= 2;
 		if (be32_to_cpu(entry->hashval) < hashval)
 			probe += span;
@@ -2043,35 +2294,31 @@ xfs_attr_leaf_lookup_int(
 		else
 			break;
 	}
-	ASSERT((probe >= 0) &&
-	       (!leaf->hdr.count
-	       || (probe < be16_to_cpu(leaf->hdr.count))));
-	ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval));
+	ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count));
+	ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval);
 
 	/*
 	 * Since we may have duplicate hashval's, find the first matching
 	 * hashval in the leaf.
 	 */
-	while ((probe > 0) && (be32_to_cpu(entry->hashval) >= hashval)) {
+	while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
 		entry--;
 		probe--;
 	}
-	while ((probe < be16_to_cpu(leaf->hdr.count)) &&
-	       (be32_to_cpu(entry->hashval) < hashval)) {
+	while (probe < ichdr.count &&
+	       be32_to_cpu(entry->hashval) < hashval) {
 		entry++;
 		probe++;
 	}
-	if ((probe == be16_to_cpu(leaf->hdr.count)) ||
-	    (be32_to_cpu(entry->hashval) != hashval)) {
+	if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
 		args->index = probe;
-		return(XFS_ERROR(ENOATTR));
+		return XFS_ERROR(ENOATTR);
 	}
 
 	/*
 	 * Duplicate keys may be present, so search all of them for a match.
 	 */
-	for (  ; (probe < be16_to_cpu(leaf->hdr.count)) &&
-			(be32_to_cpu(entry->hashval) == hashval);
+	for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
 			entry++, probe++) {
 /*
  * GROT: Add code to remove incomplete entries.
@@ -2085,33 +2332,36 @@ xfs_attr_leaf_lookup_int(
 			continue;
 		}
 		if (entry->flags & XFS_ATTR_LOCAL) {
-			name_loc = xfs_attr_leaf_name_local(leaf, probe);
+			name_loc = xfs_attr3_leaf_name_local(leaf, probe);
 			if (name_loc->namelen != args->namelen)
 				continue;
-			if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0)
+			if (memcmp(args->name, name_loc->nameval,
+							args->namelen) != 0)
 				continue;
 			if (!xfs_attr_namesp_match(args->flags, entry->flags))
 				continue;
 			args->index = probe;
-			return(XFS_ERROR(EEXIST));
+			return XFS_ERROR(EEXIST);
 		} else {
-			name_rmt = xfs_attr_leaf_name_remote(leaf, probe);
+			name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
 			if (name_rmt->namelen != args->namelen)
 				continue;
-			if (memcmp(args->name, (char *)name_rmt->name,
-					     args->namelen) != 0)
+			if (memcmp(args->name, name_rmt->name,
+							args->namelen) != 0)
 				continue;
 			if (!xfs_attr_namesp_match(args->flags, entry->flags))
 				continue;
 			args->index = probe;
+			args->valuelen = be32_to_cpu(name_rmt->valuelen);
 			args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
-			args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount,
-						   be32_to_cpu(name_rmt->valuelen));
-			return(XFS_ERROR(EEXIST));
+			args->rmtblkcnt = xfs_attr3_rmt_blocks(
+							args->dp->i_mount,
+							args->valuelen);
+			return XFS_ERROR(EEXIST);
 		}
 	}
 	args->index = probe;
-	return(XFS_ERROR(ENOATTR));
+	return XFS_ERROR(ENOATTR);
 }
 
 /*
@@ -2119,56 +2369,57 @@ xfs_attr_leaf_lookup_int(
  * list structure.
  */
 int
-xfs_attr_leaf_getvalue(
-	struct xfs_buf	*bp,
-	xfs_da_args_t	*args)
+xfs_attr3_leaf_getvalue(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args)
 {
-	int valuelen;
-	xfs_attr_leafblock_t *leaf;
-	xfs_attr_leaf_entry_t *entry;
-	xfs_attr_leaf_name_local_t *name_loc;
-	xfs_attr_leaf_name_remote_t *name_rmt;
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_local *name_loc;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	int			valuelen;
 
 	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-	ASSERT(be16_to_cpu(leaf->hdr.count)
-					< (XFS_LBSIZE(args->dp->i_mount)/8));
-	ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8);
+	ASSERT(args->index < ichdr.count);
 
-	entry = &leaf->entries[args->index];
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
 	if (entry->flags & XFS_ATTR_LOCAL) {
-		name_loc = xfs_attr_leaf_name_local(leaf, args->index);
+		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
 		ASSERT(name_loc->namelen == args->namelen);
 		ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
 		valuelen = be16_to_cpu(name_loc->valuelen);
 		if (args->flags & ATTR_KERNOVAL) {
 			args->valuelen = valuelen;
-			return(0);
+			return 0;
 		}
 		if (args->valuelen < valuelen) {
 			args->valuelen = valuelen;
-			return(XFS_ERROR(ERANGE));
+			return XFS_ERROR(ERANGE);
 		}
 		args->valuelen = valuelen;
 		memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
 	} else {
-		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
 		ASSERT(name_rmt->namelen == args->namelen);
 		ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
 		valuelen = be32_to_cpu(name_rmt->valuelen);
 		args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
-		args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen);
+		args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+						       valuelen);
 		if (args->flags & ATTR_KERNOVAL) {
 			args->valuelen = valuelen;
-			return(0);
+			return 0;
 		}
 		if (args->valuelen < valuelen) {
 			args->valuelen = valuelen;
-			return(XFS_ERROR(ERANGE));
+			return XFS_ERROR(ERANGE);
 		}
 		args->valuelen = valuelen;
 	}
-	return(0);
+	return 0;
 }
 
 /*========================================================================
@@ -2181,13 +2432,21 @@ xfs_attr_leaf_getvalue(
  */
 /*ARGSUSED*/
 STATIC void
-xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
-			xfs_attr_leafblock_t *leaf_d, int start_d,
-			int count, xfs_mount_t *mp)
+xfs_attr3_leaf_moveents(
+	struct xfs_attr_leafblock	*leaf_s,
+	struct xfs_attr3_icleaf_hdr	*ichdr_s,
+	int				start_s,
+	struct xfs_attr_leafblock	*leaf_d,
+	struct xfs_attr3_icleaf_hdr	*ichdr_d,
+	int				start_d,
+	int				count,
+	struct xfs_mount		*mp)
 {
-	xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
-	xfs_attr_leaf_entry_t *entry_s, *entry_d;
-	int desti, tmp, i;
+	struct xfs_attr_leaf_entry	*entry_s;
+	struct xfs_attr_leaf_entry	*entry_d;
+	int				desti;
+	int				tmp;
+	int				i;
 
 	/*
 	 * Check for nothing to do.
@@ -2198,45 +2457,41 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
 	/*
 	 * Set up environment.
 	 */
-	ASSERT(leaf_s->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-	ASSERT(leaf_d->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-	hdr_s = &leaf_s->hdr;
-	hdr_d = &leaf_d->hdr;
-	ASSERT((be16_to_cpu(hdr_s->count) > 0) &&
-	       (be16_to_cpu(hdr_s->count) < (XFS_LBSIZE(mp)/8)));
-	ASSERT(be16_to_cpu(hdr_s->firstused) >=
-		((be16_to_cpu(hdr_s->count)
-					* sizeof(*entry_s))+sizeof(*hdr_s)));
-	ASSERT(be16_to_cpu(hdr_d->count) < (XFS_LBSIZE(mp)/8));
-	ASSERT(be16_to_cpu(hdr_d->firstused) >=
-		((be16_to_cpu(hdr_d->count)
-					* sizeof(*entry_d))+sizeof(*hdr_d)));
-
-	ASSERT(start_s < be16_to_cpu(hdr_s->count));
-	ASSERT(start_d <= be16_to_cpu(hdr_d->count));
-	ASSERT(count <= be16_to_cpu(hdr_s->count));
+	ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
+	       ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
+	ASSERT(ichdr_s->magic == ichdr_d->magic);
+	ASSERT(ichdr_s->count > 0 && ichdr_s->count < XFS_LBSIZE(mp) / 8);
+	ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
+					+ xfs_attr3_leaf_hdr_size(leaf_s));
+	ASSERT(ichdr_d->count < XFS_LBSIZE(mp) / 8);
+	ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
+					+ xfs_attr3_leaf_hdr_size(leaf_d));
+
+	ASSERT(start_s < ichdr_s->count);
+	ASSERT(start_d <= ichdr_d->count);
+	ASSERT(count <= ichdr_s->count);
+
 
 	/*
 	 * Move the entries in the destination leaf up to make a hole?
 	 */
-	if (start_d < be16_to_cpu(hdr_d->count)) {
-		tmp  = be16_to_cpu(hdr_d->count) - start_d;
+	if (start_d < ichdr_d->count) {
+		tmp  = ichdr_d->count - start_d;
 		tmp *= sizeof(xfs_attr_leaf_entry_t);
-		entry_s = &leaf_d->entries[start_d];
-		entry_d = &leaf_d->entries[start_d + count];
-		memmove((char *)entry_d, (char *)entry_s, tmp);
+		entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+		entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
+		memmove(entry_d, entry_s, tmp);
 	}
 
 	/*
 	 * Copy all entry's in the same (sorted) order,
 	 * but allocate attribute info packed and in sequence.
 	 */
-	entry_s = &leaf_s->entries[start_s];
-	entry_d = &leaf_d->entries[start_d];
+	entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+	entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
 	desti = start_d;
 	for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
-		ASSERT(be16_to_cpu(entry_s->nameidx)
-				>= be16_to_cpu(hdr_s->firstused));
+		ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
 		tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
 #ifdef GROT
 		/*
@@ -2245,36 +2500,34 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
 		 * off for 6.2, should be revisited later.
 		 */
 		if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
-			memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
-			be16_add_cpu(&hdr_s->usedbytes, -tmp);
-			be16_add_cpu(&hdr_s->count, -1);
+			memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+			ichdr_s->usedbytes -= tmp;
+			ichdr_s->count -= 1;
 			entry_d--;	/* to compensate for ++ in loop hdr */
 			desti--;
 			if ((start_s + i) < offset)
 				result++;	/* insertion index adjustment */
 		} else {
 #endif /* GROT */
-			be16_add_cpu(&hdr_d->firstused, -tmp);
+			ichdr_d->firstused -= tmp;
 			/* both on-disk, don't endian flip twice */
 			entry_d->hashval = entry_s->hashval;
-			/* both on-disk, don't endian flip twice */
-			entry_d->nameidx = hdr_d->firstused;
+			entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
 			entry_d->flags = entry_s->flags;
 			ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
 							<= XFS_LBSIZE(mp));
-			memmove(xfs_attr_leaf_name(leaf_d, desti),
-				xfs_attr_leaf_name(leaf_s, start_s + i), tmp);
+			memmove(xfs_attr3_leaf_name(leaf_d, desti),
+				xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
 			ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
 							<= XFS_LBSIZE(mp));
-			memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
-			be16_add_cpu(&hdr_s->usedbytes, -tmp);
-			be16_add_cpu(&hdr_d->usedbytes, tmp);
-			be16_add_cpu(&hdr_s->count, -1);
-			be16_add_cpu(&hdr_d->count, 1);
-			tmp = be16_to_cpu(hdr_d->count)
-						* sizeof(xfs_attr_leaf_entry_t)
-						+ sizeof(xfs_attr_leaf_hdr_t);
-			ASSERT(be16_to_cpu(hdr_d->firstused) >= tmp);
+			memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+			ichdr_s->usedbytes -= tmp;
+			ichdr_d->usedbytes += tmp;
+			ichdr_s->count -= 1;
+			ichdr_d->count += 1;
+			tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf_d);
+			ASSERT(ichdr_d->firstused >= tmp);
 #ifdef GROT
 		}
 #endif /* GROT */
@@ -2283,71 +2536,40 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
 	/*
 	 * Zero out the entries we just copied.
 	 */
-	if (start_s == be16_to_cpu(hdr_s->count)) {
+	if (start_s == ichdr_s->count) {
 		tmp = count * sizeof(xfs_attr_leaf_entry_t);
-		entry_s = &leaf_s->entries[start_s];
+		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
 		ASSERT(((char *)entry_s + tmp) <=
 		       ((char *)leaf_s + XFS_LBSIZE(mp)));
-		memset((char *)entry_s, 0, tmp);
+		memset(entry_s, 0, tmp);
 	} else {
 		/*
 		 * Move the remaining entries down to fill the hole,
 		 * then zero the entries at the top.
 		 */
-		tmp  = be16_to_cpu(hdr_s->count) - count;
-		tmp *= sizeof(xfs_attr_leaf_entry_t);
-		entry_s = &leaf_s->entries[start_s + count];
-		entry_d = &leaf_s->entries[start_s];
-		memmove((char *)entry_d, (char *)entry_s, tmp);
+		tmp  = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
+		entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+		memmove(entry_d, entry_s, tmp);
 
 		tmp = count * sizeof(xfs_attr_leaf_entry_t);
-		entry_s = &leaf_s->entries[be16_to_cpu(hdr_s->count)];
+		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
 		ASSERT(((char *)entry_s + tmp) <=
 		       ((char *)leaf_s + XFS_LBSIZE(mp)));
-		memset((char *)entry_s, 0, tmp);
+		memset(entry_s, 0, tmp);
 	}
 
 	/*
 	 * Fill in the freemap information
 	 */
-	hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t));
-	be16_add_cpu(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) *
-			sizeof(xfs_attr_leaf_entry_t));
-	hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused)
-			      - be16_to_cpu(hdr_d->freemap[0].base));
-	hdr_d->freemap[1].base = 0;
-	hdr_d->freemap[2].base = 0;
-	hdr_d->freemap[1].size = 0;
-	hdr_d->freemap[2].size = 0;
-	hdr_s->holes = 1;	/* leaf may not be compact */
-}
-
-/*
- * Compare two leaf blocks "order".
- * Return 0 unless leaf2 should go before leaf1.
- */
-int
-xfs_attr_leaf_order(
-	struct xfs_buf	*leaf1_bp,
-	struct xfs_buf	*leaf2_bp)
-{
-	xfs_attr_leafblock_t *leaf1, *leaf2;
-
-	leaf1 = leaf1_bp->b_addr;
-	leaf2 = leaf2_bp->b_addr;
-	ASSERT((leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) &&
-	       (leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)));
-	if ((be16_to_cpu(leaf1->hdr.count) > 0) &&
-	    (be16_to_cpu(leaf2->hdr.count) > 0) &&
-	    ((be32_to_cpu(leaf2->entries[0].hashval) <
-	      be32_to_cpu(leaf1->entries[0].hashval)) ||
-	     (be32_to_cpu(leaf2->entries[
-			be16_to_cpu(leaf2->hdr.count)-1].hashval) <
-	      be32_to_cpu(leaf1->entries[
-			be16_to_cpu(leaf1->hdr.count)-1].hashval)))) {
-		return(1);
-	}
-	return(0);
+	ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
+	ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
+	ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
+	ichdr_d->freemap[1].base = 0;
+	ichdr_d->freemap[2].base = 0;
+	ichdr_d->freemap[1].size = 0;
+	ichdr_d->freemap[2].size = 0;
+	ichdr_s->holes = 1;	/* leaf may not be compact */
 }
 
 /*
@@ -2358,15 +2580,16 @@ xfs_attr_leaf_lasthash(
 	struct xfs_buf	*bp,
 	int		*count)
 {
-	xfs_attr_leafblock_t *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entries;
 
-	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr);
+	entries = xfs_attr3_leaf_entryp(bp->b_addr);
 	if (count)
-		*count = be16_to_cpu(leaf->hdr.count);
-	if (!leaf->hdr.count)
-		return(0);
-	return be32_to_cpu(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval);
+		*count = ichdr.count;
+	if (!ichdr.count)
+		return 0;
+	return be32_to_cpu(entries[ichdr.count - 1].hashval);
 }
 
 /*
@@ -2376,20 +2599,21 @@ xfs_attr_leaf_lasthash(
 STATIC int
 xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
 {
+	struct xfs_attr_leaf_entry *entries;
 	xfs_attr_leaf_name_local_t *name_loc;
 	xfs_attr_leaf_name_remote_t *name_rmt;
 	int size;
 
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-	if (leaf->entries[index].flags & XFS_ATTR_LOCAL) {
-		name_loc = xfs_attr_leaf_name_local(leaf, index);
+	entries = xfs_attr3_leaf_entryp(leaf);
+	if (entries[index].flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf, index);
 		size = xfs_attr_leaf_entsize_local(name_loc->namelen,
 						   be16_to_cpu(name_loc->valuelen));
 	} else {
-		name_rmt = xfs_attr_leaf_name_remote(leaf, index);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, index);
 		size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
 	}
-	return(size);
+	return size;
 }
 
 /*
@@ -2414,35 +2638,40 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
 			*local = 0;
 		}
 	}
-	return(size);
+	return size;
 }
 
 /*
  * Copy out attribute list entries for attr_list(), for leaf attribute lists.
  */
 int
-xfs_attr_leaf_list_int(
-	struct xfs_buf		*bp,
-	xfs_attr_list_context_t	*context)
+xfs_attr3_leaf_list_int(
+	struct xfs_buf			*bp,
+	struct xfs_attr_list_context	*context)
 {
-	attrlist_cursor_kern_t *cursor;
-	xfs_attr_leafblock_t *leaf;
-	xfs_attr_leaf_entry_t *entry;
-	int retval, i;
+	struct attrlist_cursor_kern	*cursor;
+	struct xfs_attr_leafblock	*leaf;
+	struct xfs_attr3_icleaf_hdr	ichdr;
+	struct xfs_attr_leaf_entry	*entries;
+	struct xfs_attr_leaf_entry	*entry;
+	int				retval;
+	int				i;
+
+	trace_xfs_attr_list_leaf(context);
 
-	ASSERT(bp != NULL);
 	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	entries = xfs_attr3_leaf_entryp(leaf);
+
 	cursor = context->cursor;
 	cursor->initted = 1;
 
-	trace_xfs_attr_list_leaf(context);
-
 	/*
 	 * Re-find our place in the leaf block if this is a new syscall.
 	 */
 	if (context->resynch) {
-		entry = &leaf->entries[0];
-		for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+		entry = &entries[0];
+		for (i = 0; i < ichdr.count; entry++, i++) {
 			if (be32_to_cpu(entry->hashval) == cursor->hashval) {
 				if (cursor->offset == context->dupcnt) {
 					context->dupcnt = 0;
@@ -2455,12 +2684,12 @@ xfs_attr_leaf_list_int(
 				break;
 			}
 		}
-		if (i == be16_to_cpu(leaf->hdr.count)) {
+		if (i == ichdr.count) {
 			trace_xfs_attr_list_notfound(context);
-			return(0);
+			return 0;
 		}
 	} else {
-		entry = &leaf->entries[0];
+		entry = &entries[0];
 		i = 0;
 	}
 	context->resynch = 0;
@@ -2469,7 +2698,7 @@ xfs_attr_leaf_list_int(
 	 * We have found our place, start copying out the new attributes.
 	 */
 	retval = 0;
-	for (  ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) {
+	for (; i < ichdr.count; entry++, i++) {
 		if (be32_to_cpu(entry->hashval) != cursor->hashval) {
 			cursor->hashval = be32_to_cpu(entry->hashval);
 			cursor->offset = 0;
@@ -2480,7 +2709,7 @@ xfs_attr_leaf_list_int(
 
 		if (entry->flags & XFS_ATTR_LOCAL) {
 			xfs_attr_leaf_name_local_t *name_loc =
-				xfs_attr_leaf_name_local(leaf, i);
+				xfs_attr3_leaf_name_local(leaf, i);
 
 			retval = context->put_listent(context,
 						entry->flags,
@@ -2492,7 +2721,7 @@ xfs_attr_leaf_list_int(
 				return retval;
 		} else {
 			xfs_attr_leaf_name_remote_t *name_rmt =
-				xfs_attr_leaf_name_remote(leaf, i);
+				xfs_attr3_leaf_name_remote(leaf, i);
 
 			int valuelen = be32_to_cpu(name_rmt->valuelen);
 
@@ -2505,7 +2734,8 @@ xfs_attr_leaf_list_int(
 				args.valuelen = valuelen;
 				args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
 				args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
-				args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
+				args.rmtblkcnt = xfs_attr3_rmt_blocks(
+							args.dp->i_mount, valuelen);
 				retval = xfs_attr_rmtval_get(&args);
 				if (retval)
 					return retval;
@@ -2532,7 +2762,7 @@ xfs_attr_leaf_list_int(
 		cursor->offset++;
 	}
 	trace_xfs_attr_list_leaf_end(context);
-	return(retval);
+	return retval;
 }
 
 
@@ -2544,14 +2774,16 @@ xfs_attr_leaf_list_int(
  * Clear the INCOMPLETE flag on an entry in a leaf block.
  */
 int
-xfs_attr_leaf_clearflag(xfs_da_args_t *args)
+xfs_attr3_leaf_clearflag(
+	struct xfs_da_args	*args)
 {
-	xfs_attr_leafblock_t *leaf;
-	xfs_attr_leaf_entry_t *entry;
-	xfs_attr_leaf_name_remote_t *name_rmt;
-	struct xfs_buf *bp;
-	int error;
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_buf		*bp;
+	int			error;
 #ifdef DEBUG
+	struct xfs_attr3_icleaf_hdr ichdr;
 	xfs_attr_leaf_name_local_t *name_loc;
 	int namelen;
 	char *name;
@@ -2561,23 +2793,25 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
 	/*
 	 * Set up the operation.
 	 */
-	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
 	if (error)
 		return(error);
 
 	leaf = bp->b_addr;
-	ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
-	ASSERT(args->index >= 0);
-	entry = &leaf->entries[ args->index ];
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
 	ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
 
 #ifdef DEBUG
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	ASSERT(args->index < ichdr.count);
+	ASSERT(args->index >= 0);
+
 	if (entry->flags & XFS_ATTR_LOCAL) {
-		name_loc = xfs_attr_leaf_name_local(leaf, args->index);
+		name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
 		namelen = name_loc->namelen;
 		name = (char *)name_loc->nameval;
 	} else {
-		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
 		namelen = name_rmt->namelen;
 		name = (char *)name_rmt->name;
 	}
@@ -2592,7 +2826,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
 
 	if (args->rmtblkno) {
 		ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
-		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
 		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
 		name_rmt->valuelen = cpu_to_be32(args->valuelen);
 		xfs_trans_log_buf(args->trans, bp,
@@ -2609,34 +2843,41 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
  * Set the INCOMPLETE flag on an entry in a leaf block.
  */
 int
-xfs_attr_leaf_setflag(xfs_da_args_t *args)
+xfs_attr3_leaf_setflag(
+	struct xfs_da_args	*args)
 {
-	xfs_attr_leafblock_t *leaf;
-	xfs_attr_leaf_entry_t *entry;
-	xfs_attr_leaf_name_remote_t *name_rmt;
-	struct xfs_buf *bp;
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_buf		*bp;
 	int error;
+#ifdef DEBUG
+	struct xfs_attr3_icleaf_hdr ichdr;
+#endif
 
 	trace_xfs_attr_leaf_setflag(args);
 
 	/*
 	 * Set up the operation.
 	 */
-	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
 	if (error)
 		return(error);
 
 	leaf = bp->b_addr;
-	ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
+#ifdef DEBUG
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+	ASSERT(args->index < ichdr.count);
 	ASSERT(args->index >= 0);
-	entry = &leaf->entries[ args->index ];
+#endif
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
 
 	ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
 	entry->flags |= XFS_ATTR_INCOMPLETE;
 	xfs_trans_log_buf(args->trans, bp,
 			XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
 	if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
-		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
 		name_rmt->valueblk = 0;
 		name_rmt->valuelen = 0;
 		xfs_trans_log_buf(args->trans, bp,
@@ -2657,14 +2898,20 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
  * Note that they could be in different blocks, or in the same block.
  */
 int
-xfs_attr_leaf_flipflags(xfs_da_args_t *args)
+xfs_attr3_leaf_flipflags(
+	struct xfs_da_args	*args)
 {
-	xfs_attr_leafblock_t *leaf1, *leaf2;
-	xfs_attr_leaf_entry_t *entry1, *entry2;
-	xfs_attr_leaf_name_remote_t *name_rmt;
-	struct xfs_buf *bp1, *bp2;
+	struct xfs_attr_leafblock *leaf1;
+	struct xfs_attr_leafblock *leaf2;
+	struct xfs_attr_leaf_entry *entry1;
+	struct xfs_attr_leaf_entry *entry2;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_buf		*bp1;
+	struct xfs_buf		*bp2;
 	int error;
 #ifdef DEBUG
+	struct xfs_attr3_icleaf_hdr ichdr1;
+	struct xfs_attr3_icleaf_hdr ichdr2;
 	xfs_attr_leaf_name_local_t *name_loc;
 	int namelen1, namelen2;
 	char *name1, *name2;
@@ -2675,7 +2922,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 	/*
 	 * Read the block containing the "old" attr
 	 */
-	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
 	if (error)
 		return error;
 
@@ -2683,7 +2930,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 	 * Read the block containing the "new" attr, if it is different
 	 */
 	if (args->blkno2 != args->blkno) {
-		error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2,
+		error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
 					   -1, &bp2);
 		if (error)
 			return error;
@@ -2692,31 +2939,35 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 	}
 
 	leaf1 = bp1->b_addr;
-	ASSERT(args->index < be16_to_cpu(leaf1->hdr.count));
-	ASSERT(args->index >= 0);
-	entry1 = &leaf1->entries[ args->index ];
+	entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];
 
 	leaf2 = bp2->b_addr;
-	ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count));
-	ASSERT(args->index2 >= 0);
-	entry2 = &leaf2->entries[ args->index2 ];
+	entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
 
 #ifdef DEBUG
+	xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+	ASSERT(args->index < ichdr1.count);
+	ASSERT(args->index >= 0);
+
+	xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+	ASSERT(args->index2 < ichdr2.count);
+	ASSERT(args->index2 >= 0);
+
 	if (entry1->flags & XFS_ATTR_LOCAL) {
-		name_loc = xfs_attr_leaf_name_local(leaf1, args->index);
+		name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
 		namelen1 = name_loc->namelen;
 		name1 = (char *)name_loc->nameval;
 	} else {
-		name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
 		namelen1 = name_rmt->namelen;
 		name1 = (char *)name_rmt->name;
 	}
 	if (entry2->flags & XFS_ATTR_LOCAL) {
-		name_loc = xfs_attr_leaf_name_local(leaf2, args->index2);
+		name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
 		namelen2 = name_loc->namelen;
 		name2 = (char *)name_loc->nameval;
 	} else {
-		name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
 		namelen2 = name_rmt->namelen;
 		name2 = (char *)name_rmt->name;
 	}
@@ -2733,7 +2984,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 			  XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
 	if (args->rmtblkno) {
 		ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
-		name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
 		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
 		name_rmt->valuelen = cpu_to_be32(args->valuelen);
 		xfs_trans_log_buf(args->trans, bp1,
@@ -2744,7 +2995,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 	xfs_trans_log_buf(args->trans, bp2,
 			  XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
 	if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
-		name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
 		name_rmt->valueblk = 0;
 		name_rmt->valuelen = 0;
 		xfs_trans_log_buf(args->trans, bp2,
@@ -2756,7 +3007,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 	 */
 	error = xfs_trans_roll(&args->trans, args->dp);
 
-	return(error);
+	return error;
 }
 
 /*========================================================================
@@ -2768,12 +3019,14 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
  * We're doing a depth-first traversal in order to invalidate everything.
  */
 int
-xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
+xfs_attr3_root_inactive(
+	struct xfs_trans	**trans,
+	struct xfs_inode	*dp)
 {
-	xfs_da_blkinfo_t *info;
-	xfs_daddr_t blkno;
-	struct xfs_buf *bp;
-	int error;
+	struct xfs_da_blkinfo	*info;
+	struct xfs_buf		*bp;
+	xfs_daddr_t		blkno;
+	int			error;
 
 	/*
 	 * Read block 0 to see what we have to work with.
@@ -2781,40 +3034,46 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
 	 * the extents in reverse order the extent containing
 	 * block 0 must still be there.
 	 */
-	error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+	error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
 	if (error)
-		return(error);
-	blkno = XFS_BUF_ADDR(bp);
+		return error;
+	blkno = bp->b_bn;
 
 	/*
 	 * Invalidate the tree, even if the "tree" is only a single leaf block.
 	 * This is a depth-first traversal!
 	 */
 	info = bp->b_addr;
-	if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) {
-		error = xfs_attr_node_inactive(trans, dp, bp, 1);
-	} else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) {
-		error = xfs_attr_leaf_inactive(trans, dp, bp);
-	} else {
+	switch (info->magic) {
+	case cpu_to_be16(XFS_DA_NODE_MAGIC):
+	case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+		error = xfs_attr3_node_inactive(trans, dp, bp, 1);
+		break;
+	case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+	case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+		error = xfs_attr3_leaf_inactive(trans, dp, bp);
+		break;
+	default:
 		error = XFS_ERROR(EIO);
 		xfs_trans_brelse(*trans, bp);
+		break;
 	}
 	if (error)
-		return(error);
+		return error;
 
 	/*
 	 * Invalidate the incore copy of the root block.
 	 */
 	error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
 	if (error)
-		return(error);
+		return error;
 	xfs_trans_binval(*trans, bp);	/* remove from cache */
 	/*
 	 * Commit the invalidate and start the next transaction.
 	 */
 	error = xfs_trans_roll(trans, dp);
 
-	return (error);
+	return error;
 }
 
 /*
@@ -2822,7 +3081,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
  * We're doing a depth-first traversal in order to invalidate everything.
  */
 STATIC int
-xfs_attr_node_inactive(
+xfs_attr3_node_inactive(
 	struct xfs_trans **trans,
 	struct xfs_inode *dp,
 	struct xfs_buf	*bp,
@@ -2832,26 +3091,28 @@ xfs_attr_node_inactive(
 	xfs_da_intnode_t *node;
 	xfs_dablk_t child_fsb;
 	xfs_daddr_t parent_blkno, child_blkno;
-	int error, count, i;
+	int error, i;
 	struct xfs_buf *child_bp;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr ichdr;
 
 	/*
 	 * Since this code is recursive (gasp!) we must protect ourselves.
 	 */
 	if (level > XFS_DA_NODE_MAXDEPTH) {
 		xfs_trans_brelse(*trans, bp);	/* no locks for later trans */
-		return(XFS_ERROR(EIO));
+		return XFS_ERROR(EIO);
 	}
 
 	node = bp->b_addr;
-	ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-	parent_blkno = XFS_BUF_ADDR(bp);	/* save for re-read later */
-	count = be16_to_cpu(node->hdr.count);
-	if (!count) {
+	xfs_da3_node_hdr_from_disk(&ichdr, node);
+	parent_blkno = bp->b_bn;
+	if (!ichdr.count) {
 		xfs_trans_brelse(*trans, bp);
-		return(0);
+		return 0;
 	}
-	child_fsb = be32_to_cpu(node->btree[0].before);
+	btree = xfs_da3_node_tree_p(node);
+	child_fsb = be32_to_cpu(btree[0].before);
 	xfs_trans_brelse(*trans, bp);	/* no locks for later trans */
 
 	/*
@@ -2859,14 +3120,14 @@ xfs_attr_node_inactive(
 	 * over the leaves removing all of them.  If this is higher up
 	 * in the tree, recurse downward.
 	 */
-	for (i = 0; i < count; i++) {
+	for (i = 0; i < ichdr.count; i++) {
 		/*
 		 * Read the subsidiary block to see what we have to work with.
 		 * Don't do this in a transaction.  This is a depth-first
 		 * traversal of the tree so we may deal with many blocks
 		 * before we come back to this one.
 		 */
-		error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp,
+		error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
 						XFS_ATTR_FORK);
 		if (error)
 			return(error);
@@ -2878,18 +3139,24 @@ xfs_attr_node_inactive(
 			 * Invalidate the subtree, however we have to.
 			 */
 			info = child_bp->b_addr;
-			if (info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) {
-				error = xfs_attr_node_inactive(trans, dp,
-						child_bp, level+1);
-			} else if (info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) {
-				error = xfs_attr_leaf_inactive(trans, dp,
-						child_bp);
-			} else {
+			switch (info->magic) {
+			case cpu_to_be16(XFS_DA_NODE_MAGIC):
+			case cpu_to_be16(XFS_DA3_NODE_MAGIC):
+				error = xfs_attr3_node_inactive(trans, dp,
+							child_bp, level + 1);
+				break;
+			case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
+			case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
+				error = xfs_attr3_leaf_inactive(trans, dp,
+							child_bp);
+				break;
+			default:
 				error = XFS_ERROR(EIO);
 				xfs_trans_brelse(*trans, child_bp);
+				break;
 			}
 			if (error)
-				return(error);
+				return error;
 
 			/*
 			 * Remove the subsidiary block from the cache
@@ -2898,7 +3165,7 @@ xfs_attr_node_inactive(
 			error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
 				&child_bp, XFS_ATTR_FORK);
 			if (error)
-				return(error);
+				return error;
 			xfs_trans_binval(*trans, child_bp);
 		}
 
@@ -2906,12 +3173,12 @@ xfs_attr_node_inactive(
 		 * If we're not done, re-read the parent to get the next
 		 * child block number.
 		 */
-		if ((i+1) < count) {
-			error = xfs_da_node_read(*trans, dp, 0, parent_blkno,
+		if (i + 1 < ichdr.count) {
+			error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
 						 &bp, XFS_ATTR_FORK);
 			if (error)
-				return(error);
-			child_fsb = be32_to_cpu(node->btree[i+1].before);
+				return error;
+			child_fsb = be32_to_cpu(btree[i + 1].before);
 			xfs_trans_brelse(*trans, bp);
 		}
 		/*
@@ -2919,10 +3186,10 @@ xfs_attr_node_inactive(
 		 */
 		error = xfs_trans_roll(trans, dp);
 		if (error)
-			return (error);
+			return  error;
 	}
 
-	return(0);
+	return 0;
 }
 
 /*
@@ -2932,29 +3199,35 @@ xfs_attr_node_inactive(
  * caught holding something that the logging code wants to flush to disk.
  */
 STATIC int
-xfs_attr_leaf_inactive(
-	struct xfs_trans **trans,
-	struct xfs_inode *dp,
-	struct xfs_buf	*bp)
+xfs_attr3_leaf_inactive(
+	struct xfs_trans	**trans,
+	struct xfs_inode	*dp,
+	struct xfs_buf		*bp)
 {
-	xfs_attr_leafblock_t *leaf;
-	xfs_attr_leaf_entry_t *entry;
-	xfs_attr_leaf_name_remote_t *name_rmt;
-	xfs_attr_inactive_list_t *list, *lp;
-	int error, count, size, tmp, i;
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_attr_inactive_list *list;
+	struct xfs_attr_inactive_list *lp;
+	int			error;
+	int			count;
+	int			size;
+	int			tmp;
+	int			i;
 
 	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+	xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
 
 	/*
 	 * Count the number of "remote" value extents.
 	 */
 	count = 0;
-	entry = &leaf->entries[0];
-	for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+	entry = xfs_attr3_leaf_entryp(leaf);
+	for (i = 0; i < ichdr.count; entry++, i++) {
 		if (be16_to_cpu(entry->nameidx) &&
 		    ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
-			name_rmt = xfs_attr_leaf_name_remote(leaf, i);
+			name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
 			if (name_rmt->valueblk)
 				count++;
 		}
@@ -2965,27 +3238,27 @@ xfs_attr_leaf_inactive(
 	 */
 	if (count == 0) {
 		xfs_trans_brelse(*trans, bp);
-		return(0);
+		return 0;
 	}
 
 	/*
 	 * Allocate storage for a list of all the "remote" value extents.
 	 */
 	size = count * sizeof(xfs_attr_inactive_list_t);
-	list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP);
+	list = kmem_alloc(size, KM_SLEEP);
 
 	/*
 	 * Identify each of the "remote" value extents.
 	 */
 	lp = list;
-	entry = &leaf->entries[0];
-	for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
+	entry = xfs_attr3_leaf_entryp(leaf);
+	for (i = 0; i < ichdr.count; entry++, i++) {
 		if (be16_to_cpu(entry->nameidx) &&
 		    ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
-			name_rmt = xfs_attr_leaf_name_remote(leaf, i);
+			name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
 			if (name_rmt->valueblk) {
 				lp->valueblk = be32_to_cpu(name_rmt->valueblk);
-				lp->valuelen = XFS_B_TO_FSB(dp->i_mount,
+				lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
 						    be32_to_cpu(name_rmt->valuelen));
 				lp++;
 			}
@@ -2998,15 +3271,15 @@ xfs_attr_leaf_inactive(
 	 */
 	error = 0;
 	for (lp = list, i = 0; i < count; i++, lp++) {
-		tmp = xfs_attr_leaf_freextent(trans, dp,
+		tmp = xfs_attr3_leaf_freextent(trans, dp,
 				lp->valueblk, lp->valuelen);
 
 		if (error == 0)
 			error = tmp;	/* save only the 1st errno */
 	}
 
-	kmem_free((xfs_caddr_t)list);
-	return(error);
+	kmem_free(list);
+	return error;
 }
 
 /*
@@ -3014,14 +3287,20 @@ xfs_attr_leaf_inactive(
  * invalidate any buffers that are incore/in transactions.
  */
 STATIC int
-xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
-				    xfs_dablk_t blkno, int blkcnt)
+xfs_attr3_leaf_freextent(
+	struct xfs_trans	**trans,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		blkno,
+	int			blkcnt)
 {
-	xfs_bmbt_irec_t map;
-	xfs_dablk_t tblkno;
-	int tblkcnt, dblkcnt, nmap, error;
-	xfs_daddr_t dblkno;
-	xfs_buf_t *bp;
+	struct xfs_bmbt_irec	map;
+	struct xfs_buf		*bp;
+	xfs_dablk_t		tblkno;
+	xfs_daddr_t		dblkno;
+	int			tblkcnt;
+	int			dblkcnt;
+	int			nmap;
+	int			error;
 
 	/*
 	 * Roll through the "value", invalidating the attribute value's
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 77de139a58f..444a7704596 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -89,7 +90,7 @@ typedef struct xfs_attr_leaf_hdr {	/* constant-structure header block */
 
 typedef struct xfs_attr_leaf_entry {	/* sorted on key, not name */
 	__be32	hashval;		/* hash value of name */
- 	__be16	nameidx;		/* index into buffer of name/value */
+	__be16	nameidx;		/* index into buffer of name/value */
 	__u8	flags;			/* LOCAL/ROOT/SECURE/INCOMPLETE flag */
 	__u8	pad2;			/* unused pad byte */
 } xfs_attr_leaf_entry_t;
@@ -115,6 +116,55 @@ typedef struct xfs_attr_leafblock {
 } xfs_attr_leafblock_t;
 
 /*
+ * CRC enabled leaf structures. Called "version 3" structures to match the
+ * version number of the directory and dablk structures for this feature, and
+ * attr2 is already taken by the variable inode attribute fork size feature.
+ */
+struct xfs_attr3_leaf_hdr {
+	struct xfs_da3_blkinfo	info;
+	__be16			count;
+	__be16			usedbytes;
+	__be16			firstused;
+	__u8			holes;
+	__u8			pad1;
+	struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
+	__be32			pad2;		/* 64 bit alignment */
+};
+
+#define XFS_ATTR3_LEAF_CRC_OFF	(offsetof(struct xfs_attr3_leaf_hdr, info.crc))
+
+struct xfs_attr3_leafblock {
+	struct xfs_attr3_leaf_hdr	hdr;
+	struct xfs_attr_leaf_entry	entries[1];
+
+	/*
+	 * The rest of the block contains the following structures after the
+	 * leaf entries, growing from the bottom up. The variables are never
+	 * referenced, the locations accessed purely from helper functions.
+	 *
+	 * struct xfs_attr_leaf_name_local
+	 * struct xfs_attr_leaf_name_remote
+	 */
+};
+
+/*
+ * incore, neutral version of the attribute leaf header
+ */
+struct xfs_attr3_icleaf_hdr {
+	__uint32_t	forw;
+	__uint32_t	back;
+	__uint16_t	magic;
+	__uint16_t	count;
+	__uint16_t	usedbytes;
+	__uint16_t	firstused;
+	__u8		holes;
+	struct {
+		__uint16_t	base;
+		__uint16_t	size;
+	} freemap[XFS_ATTR_LEAF_MAPSIZE];
+};
+
+/*
  * Flags used in the leaf_entry[i].flags field.
  * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
  * on the system call, they are "or"ed together for various operations.
@@ -147,26 +197,43 @@ typedef struct xfs_attr_leafblock {
  */
 #define	XFS_ATTR_LEAF_NAME_ALIGN	((uint)sizeof(xfs_dablk_t))
 
+static inline int
+xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
+{
+	if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+		return sizeof(struct xfs_attr3_leaf_hdr);
+	return sizeof(struct xfs_attr_leaf_hdr);
+}
+
+static inline struct xfs_attr_leaf_entry *
+xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
+{
+	if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+		return &((struct xfs_attr3_leafblock *)leafp)->entries[0];
+	return &leafp->entries[0];
+}
+
 /*
  * Cast typed pointers for "local" and "remote" name/value structs.
  */
-static inline xfs_attr_leaf_name_remote_t *
-xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
+static inline char *
+xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
 {
-	return (xfs_attr_leaf_name_remote_t *)
-		&((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
+	struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
+
+	return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
 }
 
-static inline xfs_attr_leaf_name_local_t *
-xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
+static inline xfs_attr_leaf_name_remote_t *
+xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
 {
-	return (xfs_attr_leaf_name_local_t *)
-		&((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
+	return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx);
 }
 
-static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
+static inline xfs_attr_leaf_name_local_t *
+xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
 {
-	return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
+	return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx);
 }
 
 /*
@@ -221,37 +288,37 @@ int	xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
 /*
  * Internal routines when attribute fork size == XFS_LBSIZE(mp).
  */
-int	xfs_attr_leaf_to_node(struct xfs_da_args *args);
-int	xfs_attr_leaf_to_shortform(struct xfs_buf *bp,
+int	xfs_attr3_leaf_to_node(struct xfs_da_args *args);
+int	xfs_attr3_leaf_to_shortform(struct xfs_buf *bp,
 				   struct xfs_da_args *args, int forkoff);
-int	xfs_attr_leaf_clearflag(struct xfs_da_args *args);
-int	xfs_attr_leaf_setflag(struct xfs_da_args *args);
-int	xfs_attr_leaf_flipflags(xfs_da_args_t *args);
+int	xfs_attr3_leaf_clearflag(struct xfs_da_args *args);
+int	xfs_attr3_leaf_setflag(struct xfs_da_args *args);
+int	xfs_attr3_leaf_flipflags(struct xfs_da_args *args);
 
 /*
  * Routines used for growing the Btree.
  */
-int	xfs_attr_leaf_split(struct xfs_da_state *state,
+int	xfs_attr3_leaf_split(struct xfs_da_state *state,
 				   struct xfs_da_state_blk *oldblk,
 				   struct xfs_da_state_blk *newblk);
-int	xfs_attr_leaf_lookup_int(struct xfs_buf *leaf,
+int	xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf,
 					struct xfs_da_args *args);
-int	xfs_attr_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
-int	xfs_attr_leaf_add(struct xfs_buf *leaf_buffer,
+int	xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args);
+int	xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
 				 struct xfs_da_args *args);
-int	xfs_attr_leaf_remove(struct xfs_buf *leaf_buffer,
+int	xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
 				    struct xfs_da_args *args);
-int	xfs_attr_leaf_list_int(struct xfs_buf *bp,
+int	xfs_attr3_leaf_list_int(struct xfs_buf *bp,
 				      struct xfs_attr_list_context *context);
 
 /*
  * Routines used for shrinking the Btree.
  */
-int	xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval);
-void	xfs_attr_leaf_unbalance(struct xfs_da_state *state,
+int	xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void	xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
 				       struct xfs_da_state_blk *drop_blk,
 				       struct xfs_da_state_blk *save_blk);
-int	xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
+int	xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
 
 /*
  * Utility routines.
@@ -261,10 +328,12 @@ int	xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
 				   struct xfs_buf *leaf2_bp);
 int	xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
 					int *local);
-int	xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+int	xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
 			xfs_dablk_t bno, xfs_daddr_t mappedbno,
 			struct xfs_buf **bpp);
+void	xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
+				     struct xfs_attr_leafblock *from);
 
-extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
 
 #endif	/* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
new file mode 100644
index 00000000000..ef6b0c12452
--- /dev/null
+++ b/fs/xfs/xfs_attr_remote.c
@@ -0,0 +1,631 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+
+#define ATTR_RMTVALUE_MAPSIZE	1	/* # of map entries at once */
+
+/*
+ * Each contiguous block has a header, so it is not just a simple attribute
+ * length to FSB conversion.
+ */
+int
+xfs_attr3_rmt_blocks(
+	struct xfs_mount *mp,
+	int		attrlen)
+{
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+		return (attrlen + buflen - 1) / buflen;
+	}
+	return XFS_B_TO_FSB(mp, attrlen);
+}
+
+/*
+ * Checking of the remote attribute header is split into two parts. The verifier
+ * does CRC, location and bounds checking, the unpacking function checks the
+ * attribute parameters and owner.
+ */
+static bool
+xfs_attr3_rmt_hdr_ok(
+	struct xfs_mount	*mp,
+	void			*ptr,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	xfs_daddr_t		bno)
+{
+	struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+	if (bno != be64_to_cpu(rmt->rm_blkno))
+		return false;
+	if (offset != be32_to_cpu(rmt->rm_offset))
+		return false;
+	if (size != be32_to_cpu(rmt->rm_bytes))
+		return false;
+	if (ino != be64_to_cpu(rmt->rm_owner))
+		return false;
+
+	/* ok */
+	return true;
+}
+
+static bool
+xfs_attr3_rmt_verify(
+	struct xfs_mount	*mp,
+	void			*ptr,
+	int			fsbsize,
+	xfs_daddr_t		bno)
+{
+	struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return false;
+	if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+		return false;
+	if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	if (be64_to_cpu(rmt->rm_blkno) != bno)
+		return false;
+	if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
+		return false;
+	if (be32_to_cpu(rmt->rm_offset) +
+				be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX)
+		return false;
+	if (rmt->rm_owner == 0)
+		return false;
+
+	return true;
+}
+
+static void
+xfs_attr3_rmt_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	char		*ptr;
+	int		len;
+	bool		corrupt = false;
+	xfs_daddr_t	bno;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	ptr = bp->b_addr;
+	bno = bp->b_bn;
+	len = BBTOB(bp->b_length);
+	ASSERT(len >= XFS_LBSIZE(mp));
+
+	while (len > 0) {
+		if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp),
+				      XFS_ATTR3_RMT_CRC_OFF)) {
+			corrupt = true;
+			break;
+		}
+		if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
+			corrupt = true;
+			break;
+		}
+		len -= XFS_LBSIZE(mp);
+		ptr += XFS_LBSIZE(mp);
+		bno += mp->m_bsize;
+	}
+
+	if (corrupt) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	} else
+		ASSERT(len == 0);
+}
+
+static void
+xfs_attr3_rmt_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	char		*ptr;
+	int		len;
+	xfs_daddr_t	bno;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	ptr = bp->b_addr;
+	bno = bp->b_bn;
+	len = BBTOB(bp->b_length);
+	ASSERT(len >= XFS_LBSIZE(mp));
+
+	while (len > 0) {
+		if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
+			XFS_CORRUPTION_ERROR(__func__,
+					    XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			return;
+		}
+		if (bip) {
+			struct xfs_attr3_rmt_hdr *rmt;
+
+			rmt = (struct xfs_attr3_rmt_hdr *)ptr;
+			rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+		}
+		xfs_update_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF);
+
+		len -= XFS_LBSIZE(mp);
+		ptr += XFS_LBSIZE(mp);
+		bno += mp->m_bsize;
+	}
+	ASSERT(len == 0);
+}
+
+const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+	.verify_read = xfs_attr3_rmt_read_verify,
+	.verify_write = xfs_attr3_rmt_write_verify,
+};
+
+STATIC int
+xfs_attr3_rmt_hdr_set(
+	struct xfs_mount	*mp,
+	void			*ptr,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	xfs_daddr_t		bno)
+{
+	struct xfs_attr3_rmt_hdr *rmt = ptr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return 0;
+
+	rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
+	rmt->rm_offset = cpu_to_be32(offset);
+	rmt->rm_bytes = cpu_to_be32(size);
+	uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
+	rmt->rm_owner = cpu_to_be64(ino);
+	rmt->rm_blkno = cpu_to_be64(bno);
+
+	return sizeof(struct xfs_attr3_rmt_hdr);
+}
+
+/*
+ * Helper functions to copy attribute data in and out of the one disk extents
+ */
+STATIC int
+xfs_attr_rmtval_copyout(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp,
+	xfs_ino_t	ino,
+	int		*offset,
+	int		*valuelen,
+	char		**dst)
+{
+	char		*src = bp->b_addr;
+	xfs_daddr_t	bno = bp->b_bn;
+	int		len = BBTOB(bp->b_length);
+
+	ASSERT(len >= XFS_LBSIZE(mp));
+
+	while (len > 0 && *valuelen > 0) {
+		int hdr_size = 0;
+		int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
+
+		byte_cnt = min_t(int, *valuelen, byte_cnt);
+
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset,
+						  byte_cnt, bno)) {
+				xfs_alert(mp,
+"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
+					bno, *offset, byte_cnt, ino);
+				return EFSCORRUPTED;
+			}
+			hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
+		}
+
+		memcpy(*dst, src + hdr_size, byte_cnt);
+
+		/* roll buffer forwards */
+		len -= XFS_LBSIZE(mp);
+		src += XFS_LBSIZE(mp);
+		bno += mp->m_bsize;
+
+		/* roll attribute data forwards */
+		*valuelen -= byte_cnt;
+		*dst += byte_cnt;
+		*offset += byte_cnt;
+	}
+	return 0;
+}
+
+STATIC void
+xfs_attr_rmtval_copyin(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp,
+	xfs_ino_t	ino,
+	int		*offset,
+	int		*valuelen,
+	char		**src)
+{
+	char		*dst = bp->b_addr;
+	xfs_daddr_t	bno = bp->b_bn;
+	int		len = BBTOB(bp->b_length);
+
+	ASSERT(len >= XFS_LBSIZE(mp));
+
+	while (len > 0 && *valuelen > 0) {
+		int hdr_size;
+		int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
+
+		byte_cnt = min(*valuelen, byte_cnt);
+		hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
+						 byte_cnt, bno);
+
+		memcpy(dst + hdr_size, *src, byte_cnt);
+
+		/*
+		 * If this is the last block, zero the remainder of it.
+		 * Check that we are actually the last block, too.
+		 */
+		if (byte_cnt + hdr_size < XFS_LBSIZE(mp)) {
+			ASSERT(*valuelen - byte_cnt == 0);
+			ASSERT(len == XFS_LBSIZE(mp));
+			memset(dst + hdr_size + byte_cnt, 0,
+					XFS_LBSIZE(mp) - hdr_size - byte_cnt);
+		}
+
+		/* roll buffer forwards */
+		len -= XFS_LBSIZE(mp);
+		dst += XFS_LBSIZE(mp);
+		bno += mp->m_bsize;
+
+		/* roll attribute data forwards */
+		*valuelen -= byte_cnt;
+		*src += byte_cnt;
+		*offset += byte_cnt;
+	}
+}
+
+/*
+ * Read the value associated with an attribute from the out-of-line buffer
+ * that we stored it in.
+ */
+int
+xfs_attr_rmtval_get(
+	struct xfs_da_args	*args)
+{
+	struct xfs_bmbt_irec	map[ATTR_RMTVALUE_MAPSIZE];
+	struct xfs_mount	*mp = args->dp->i_mount;
+	struct xfs_buf		*bp;
+	xfs_dablk_t		lblkno = args->rmtblkno;
+	char			*dst = args->value;
+	int			valuelen = args->valuelen;
+	int			nmap;
+	int			error;
+	int			blkcnt = args->rmtblkcnt;
+	int			i;
+	int			offset = 0;
+
+	trace_xfs_attr_rmtval_get(args);
+
+	ASSERT(!(args->flags & ATTR_KERNOVAL));
+
+	while (valuelen > 0) {
+		nmap = ATTR_RMTVALUE_MAPSIZE;
+		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+				       blkcnt, map, &nmap,
+				       XFS_BMAPI_ATTRFORK);
+		if (error)
+			return error;
+		ASSERT(nmap >= 1);
+
+		for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+			xfs_daddr_t	dblkno;
+			int		dblkcnt;
+
+			ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+			       (map[i].br_startblock != HOLESTARTBLOCK));
+			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+			dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+						   dblkno, dblkcnt, 0, &bp,
+						   &xfs_attr3_rmt_buf_ops);
+			if (error)
+				return error;
+
+			error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+							&offset, &valuelen,
+							&dst);
+			xfs_buf_relse(bp);
+			if (error)
+				return error;
+
+			/* roll attribute extent map forwards */
+			lblkno += map[i].br_blockcount;
+			blkcnt -= map[i].br_blockcount;
+		}
+	}
+	ASSERT(valuelen == 0);
+	return 0;
+}
+
+/*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ */
+int
+xfs_attr_rmtval_set(
+	struct xfs_da_args	*args)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_bmbt_irec	map;
+	xfs_dablk_t		lblkno;
+	xfs_fileoff_t		lfileoff = 0;
+	char			*src = args->value;
+	int			blkcnt;
+	int			valuelen;
+	int			nmap;
+	int			error;
+	int			offset = 0;
+
+	trace_xfs_attr_rmtval_set(args);
+
+	/*
+	 * Find a "hole" in the attribute address space large enough for
+	 * us to drop the new attribute's value into. Because CRC enable
+	 * attributes have headers, we can't just do a straight byte to FSB
+	 * conversion and have to take the header space into account.
+	 */
+	blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+	error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
+						   XFS_ATTR_FORK);
+	if (error)
+		return error;
+
+	args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+	args->rmtblkcnt = blkcnt;
+
+	/*
+	 * Roll through the "value", allocating blocks on disk as required.
+	 */
+	while (blkcnt > 0) {
+		int	committed;
+
+		/*
+		 * Allocate a single extent, up to the size of the value.
+		 */
+		xfs_bmap_init(args->flist, args->firstblock);
+		nmap = 1;
+		error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
+				  blkcnt,
+				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+				  args->firstblock, args->total, &map, &nmap,
+				  args->flist);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return(error);
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, dp, 0);
+
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+
+		/*
+		 * Start the next trans in the chain.
+		 */
+		error = xfs_trans_roll(&args->trans, dp);
+		if (error)
+			return (error);
+	}
+
+	/*
+	 * Roll through the "value", copying the attribute value to the
+	 * already-allocated blocks.  Blocks are written synchronously
+	 * so that we can know they are all on disk before we turn off
+	 * the INCOMPLETE flag.
+	 */
+	lblkno = args->rmtblkno;
+	blkcnt = args->rmtblkcnt;
+	valuelen = args->valuelen;
+	while (valuelen > 0) {
+		struct xfs_buf	*bp;
+		xfs_daddr_t	dblkno;
+		int		dblkcnt;
+
+		ASSERT(blkcnt > 0);
+
+		xfs_bmap_init(args->flist, args->firstblock);
+		nmap = 1;
+		error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
+				       blkcnt, &map, &nmap,
+				       XFS_BMAPI_ATTRFORK);
+		if (error)
+			return(error);
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+
+		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+		dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
+		if (!bp)
+			return ENOMEM;
+		bp->b_ops = &xfs_attr3_rmt_buf_ops;
+
+		xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
+				       &valuelen, &src);
+
+		error = xfs_bwrite(bp);	/* GROT: NOTE: synchronous write */
+		xfs_buf_relse(bp);
+		if (error)
+			return error;
+
+
+		/* roll attribute extent map forwards */
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+	}
+	ASSERT(valuelen == 0);
+	return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ */
+int
+xfs_attr_rmtval_remove(
+	struct xfs_da_args	*args)
+{
+	struct xfs_mount	*mp = args->dp->i_mount;
+	xfs_dablk_t		lblkno;
+	int			blkcnt;
+	int			error;
+	int			done;
+
+	trace_xfs_attr_rmtval_remove(args);
+
+	/*
+	 * Roll through the "value", invalidating the attribute value's blocks.
+	 * Note that args->rmtblkcnt is the minimum number of data blocks we'll
+	 * see for a CRC enabled remote attribute. Each extent will have a
+	 * header, and so we may have more blocks than we realise here.  If we
+	 * fail to map the blocks correctly, we'll have problems with the buffer
+	 * lookups.
+	 */
+	lblkno = args->rmtblkno;
+	blkcnt = args->rmtblkcnt;
+	while (blkcnt > 0) {
+		struct xfs_bmbt_irec	map;
+		struct xfs_buf		*bp;
+		xfs_daddr_t		dblkno;
+		int			dblkcnt;
+		int			nmap;
+
+		/*
+		 * Try to remember where we decided to put the value.
+		 */
+		nmap = 1;
+		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+				       blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
+		if (error)
+			return(error);
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+
+		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+		dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+		/*
+		 * If the "remote" value is in the cache, remove it.
+		 */
+		bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
+		if (bp) {
+			xfs_buf_stale(bp);
+			xfs_buf_relse(bp);
+			bp = NULL;
+		}
+
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+	}
+
+	/*
+	 * Keep de-allocating extents until the remote-value region is gone.
+	 */
+	lblkno = args->rmtblkno;
+	blkcnt = args->rmtblkcnt;
+	done = 0;
+	while (!done) {
+		int committed;
+
+		xfs_bmap_init(args->flist, args->firstblock);
+		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
+				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+				    1, args->firstblock, args->flist,
+				    &done);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						&committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return error;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed)
+			xfs_trans_ijoin(args->trans, args->dp, 0);
+
+		/*
+		 * Close out trans and start the next one in the chain.
+		 */
+		error = xfs_trans_roll(&args->trans, args->dp);
+		if (error)
+			return (error);
+	}
+	return(0);
+}
+
diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h
new file mode 100644
index 00000000000..92a8fd7977c
--- /dev/null
+++ b/fs/xfs/xfs_attr_remote.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_ATTR_REMOTE_H__
+#define	__XFS_ATTR_REMOTE_H__
+
+#define XFS_ATTR3_RMT_MAGIC	0x5841524d	/* XARM */
+
+/*
+ * There is one of these headers per filesystem block in a remote attribute.
+ * This is done to ensure there is a 1:1 mapping between the attribute value
+ * length and the number of blocks needed to store the attribute. This makes the
+ * verification of a buffer a little more complex, but greatly simplifies the
+ * allocation, reading and writing of these attributes as we don't have to guess
+ * the number of blocks needed to store the attribute data.
+ */
+struct xfs_attr3_rmt_hdr {
+	__be32	rm_magic;
+	__be32	rm_offset;
+	__be32	rm_bytes;
+	__be32	rm_crc;
+	uuid_t	rm_uuid;
+	__be64	rm_owner;
+	__be64	rm_blkno;
+	__be64	rm_lsn;
+};
+
+#define XFS_ATTR3_RMT_CRC_OFF	offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
+
+#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize)	\
+	((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+			sizeof(struct xfs_attr3_rmt_hdr) : 0))
+
+extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
+
+int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
+
+int xfs_attr_rmtval_get(struct xfs_da_args *args);
+int xfs_attr_rmtval_set(struct xfs_da_args *args);
+int xfs_attr_rmtval_remove(struct xfs_da_args *args);
+
+#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index cdb2d334858..89042848f9e 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -25,6 +25,7 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_dir2.h"
+#include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
@@ -47,177 +48,78 @@
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
+#include "xfs_symlink.h"
 
 
 kmem_zone_t		*xfs_bmap_free_item_zone;
 
 /*
- * Prototypes for internal bmap routines.
- */
-
-#ifdef DEBUG
-STATIC void
-xfs_bmap_check_leaf_extents(
-	struct xfs_btree_cur	*cur,
-	struct xfs_inode	*ip,
-	int			whichfork);
-#else
-#define xfs_bmap_check_leaf_extents(cur, ip, whichfork)		do { } while (0)
-#endif
-
-
-/*
- * Called from xfs_bmap_add_attrfork to handle extents format files.
- */
-STATIC int					/* error */
-xfs_bmap_add_attrfork_extents(
-	xfs_trans_t		*tp,		/* transaction pointer */
-	xfs_inode_t		*ip,		/* incore inode pointer */
-	xfs_fsblock_t		*firstblock,	/* first block allocated */
-	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
-	int			*flags);	/* inode logging flags */
-
-/*
- * Called from xfs_bmap_add_attrfork to handle local format files.
- */
-STATIC int					/* error */
-xfs_bmap_add_attrfork_local(
-	xfs_trans_t		*tp,		/* transaction pointer */
-	xfs_inode_t		*ip,		/* incore inode pointer */
-	xfs_fsblock_t		*firstblock,	/* first block allocated */
-	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
-	int			*flags);	/* inode logging flags */
-
-/*
- * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
- * It figures out where to ask the underlying allocator to put the new extent.
- */
-STATIC int				/* error */
-xfs_bmap_alloc(
-	xfs_bmalloca_t		*ap);	/* bmap alloc argument struct */
-
-/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
- */
-STATIC int				/* error */
-xfs_bmap_btree_to_extents(
-	xfs_trans_t		*tp,	/* transaction pointer */
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			*logflagsp, /* inode logging flags */
-	int			whichfork); /* data or attr fork */
-
-/*
- * Remove the entry "free" from the free item list.  Prev points to the
- * previous entry, unless "free" is the head of the list.
- */
-STATIC void
-xfs_bmap_del_free(
-	xfs_bmap_free_t		*flist,	/* free item list header */
-	xfs_bmap_free_item_t	*prev,	/* previous item on list, if any */
-	xfs_bmap_free_item_t	*free);	/* list item to be freed */
-
-/*
- * Convert an extents-format file into a btree-format file.
- * The new file will have a root block (in the inode) and a single child block.
- */
-STATIC int					/* error */
-xfs_bmap_extents_to_btree(
-	xfs_trans_t		*tp,		/* transaction pointer */
-	xfs_inode_t		*ip,		/* incore inode pointer */
-	xfs_fsblock_t		*firstblock,	/* first-block-allocated */
-	xfs_bmap_free_t		*flist,		/* blocks freed in xaction */
-	xfs_btree_cur_t		**curp,		/* cursor returned to caller */
-	int			wasdel,		/* converting a delayed alloc */
-	int			*logflagsp,	/* inode logging flags */
-	int			whichfork);	/* data or attr fork */
-
-/*
- * Convert a local file to an extents file.
- * This code is sort of bogus, since the file data needs to get
- * logged so it won't be lost.  The bmap-level manipulations are ok, though.
- */
-STATIC int				/* error */
-xfs_bmap_local_to_extents(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */
-	xfs_extlen_t	total,		/* total blocks needed by transaction */
-	int		*logflagsp,	/* inode logging flags */
-	int		whichfork);	/* data or attr fork */
-
-/*
- * Search the extents list for the inode, for the extent containing bno.
- * If bno lies in a hole, point to the next entry.  If bno lies past eof,
- * *eofp will be set, and *prevp will contain the last entry (null if none).
- * Else, *lastxp will be set to the index of the found
- * entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t *		/* pointer to found extent entry */
-xfs_bmap_search_extents(
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_fileoff_t	bno,		/* block number searched for */
-	int		whichfork,	/* data or attr fork */
-	int		*eofp,		/* out: end of file found */
-	xfs_extnum_t	*lastxp,	/* out: last extent index */
-	xfs_bmbt_irec_t	*gotp,		/* out: extent entry found */
-	xfs_bmbt_irec_t	*prevp);	/* out: previous extent entry found */
-
-/*
- * Compute the worst-case number of indirect blocks that will be used
- * for ip's delayed extent of length "len".
+ * Miscellaneous helper functions
  */
-STATIC xfs_filblks_t
-xfs_bmap_worst_indlen(
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_filblks_t		len);	/* delayed extent length */
 
-#ifdef DEBUG
 /*
- * Perform various validation checks on the values being returned
- * from xfs_bmapi().
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem.  Done once, during mount.
  */
-STATIC void
-xfs_bmap_validate_ret(
-	xfs_fileoff_t		bno,
-	xfs_filblks_t		len,
-	int			flags,
-	xfs_bmbt_irec_t		*mval,
-	int			nmap,
-	int			ret_nmap);
-#else
-#define	xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
-#endif /* DEBUG */
-
-STATIC int
-xfs_bmap_count_tree(
-	xfs_mount_t     *mp,
-	xfs_trans_t     *tp,
-	xfs_ifork_t	*ifp,
-	xfs_fsblock_t   blockno,
-	int             levelin,
-	int		*count);
-
-STATIC void
-xfs_bmap_count_leaves(
-	xfs_ifork_t		*ifp,
-	xfs_extnum_t		idx,
-	int			numrecs,
-	int			*count);
+void
+xfs_bmap_compute_maxlevels(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	int		whichfork)	/* data or attr fork */
+{
+	int		level;		/* btree level */
+	uint		maxblocks;	/* max blocks at this level */
+	uint		maxleafents;	/* max leaf entries possible */
+	int		maxrootrecs;	/* max records in root block */
+	int		minleafrecs;	/* min records in leaf block */
+	int		minnoderecs;	/* min records in node block */
+	int		sz;		/* root block size */
 
-STATIC void
-xfs_bmap_disk_count_leaves(
-	struct xfs_mount	*mp,
-	struct xfs_btree_block	*block,
-	int			numrecs,
-	int			*count);
+	/*
+	 * The maximum number of extents in a file, hence the maximum
+	 * number of leaf entries, is controlled by the type of di_nextents
+	 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
+	 * (a signed 16-bit number, xfs_aextnum_t).
+	 *
+	 * Note that we can no longer assume that if we are in ATTR1 that
+	 * the fork offset of all the inodes will be
+	 * (xfs_default_attroffset(ip) >> 3) because we could have mounted
+	 * with ATTR2 and then mounted back with ATTR1, keeping the
+	 * di_forkoff's fixed but probably at various positions. Therefore,
+	 * for both ATTR1 and ATTR2 we have to assume the worst case scenario
+	 * of a minimum size available.
+	 */
+	if (whichfork == XFS_DATA_FORK) {
+		maxleafents = MAXEXTNUM;
+		sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+	} else {
+		maxleafents = MAXAEXTNUM;
+		sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	}
+	maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
+	minleafrecs = mp->m_bmap_dmnr[0];
+	minnoderecs = mp->m_bmap_dmnr[1];
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	for (level = 1; maxblocks > 1; level++) {
+		if (maxblocks <= maxrootrecs)
+			maxblocks = 1;
+		else
+			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+	}
+	mp->m_bm_maxlevels[whichfork] = level;
+}
 
 /*
- * Bmap internal routines.
+ * Convert the given file system block to a disk block.  We have to treat it
+ * differently based on whether the file is a real time file or not, because the
+ * bmap code does.
  */
+xfs_daddr_t
+xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
+{
+	return (XFS_IS_REALTIME_INODE(ip) ? \
+		 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
+		 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
+}
 
 STATIC int				/* error */
 xfs_bmbt_lookup_eq(
@@ -287,6 +189,1070 @@ xfs_bmbt_update(
 }
 
 /*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_filblks_t	len)		/* delayed extent length */
+{
+	int		level;		/* btree level number */
+	int		maxrecs;	/* maximum record count at this level */
+	xfs_mount_t	*mp;		/* mount structure */
+	xfs_filblks_t	rval;		/* return value */
+
+	mp = ip->i_mount;
+	maxrecs = mp->m_bmap_dmxr[0];
+	for (level = 0, rval = 0;
+	     level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
+	     level++) {
+		len += maxrecs - 1;
+		do_div(len, maxrecs);
+		rval += len;
+		if (len == 1)
+			return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+				level - 1;
+		if (level == 0)
+			maxrecs = mp->m_bmap_dmxr[1];
+	}
+	return rval;
+}
+
+/*
+ * Calculate the default attribute fork offset for newly created inodes.
+ */
+uint
+xfs_default_attroffset(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			offset;
+
+	if (mp->m_sb.sb_inodesize == 256) {
+		offset = XFS_LITINO(mp, ip->i_d.di_version) -
+				XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	} else {
+		offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+	}
+
+	ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
+	return offset;
+}
+
+/*
+ * Helper routine to reset inode di_forkoff field when switching
+ * attribute fork from local to extent format - we reset it where
+ * possible to make space available for inline data fork extents.
+ */
+STATIC void
+xfs_bmap_forkoff_reset(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip,
+	int		whichfork)
+{
+	if (whichfork == XFS_ATTR_FORK &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
+		uint	dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+
+		if (dfl_forkoff > ip->i_d.di_forkoff)
+			ip->i_d.di_forkoff = dfl_forkoff;
+	}
+}
+
+/*
+ * Extent tree block counting routines.
+ */
+
+/*
+ * Count leaf blocks given a range of extent records.
+ */
+STATIC void
+xfs_bmap_count_leaves(
+	xfs_ifork_t		*ifp,
+	xfs_extnum_t		idx,
+	int			numrecs,
+	int			*count)
+{
+	int		b;
+
+	for (b = 0; b < numrecs; b++) {
+		xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
+		*count += xfs_bmbt_get_blockcount(frp);
+	}
+}
+
+/*
+ * Count leaf blocks given a range of extent records originally
+ * in btree format.
+ */
+STATIC void
+xfs_bmap_disk_count_leaves(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	int			numrecs,
+	int			*count)
+{
+	int		b;
+	xfs_bmbt_rec_t	*frp;
+
+	for (b = 1; b <= numrecs; b++) {
+		frp = XFS_BMBT_REC_ADDR(mp, block, b);
+		*count += xfs_bmbt_disk_get_blockcount(frp);
+	}
+}
+
+/*
+ * Recursively walks each level of a btree
+ * to count total fsblocks is use.
+ */
+STATIC int                                     /* error */
+xfs_bmap_count_tree(
+	xfs_mount_t     *mp,            /* file system mount point */
+	xfs_trans_t     *tp,            /* transaction pointer */
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_fsblock_t   blockno,	/* file system block number */
+	int             levelin,	/* level in btree */
+	int		*count)		/* Count of blocks */
+{
+	int			error;
+	xfs_buf_t		*bp, *nbp;
+	int			level = levelin;
+	__be64			*pp;
+	xfs_fsblock_t           bno = blockno;
+	xfs_fsblock_t		nextbno;
+	struct xfs_btree_block	*block, *nextblock;
+	int			numrecs;
+
+	error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+	if (error)
+		return error;
+	*count += 1;
+	block = XFS_BUF_TO_BLOCK(bp);
+
+	if (--level) {
+		/* Not at node above leaves, count this level of nodes */
+		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+		while (nextbno != NULLFSBLOCK) {
+			error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error)
+				return error;
+			*count += 1;
+			nextblock = XFS_BUF_TO_BLOCK(nbp);
+			nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
+			xfs_trans_brelse(tp, nbp);
+		}
+
+		/* Dive to the next level */
+		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		bno = be64_to_cpu(*pp);
+		if (unlikely((error =
+		     xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
+			xfs_trans_brelse(tp, bp);
+			XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
+					 XFS_ERRLEVEL_LOW, mp);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		xfs_trans_brelse(tp, bp);
+	} else {
+		/* count all level 1 nodes and their leaves */
+		for (;;) {
+			nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+			numrecs = be16_to_cpu(block->bb_numrecs);
+			xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
+			xfs_trans_brelse(tp, bp);
+			if (nextbno == NULLFSBLOCK)
+				break;
+			bno = nextbno;
+			error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error)
+				return error;
+			*count += 1;
+			block = XFS_BUF_TO_BLOCK(bp);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Count fsblocks of the given fork.
+ */
+int						/* error */
+xfs_bmap_count_blocks(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode */
+	int			whichfork,	/* data or attr fork */
+	int			*count)		/* out: count of blocks */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	__be64			*pp;	/* pointer to block address */
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
+		xfs_bmap_count_leaves(ifp, 0,
+			ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
+			count);
+		return 0;
+	}
+
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	block = ifp->if_broot;
+	level = be16_to_cpu(block->bb_level);
+	ASSERT(level > 0);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+	bno = be64_to_cpu(*pp);
+	ASSERT(bno != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+	if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
+		XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
+				 mp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	return 0;
+}
+
+/*
+ * Debug/sanity checking code
+ */
+
+STATIC int
+xfs_bmap_sanity_check(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	int			level)
+{
+	struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+
+	if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
+	    block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
+		return 0;
+
+	if (be16_to_cpu(block->bb_level) != level ||
+	    be16_to_cpu(block->bb_numrecs) == 0 ||
+	    be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+		return 0;
+
+	return 1;
+}
+
+#ifdef DEBUG
+STATIC struct xfs_buf *
+xfs_bmap_get_bp(
+	struct xfs_btree_cur	*cur,
+	xfs_fsblock_t		bno)
+{
+	struct xfs_log_item_desc *lidp;
+	int			i;
+
+	if (!cur)
+		return NULL;
+
+	for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
+		if (!cur->bc_bufs[i])
+			break;
+		if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
+			return cur->bc_bufs[i];
+	}
+
+	/* Chase down all the log items to see if the bp is there */
+	list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
+		struct xfs_buf_log_item	*bip;
+		bip = (struct xfs_buf_log_item *)lidp->lid_item;
+		if (bip->bli_item.li_type == XFS_LI_BUF &&
+		    XFS_BUF_ADDR(bip->bli_buf) == bno)
+			return bip->bli_buf;
+	}
+
+	return NULL;
+}
+
+STATIC void
+xfs_check_block(
+	struct xfs_btree_block	*block,
+	xfs_mount_t		*mp,
+	int			root,
+	short			sz)
+{
+	int			i, j, dmxr;
+	__be64			*pp, *thispa;	/* pointer to block address */
+	xfs_bmbt_key_t		*prevp, *keyp;
+
+	ASSERT(be16_to_cpu(block->bb_level) > 0);
+
+	prevp = NULL;
+	for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
+		dmxr = mp->m_bmap_dmxr[0];
+		keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
+
+		if (prevp) {
+			ASSERT(be64_to_cpu(prevp->br_startoff) <
+			       be64_to_cpu(keyp->br_startoff));
+		}
+		prevp = keyp;
+
+		/*
+		 * Compare the block numbers to see if there are dups.
+		 */
+		if (root)
+			pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+		else
+			pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
+
+		for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
+			if (root)
+				thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+			else
+				thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
+			if (*thispa == *pp) {
+				xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
+					__func__, j, i,
+					(unsigned long long)be64_to_cpu(*thispa));
+				panic("%s: ptrs are equal in node\n",
+					__func__);
+			}
+		}
+	}
+}
+
+/*
+ * Check that the extents for the inode ip are in the right order in all
+ * btree leaves.
+ */
+
+STATIC void
+xfs_bmap_check_leaf_extents(
+	xfs_btree_cur_t		*cur,	/* btree cursor or null */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			whichfork)	/* data or attr fork */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_buf_t		*bp;	/* buffer for "block" */
+	int			error;	/* error return value */
+	xfs_extnum_t		i=0, j;	/* index into the extents list */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	__be64			*pp;	/* pointer to block address */
+	xfs_bmbt_rec_t		*ep;	/* pointer to current extent */
+	xfs_bmbt_rec_t		last = {0, 0}; /* last extent in prev block */
+	xfs_bmbt_rec_t		*nextp;	/* pointer to next extent */
+	int			bp_release = 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+		return;
+	}
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	block = ifp->if_broot;
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	ASSERT(level > 0);
+	xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+	bno = be64_to_cpu(*pp);
+
+	ASSERT(bno != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+	/*
+	 * Go down the tree until leaf level is reached, following the first
+	 * pointer (leftmost) at each level.
+	 */
+	while (level-- > 0) {
+		/* See if buf is in cur first */
+		bp_release = 0;
+		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+		if (!bp) {
+			bp_release = 1;
+			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error)
+				goto error_norelse;
+		}
+		block = XFS_BUF_TO_BLOCK(bp);
+		XFS_WANT_CORRUPTED_GOTO(
+			xfs_bmap_sanity_check(mp, bp, level),
+			error0);
+		if (level == 0)
+			break;
+
+		/*
+		 * Check this block for basic sanity (increasing keys and
+		 * no duplicate blocks).
+		 */
+
+		xfs_check_block(block, mp, 0, 0);
+		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		bno = be64_to_cpu(*pp);
+		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+		if (bp_release) {
+			bp_release = 0;
+			xfs_trans_brelse(NULL, bp);
+		}
+	}
+
+	/*
+	 * Here with bp and block set to the leftmost leaf node in the tree.
+	 */
+	i = 0;
+
+	/*
+	 * Loop over all leaf nodes checking that all extents are in the right order.
+	 */
+	for (;;) {
+		xfs_fsblock_t	nextbno;
+		xfs_extnum_t	num_recs;
+
+
+		num_recs = xfs_btree_get_numrecs(block);
+
+		/*
+		 * Read-ahead the next leaf block, if any.
+		 */
+
+		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+		/*
+		 * Check all the extents to make sure they are OK.
+		 * If we had a previous block, the last entry should
+		 * conform with the first entry in this one.
+		 */
+
+		ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+		if (i) {
+			ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+			       xfs_bmbt_disk_get_blockcount(&last) <=
+			       xfs_bmbt_disk_get_startoff(ep));
+		}
+		for (j = 1; j < num_recs; j++) {
+			nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+			ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+			       xfs_bmbt_disk_get_blockcount(ep) <=
+			       xfs_bmbt_disk_get_startoff(nextp));
+			ep = nextp;
+		}
+
+		last = *ep;
+		i += num_recs;
+		if (bp_release) {
+			bp_release = 0;
+			xfs_trans_brelse(NULL, bp);
+		}
+		bno = nextbno;
+		/*
+		 * If we've reached the end, stop.
+		 */
+		if (bno == NULLFSBLOCK)
+			break;
+
+		bp_release = 0;
+		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+		if (!bp) {
+			bp_release = 1;
+			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error)
+				goto error_norelse;
+		}
+		block = XFS_BUF_TO_BLOCK(bp);
+	}
+	if (bp_release) {
+		bp_release = 0;
+		xfs_trans_brelse(NULL, bp);
+	}
+	return;
+
+error0:
+	xfs_warn(mp, "%s: at error0", __func__);
+	if (bp_release)
+		xfs_trans_brelse(NULL, bp);
+error_norelse:
+	xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+		__func__, i);
+	panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
+	return;
+}
+
+/*
+ * Add bmap trace insert entries for all the contents of the extent records.
+ */
+void
+xfs_bmap_trace_exlist(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	cnt,		/* count of entries in the list */
+	int		whichfork,	/* data or attr fork */
+	unsigned long	caller_ip)
+{
+	xfs_extnum_t	idx;		/* extent record index */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	int		state = 0;
+
+	if (whichfork == XFS_ATTR_FORK)
+		state |= BMAP_ATTRFORK;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+	for (idx = 0; idx < cnt; idx++)
+		trace_xfs_extlist(ip, idx, whichfork, caller_ip);
+}
+
+/*
+ * Validate that the bmbt_irecs being returned from bmapi are valid
+ * given the callers original parameters.  Specifically check the
+ * ranges of the returned irecs to ensure that they only extent beyond
+ * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
+ */
+STATIC void
+xfs_bmap_validate_ret(
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	int			flags,
+	xfs_bmbt_irec_t		*mval,
+	int			nmap,
+	int			ret_nmap)
+{
+	int			i;		/* index to map values */
+
+	ASSERT(ret_nmap <= nmap);
+
+	for (i = 0; i < ret_nmap; i++) {
+		ASSERT(mval[i].br_blockcount > 0);
+		if (!(flags & XFS_BMAPI_ENTIRE)) {
+			ASSERT(mval[i].br_startoff >= bno);
+			ASSERT(mval[i].br_blockcount <= len);
+			ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
+			       bno + len);
+		} else {
+			ASSERT(mval[i].br_startoff < bno + len);
+			ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
+			       bno);
+		}
+		ASSERT(i == 0 ||
+		       mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
+		       mval[i].br_startoff);
+		ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
+		       mval[i].br_startblock != HOLESTARTBLOCK);
+		ASSERT(mval[i].br_state == XFS_EXT_NORM ||
+		       mval[i].br_state == XFS_EXT_UNWRITTEN);
+	}
+}
+
+#else
+#define xfs_bmap_check_leaf_extents(cur, ip, whichfork)		do { } while (0)
+#define	xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
+#endif /* DEBUG */
+
+/*
+ * bmap free list manipulation functions
+ */
+
+/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+xfs_bmap_add_free(
+	xfs_fsblock_t		bno,		/* fs block number of extent */
+	xfs_filblks_t		len,		/* length of extent */
+	xfs_bmap_free_t		*flist,		/* list of extents */
+	xfs_mount_t		*mp)		/* mount point structure */
+{
+	xfs_bmap_free_item_t	*cur;		/* current (next) element */
+	xfs_bmap_free_item_t	*new;		/* new element */
+	xfs_bmap_free_item_t	*prev;		/* previous element */
+#ifdef DEBUG
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+
+	ASSERT(bno != NULLFSBLOCK);
+	ASSERT(len > 0);
+	ASSERT(len <= MAXEXTLEN);
+	ASSERT(!isnullstartblock(bno));
+	agno = XFS_FSB_TO_AGNO(mp, bno);
+	agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	ASSERT(agno < mp->m_sb.sb_agcount);
+	ASSERT(agbno < mp->m_sb.sb_agblocks);
+	ASSERT(len < mp->m_sb.sb_agblocks);
+	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+	ASSERT(xfs_bmap_free_item_zone != NULL);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new->xbfi_startblock = bno;
+	new->xbfi_blockcount = (xfs_extlen_t)len;
+	for (prev = NULL, cur = flist->xbf_first;
+	     cur != NULL;
+	     prev = cur, cur = cur->xbfi_next) {
+		if (cur->xbfi_startblock >= bno)
+			break;
+	}
+	if (prev)
+		prev->xbfi_next = new;
+	else
+		flist->xbf_first = new;
+	new->xbfi_next = cur;
+	flist->xbf_count++;
+}
+
+/*
+ * Remove the entry "free" from the free item list.  Prev points to the
+ * previous entry, unless "free" is the head of the list.
+ */
+STATIC void
+xfs_bmap_del_free(
+	xfs_bmap_free_t		*flist,	/* free item list header */
+	xfs_bmap_free_item_t	*prev,	/* previous item on list, if any */
+	xfs_bmap_free_item_t	*free)	/* list item to be freed */
+{
+	if (prev)
+		prev->xbfi_next = free->xbfi_next;
+	else
+		flist->xbf_first = free->xbfi_next;
+	flist->xbf_count--;
+	kmem_zone_free(xfs_bmap_free_item_zone, free);
+}
+
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller.  Frees all the extents that need freeing, which must be done
+ * last due to locking considerations.  We never free any extents in
+ * the first transaction.
+ *
+ * Return 1 if the given transaction was committed and a new one
+ * started, and 0 otherwise in the committed parameter.
+ */
+int						/* error */
+xfs_bmap_finish(
+	xfs_trans_t		**tp,		/* transaction pointer addr */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	int			*committed)	/* xact committed or not */
+{
+	xfs_efd_log_item_t	*efd;		/* extent free data */
+	xfs_efi_log_item_t	*efi;		/* extent free intention */
+	int			error;		/* error return value */
+	xfs_bmap_free_item_t	*free;		/* free extent item */
+	unsigned int		logres;		/* new log reservation */
+	unsigned int		logcount;	/* new log count */
+	xfs_mount_t		*mp;		/* filesystem mount structure */
+	xfs_bmap_free_item_t	*next;		/* next item on free list */
+	xfs_trans_t		*ntp;		/* new transaction pointer */
+
+	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+	if (flist->xbf_count == 0) {
+		*committed = 0;
+		return 0;
+	}
+	ntp = *tp;
+	efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+	for (free = flist->xbf_first; free; free = free->xbfi_next)
+		xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+			free->xbfi_blockcount);
+	logres = ntp->t_log_res;
+	logcount = ntp->t_log_count;
+	ntp = xfs_trans_dup(*tp);
+	error = xfs_trans_commit(*tp, 0);
+	*tp = ntp;
+	*committed = 1;
+	/*
+	 * We have a new transaction, so we should return committed=1,
+	 * even though we're returning an error.
+	 */
+	if (error)
+		return error;
+
+	/*
+	 * transaction commit worked ok so we can drop the extra ticket
+	 * reference that we gained in xfs_trans_dup()
+	 */
+	xfs_log_ticket_put(ntp->t_ticket);
+
+	if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
+			logcount)))
+		return error;
+	efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+	for (free = flist->xbf_first; free != NULL; free = next) {
+		next = free->xbfi_next;
+		if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+				free->xbfi_blockcount))) {
+			/*
+			 * The bmap free list will be cleaned up at a
+			 * higher level.  The EFI will be canceled when
+			 * this transaction is aborted.
+			 * Need to force shutdown here to make sure it
+			 * happens, since this transaction may not be
+			 * dirty yet.
+			 */
+			mp = ntp->t_mountp;
+			if (!XFS_FORCED_SHUTDOWN(mp))
+				xfs_force_shutdown(mp,
+						   (error == EFSCORRUPTED) ?
+						   SHUTDOWN_CORRUPT_INCORE :
+						   SHUTDOWN_META_IO_ERROR);
+			return error;
+		}
+		xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+			free->xbfi_blockcount);
+		xfs_bmap_del_free(flist, NULL, free);
+	}
+	return 0;
+}
+
+/*
+ * Free up any items left in the list.
+ */
+void
+xfs_bmap_cancel(
+	xfs_bmap_free_t		*flist)	/* list of bmap_free_items */
+{
+	xfs_bmap_free_item_t	*free;	/* free list item */
+	xfs_bmap_free_item_t	*next;
+
+	if (flist->xbf_count == 0)
+		return;
+	ASSERT(flist->xbf_first != NULL);
+	for (free = flist->xbf_first; free; free = next) {
+		next = free->xbfi_next;
+		xfs_bmap_del_free(flist, NULL, free);
+	}
+	ASSERT(flist->xbf_count == 0);
+}
+
+/*
+ * Inode fork format manipulation functions
+ */
+
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the file extents are already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ */
+STATIC int				/* error */
+xfs_bmap_btree_to_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork)  /* data or attr fork */
+{
+	/* REFERENCED */
+	struct xfs_btree_block	*cblock;/* child btree block */
+	xfs_fsblock_t		cbno;	/* child block number */
+	xfs_buf_t		*cbp;	/* child block's buffer */
+	int			error;	/* error return value */
+	xfs_ifork_t		*ifp;	/* inode fork data */
+	xfs_mount_t		*mp;	/* mount point structure */
+	__be64			*pp;	/* ptr to block address */
+	struct xfs_btree_block	*rblock;/* root btree block */
+
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+	rblock = ifp->if_broot;
+	ASSERT(be16_to_cpu(rblock->bb_level) == 1);
+	ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
+	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+	cbno = be64_to_cpu(*pp);
+	*logflagsp = 0;
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
+		return error;
+#endif
+	error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+				&xfs_bmbt_buf_ops);
+	if (error)
+		return error;
+	cblock = XFS_BUF_TO_BLOCK(cbp);
+	if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
+		return error;
+	xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+	ip->i_d.di_nblocks--;
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_binval(tp, cbp);
+	if (cur->bc_bufs[0] == cbp)
+		cur->bc_bufs[0] = NULL;
+	xfs_iroot_realloc(ip, -1, whichfork);
+	ASSERT(ifp->if_broot == NULL);
+	ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+	return 0;
+}
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int					/* error */
+xfs_bmap_extents_to_btree(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first-block-allocated */
+	xfs_bmap_free_t		*flist,		/* blocks freed in xaction */
+	xfs_btree_cur_t		**curp,		/* cursor returned to caller */
+	int			wasdel,		/* converting a delayed alloc */
+	int			*logflagsp,	/* inode logging flags */
+	int			whichfork)	/* data or attr fork */
+{
+	struct xfs_btree_block	*ablock;	/* allocated (child) bt block */
+	xfs_buf_t		*abp;		/* buffer for ablock */
+	xfs_alloc_arg_t		args;		/* allocation arguments */
+	xfs_bmbt_rec_t		*arp;		/* child record pointer */
+	struct xfs_btree_block	*block;		/* btree root block */
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	xfs_bmbt_rec_host_t	*ep;		/* extent record pointer */
+	int			error;		/* error return value */
+	xfs_extnum_t		i, cnt;		/* extent record index */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	xfs_bmbt_key_t		*kp;		/* root block key pointer */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_extnum_t		nextents;	/* number of file extents */
+	xfs_bmbt_ptr_t		*pp;		/* root block address pointer */
+
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+
+	/*
+	 * Make space in the inode incore.
+	 */
+	xfs_iroot_realloc(ip, 1, whichfork);
+	ifp->if_flags |= XFS_IFBROOT;
+
+	/*
+	 * Fill in the root.
+	 */
+	block = ifp->if_broot;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+	else
+		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS);
+
+	/*
+	 * Need a cursor.  Can't allocate until bb_level is filled in.
+	 */
+	cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+	cur->bc_private.b.firstblock = *firstblock;
+	cur->bc_private.b.flist = flist;
+	cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+	/*
+	 * Convert to a btree with two levels, one record in root.
+	 */
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+	memset(&args, 0, sizeof(args));
+	args.tp = tp;
+	args.mp = mp;
+	args.firstblock = *firstblock;
+	if (*firstblock == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+	} else if (flist->xbf_low) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = *firstblock;
+	} else {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.fsbno = *firstblock;
+	}
+	args.minlen = args.maxlen = args.prod = 1;
+	args.wasdel = wasdel;
+	*logflagsp = 0;
+	if ((error = xfs_alloc_vextent(&args))) {
+		xfs_iroot_realloc(ip, -1, whichfork);
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+		return error;
+	}
+	/*
+	 * Allocation can't fail, the space was reserved.
+	 */
+	ASSERT(args.fsbno != NULLFSBLOCK);
+	ASSERT(*firstblock == NULLFSBLOCK ||
+	       args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+	       (flist->xbf_low &&
+		args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
+	cur->bc_private.b.allocated++;
+	ip->i_d.di_nblocks++;
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+	abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+	/*
+	 * Fill in the child block.
+	 */
+	abp->b_ops = &xfs_bmbt_buf_ops;
+	ablock = XFS_BUF_TO_BLOCK(abp);
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+				XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+				XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+	else
+		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+				XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+				XFS_BTREE_LONG_PTRS);
+
+	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (cnt = i = 0; i < nextents; i++) {
+		ep = xfs_iext_get_ext(ifp, i);
+		if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
+			arp->l0 = cpu_to_be64(ep->l0);
+			arp->l1 = cpu_to_be64(ep->l1);
+			arp++; cnt++;
+		}
+	}
+	ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
+	xfs_btree_set_numrecs(ablock, cnt);
+
+	/*
+	 * Fill in the root key and pointer.
+	 */
+	kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+	kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
+	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+						be16_to_cpu(block->bb_level)));
+	*pp = cpu_to_be64(args.fsbno);
+
+	/*
+	 * Do all this logging at the end so that
+	 * the root is at the right level.
+	 */
+	xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+	xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+	ASSERT(*curp == NULL);
+	*curp = cur;
+	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
+	return 0;
+}
+
+/*
+ * Convert a local file to an extents file.
+ * This code is out of bounds for data forks of regular files,
+ * since the file data needs to get logged so things will stay consistent.
+ * (The bmap-level manipulations are ok, though).
+ */
+STATIC int				/* error */
+xfs_bmap_local_to_extents(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */
+	xfs_extlen_t	total,		/* total blocks needed by transaction */
+	int		*logflagsp,	/* inode logging flags */
+	int		whichfork,
+	void		(*init_fn)(struct xfs_trans *tp,
+				   struct xfs_buf *bp,
+				   struct xfs_inode *ip,
+				   struct xfs_ifork *ifp))
+{
+	int		error;		/* error return value */
+	int		flags;		/* logging flags returned */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+
+	/*
+	 * We don't want to deal with the case of keeping inode data inline yet.
+	 * So sending the data fork of a regular inode is invalid.
+	 */
+	ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+	flags = 0;
+	error = 0;
+	if (ifp->if_bytes) {
+		xfs_alloc_arg_t	args;	/* allocation arguments */
+		xfs_buf_t	*bp;	/* buffer for extent block */
+		xfs_bmbt_rec_host_t *ep;/* extent record pointer */
+
+		ASSERT((ifp->if_flags &
+			(XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
+		memset(&args, 0, sizeof(args));
+		args.tp = tp;
+		args.mp = ip->i_mount;
+		args.firstblock = *firstblock;
+		/*
+		 * Allocate a block.  We know we need only one, since the
+		 * file currently fits in an inode.
+		 */
+		if (*firstblock == NULLFSBLOCK) {
+			args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+			args.type = XFS_ALLOCTYPE_START_BNO;
+		} else {
+			args.fsbno = *firstblock;
+			args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		}
+		args.total = total;
+		args.minlen = args.maxlen = args.prod = 1;
+		error = xfs_alloc_vextent(&args);
+		if (error)
+			goto done;
+
+		/* Can't fail, the space was reserved. */
+		ASSERT(args.fsbno != NULLFSBLOCK);
+		ASSERT(args.len == 1);
+		*firstblock = args.fsbno;
+		bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+
+		/* initialise the block and copy the data */
+		init_fn(tp, bp, ip, ifp);
+
+		/* account for the change in fork size and log everything */
+		xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+		xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
+		xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+		xfs_iext_add(ifp, 0, 1);
+		ep = xfs_iext_get_ext(ifp, 0);
+		xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+		trace_xfs_bmap_post_update(ip, 0,
+				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
+				_THIS_IP_);
+		XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+		ip->i_d.di_nblocks = 1;
+		xfs_trans_mod_dquot_byino(tp, ip,
+			XFS_TRANS_DQ_BCOUNT, 1L);
+		flags |= xfs_ilog_fext(whichfork);
+	} else {
+		ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+		xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
+	}
+	ifp->if_flags &= ~XFS_IFINLINE;
+	ifp->if_flags |= XFS_IFEXTENTS;
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+	flags |= XFS_ILOG_CORE;
+done:
+	*logflagsp = flags;
+	return error;
+}
+
+/*
  * Called from xfs_bmap_add_attrfork to handle btree format files.
  */
 STATIC int					/* error */
@@ -357,7 +1323,34 @@ xfs_bmap_add_attrfork_extents(
 }
 
 /*
- * Called from xfs_bmap_add_attrfork to handle local format files.
+ * Block initialisation function for local to extent format conversion.
+ *
+ * This shouldn't actually be called by anyone, so make sure debug kernels cause
+ * a noticable failure.
+ */
+STATIC void
+xfs_bmap_local_to_extents_init_fn(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	struct xfs_inode	*ip,
+	struct xfs_ifork	*ifp)
+{
+	ASSERT(0);
+	bp->b_ops = &xfs_bmbt_buf_ops;
+	memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle local format files. Each
+ * different data fork content type needs a different callout to do the
+ * conversion. Some are basic and only require special block initialisation
+ * callouts for the data formating, others (directories) are so specialised they
+ * handle everything themselves.
+ *
+ * XXX (dgc): investigate whether directory conversion can use the generic
+ * formatting callout. It should be possible - it's just a very complex
+ * formatter.
  */
 STATIC int					/* error */
 xfs_bmap_add_attrfork_local(
@@ -368,28 +1361,666 @@ xfs_bmap_add_attrfork_local(
 	int			*flags)		/* inode logging flags */
 {
 	xfs_da_args_t		dargs;		/* args for dir/attr code */
-	int			error;		/* error return value */
-	xfs_mount_t		*mp;		/* mount structure pointer */
 
 	if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
 		return 0;
+
 	if (S_ISDIR(ip->i_d.di_mode)) {
-		mp = ip->i_mount;
 		memset(&dargs, 0, sizeof(dargs));
 		dargs.dp = ip;
 		dargs.firstblock = firstblock;
 		dargs.flist = flist;
-		dargs.total = mp->m_dirblkfsbs;
+		dargs.total = ip->i_mount->m_dirblkfsbs;
 		dargs.whichfork = XFS_DATA_FORK;
 		dargs.trans = tp;
-		error = xfs_dir2_sf_to_block(&dargs);
-	} else
-		error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
-			XFS_DATA_FORK);
+		return xfs_dir2_sf_to_block(&dargs);
+	}
+
+	if (S_ISLNK(ip->i_d.di_mode))
+		return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
+						 flags, XFS_DATA_FORK,
+						 xfs_symlink_local_to_remote);
+
+	return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
+					 XFS_DATA_FORK,
+					 xfs_bmap_local_to_extents_init_fn);
+}
+
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int						/* error code */
+xfs_bmap_add_attrfork(
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			size,		/* space new attribute needs */
+	int			rsvd)		/* xact may use reserved blks */
+{
+	xfs_fsblock_t		firstblock;	/* 1st block/ag allocated */
+	xfs_bmap_free_t		flist;		/* freed extent records */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	int			blks;		/* space reservation */
+	int			version = 1;	/* superblock attr version */
+	int			committed;	/* xaction was committed */
+	int			logflags;	/* logging flags */
+	int			error;		/* error return value */
+
+	ASSERT(XFS_IFORK_Q(ip) == 0);
+
+	mp = ip->i_mount;
+	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+	tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+	blks = XFS_ADDAFORK_SPACE_RES(mp);
+	if (rsvd)
+		tp->t_flags |= XFS_TRANS_RESERVE;
+	if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
+		goto error0;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
+			XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+			XFS_QMOPT_RES_REGBLKS);
+	if (error) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+		return error;
+	}
+	if (XFS_IFORK_Q(ip))
+		goto error1;
+	if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+		/*
+		 * For inodes coming from pre-6.2 filesystems.
+		 */
+		ASSERT(ip->i_d.di_aformat == 0);
+		ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+	}
+	ASSERT(ip->i_d.di_anextents == 0);
+
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_DEV:
+		ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_UUID:
+		ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
+		ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+		if (!ip->i_d.di_forkoff)
+			ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+		else if (mp->m_flags & XFS_MOUNT_ATTR2)
+			version = 2;
+		break;
+	default:
+		ASSERT(0);
+		error = XFS_ERROR(EINVAL);
+		goto error1;
+	}
+
+	ASSERT(ip->i_afp == NULL);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+	ip->i_afp->if_flags = XFS_IFEXTENTS;
+	logflags = 0;
+	xfs_bmap_init(&flist, &firstblock);
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_LOCAL:
+		error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+			&logflags);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
+			&flist, &logflags);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+			&logflags);
+		break;
+	default:
+		error = 0;
+		break;
+	}
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
+	if (error)
+		goto error2;
+	if (!xfs_sb_version_hasattr(&mp->m_sb) ||
+	   (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
+		__int64_t sbfields = 0;
+
+		spin_lock(&mp->m_sb_lock);
+		if (!xfs_sb_version_hasattr(&mp->m_sb)) {
+			xfs_sb_version_addattr(&mp->m_sb);
+			sbfields |= XFS_SB_VERSIONNUM;
+		}
+		if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
+			xfs_sb_version_addattr2(&mp->m_sb);
+			sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+		}
+		if (sbfields) {
+			spin_unlock(&mp->m_sb_lock);
+			xfs_mod_sb(tp, sbfields);
+		} else
+			spin_unlock(&mp->m_sb_lock);
+	}
+
+	error = xfs_bmap_finish(&tp, &flist, &committed);
+	if (error)
+		goto error2;
+	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+error2:
+	xfs_bmap_cancel(&flist);
+error1:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+error0:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
 	return error;
 }
 
 /*
+ * Internal and external extent tree search functions.
+ */
+
+/*
+ * Read in the extents to if_extents.
+ * All inode fields are set up by caller, we just traverse the btree
+ * and copy the records in. If the file system cannot contain unwritten
+ * extents, the records are checked for no "state" flags.
+ */
+int					/* error */
+xfs_bmap_read_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode */
+	int			whichfork) /* data or attr fork */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_buf_t		*bp;	/* buffer for "block" */
+	int			error;	/* error return value */
+	xfs_exntfmt_t		exntf;	/* XFS_EXTFMT_NOSTATE, if checking */
+	xfs_extnum_t		i, j;	/* index into the extents list */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	__be64			*pp;	/* pointer to block address */
+	/* REFERENCED */
+	xfs_extnum_t		room;	/* number of entries there's room for */
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+					XFS_EXTFMT_INODE(ip);
+	block = ifp->if_broot;
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	ASSERT(level > 0);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+	bno = be64_to_cpu(*pp);
+	ASSERT(bno != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+	/*
+	 * Go down the tree until leaf level is reached, following the first
+	 * pointer (leftmost) at each level.
+	 */
+	while (level-- > 0) {
+		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+		if (error)
+			return error;
+		block = XFS_BUF_TO_BLOCK(bp);
+		XFS_WANT_CORRUPTED_GOTO(
+			xfs_bmap_sanity_check(mp, bp, level),
+			error0);
+		if (level == 0)
+			break;
+		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		bno = be64_to_cpu(*pp);
+		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+		xfs_trans_brelse(tp, bp);
+	}
+	/*
+	 * Here with bp and block set to the leftmost leaf node in the tree.
+	 */
+	room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	i = 0;
+	/*
+	 * Loop over all leaf nodes.  Copy information to the extent records.
+	 */
+	for (;;) {
+		xfs_bmbt_rec_t	*frp;
+		xfs_fsblock_t	nextbno;
+		xfs_extnum_t	num_recs;
+		xfs_extnum_t	start;
+
+		num_recs = xfs_btree_get_numrecs(block);
+		if (unlikely(i + num_recs > room)) {
+			ASSERT(i + num_recs <= room);
+			xfs_warn(ip->i_mount,
+				"corrupt dinode %Lu, (btree extents).",
+				(unsigned long long) ip->i_ino);
+			XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
+				XFS_ERRLEVEL_LOW, ip->i_mount, block);
+			goto error0;
+		}
+		XFS_WANT_CORRUPTED_GOTO(
+			xfs_bmap_sanity_check(mp, bp, 0),
+			error0);
+		/*
+		 * Read-ahead the next leaf block, if any.
+		 */
+		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+		if (nextbno != NULLFSBLOCK)
+			xfs_btree_reada_bufl(mp, nextbno, 1,
+					     &xfs_bmbt_buf_ops);
+		/*
+		 * Copy records into the extent records.
+		 */
+		frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+		start = i;
+		for (j = 0; j < num_recs; j++, i++, frp++) {
+			xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
+			trp->l0 = be64_to_cpu(frp->l0);
+			trp->l1 = be64_to_cpu(frp->l1);
+		}
+		if (exntf == XFS_EXTFMT_NOSTATE) {
+			/*
+			 * Check all attribute bmap btree records and
+			 * any "older" data bmap btree records for a
+			 * set bit in the "extent flag" position.
+			 */
+			if (unlikely(xfs_check_nostate_extents(ifp,
+					start, num_recs))) {
+				XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
+						 XFS_ERRLEVEL_LOW,
+						 ip->i_mount);
+				goto error0;
+			}
+		}
+		xfs_trans_brelse(tp, bp);
+		bno = nextbno;
+		/*
+		 * If we've reached the end, stop.
+		 */
+		if (bno == NULLFSBLOCK)
+			break;
+		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+		if (error)
+			return error;
+		block = XFS_BUF_TO_BLOCK(bp);
+	}
+	ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+	ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+	XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
+	return 0;
+error0:
+	xfs_trans_brelse(tp, bp);
+	return XFS_ERROR(EFSCORRUPTED);
+}
+
+
+/*
+ * Search the extent records for the entry containing block bno.
+ * If bno lies in a hole, point to the next entry.  If bno lies
+ * past eof, *eofp will be set, and *prevp will contain the last
+ * entry (null if none).  Else, *lastxp will be set to the index
+ * of the found entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t *		/* pointer to found extent entry */
+xfs_bmap_search_multi_extents(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_fileoff_t	bno,		/* block number searched for */
+	int		*eofp,		/* out: end of file found */
+	xfs_extnum_t	*lastxp,	/* out: last extent index */
+	xfs_bmbt_irec_t	*gotp,		/* out: extent entry found */
+	xfs_bmbt_irec_t	*prevp)		/* out: previous extent entry found */
+{
+	xfs_bmbt_rec_host_t *ep;		/* extent record pointer */
+	xfs_extnum_t	lastx;		/* last extent index */
+
+	/*
+	 * Initialize the extent entry structure to catch access to
+	 * uninitialized br_startblock field.
+	 */
+	gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
+	gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
+	gotp->br_state = XFS_EXT_INVALID;
+#if XFS_BIG_BLKNOS
+	gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
+#else
+	gotp->br_startblock = 0xffffa5a5;
+#endif
+	prevp->br_startoff = NULLFILEOFF;
+
+	ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
+	if (lastx > 0) {
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
+	}
+	if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+		xfs_bmbt_get_all(ep, gotp);
+		*eofp = 0;
+	} else {
+		if (lastx > 0) {
+			*gotp = *prevp;
+		}
+		*eofp = 1;
+		ep = NULL;
+	}
+	*lastxp = lastx;
+	return ep;
+}
+
+/*
+ * Search the extents list for the inode, for the extent containing bno.
+ * If bno lies in a hole, point to the next entry.  If bno lies past eof,
+ * *eofp will be set, and *prevp will contain the last entry (null if none).
+ * Else, *lastxp will be set to the index of the found
+ * entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */
+xfs_bmap_search_extents(
+	xfs_inode_t     *ip,            /* incore inode pointer */
+	xfs_fileoff_t   bno,            /* block number searched for */
+	int             fork,      	/* data or attr fork */
+	int             *eofp,          /* out: end of file found */
+	xfs_extnum_t    *lastxp,        /* out: last extent index */
+	xfs_bmbt_irec_t *gotp,          /* out: extent entry found */
+	xfs_bmbt_irec_t *prevp)         /* out: previous extent entry found */
+{
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_bmbt_rec_host_t  *ep;            /* extent record pointer */
+
+	XFS_STATS_INC(xs_look_exlist);
+	ifp = XFS_IFORK_PTR(ip, fork);
+
+	ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
+
+	if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
+		     !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
+		xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+				"Access to block zero in inode %llu "
+				"start_block: %llx start_off: %llx "
+				"blkcnt: %llx extent-state: %x lastx: %x\n",
+			(unsigned long long)ip->i_ino,
+			(unsigned long long)gotp->br_startblock,
+			(unsigned long long)gotp->br_startoff,
+			(unsigned long long)gotp->br_blockcount,
+			gotp->br_state, *lastxp);
+		*lastxp = NULLEXTNUM;
+		*eofp = 1;
+		return NULL;
+	}
+	return ep;
+}
+
+/*
+ * Returns the file-relative block number of the first unused block(s)
+ * in the file with at least "len" logically contiguous blocks free.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ * Return 0 if the file is currently local (in-inode).
+ */
+int						/* error */
+xfs_bmap_first_unused(
+	xfs_trans_t	*tp,			/* transaction pointer */
+	xfs_inode_t	*ip,			/* incore inode */
+	xfs_extlen_t	len,			/* size of hole to find */
+	xfs_fileoff_t	*first_unused,		/* unused block */
+	int		whichfork)		/* data or attr fork */
+{
+	int		error;			/* error return value */
+	int		idx;			/* extent record index */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_fileoff_t	lastaddr;		/* last block number seen */
+	xfs_fileoff_t	lowest;			/* lowest useful block */
+	xfs_fileoff_t	max;			/* starting useful block */
+	xfs_fileoff_t	off;			/* offset for this block */
+	xfs_extnum_t	nextents;		/* number of extent entries */
+
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
+	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
+	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		*first_unused = 0;
+		return 0;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	lowest = *first_unused;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
+		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
+		off = xfs_bmbt_get_startoff(ep);
+		/*
+		 * See if the hole before this extent will work.
+		 */
+		if (off >= lowest + len && off - max >= len) {
+			*first_unused = max;
+			return 0;
+		}
+		lastaddr = off + xfs_bmbt_get_blockcount(ep);
+		max = XFS_FILEOFF_MAX(lastaddr, lowest);
+	}
+	*first_unused = max;
+	return 0;
+}
+
+/*
+ * Returns the file-relative block number of the last block + 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int						/* error */
+xfs_bmap_last_before(
+	xfs_trans_t	*tp,			/* transaction pointer */
+	xfs_inode_t	*ip,			/* incore inode */
+	xfs_fileoff_t	*last_block,		/* last block */
+	int		whichfork)		/* data or attr fork */
+{
+	xfs_fileoff_t	bno;			/* input file offset */
+	int		eof;			/* hit end of file */
+	xfs_bmbt_rec_host_t *ep;		/* pointer to last extent */
+	int		error;			/* error return value */
+	xfs_bmbt_irec_t	got;			/* current extent value */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_extnum_t	lastx;			/* last extent used */
+	xfs_bmbt_irec_t	prev;			/* previous extent value */
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+	       return XFS_ERROR(EIO);
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		*last_block = 0;
+		return 0;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	bno = *last_block - 1;
+	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+		&prev);
+	if (eof || xfs_bmbt_get_startoff(ep) > bno) {
+		if (prev.br_startoff == NULLFILEOFF)
+			*last_block = 0;
+		else
+			*last_block = prev.br_startoff + prev.br_blockcount;
+	}
+	/*
+	 * Otherwise *last_block is already the right answer.
+	 */
+	return 0;
+}
+
+STATIC int
+xfs_bmap_last_extent(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	struct xfs_bmbt_irec	*rec,
+	int			*is_empty)
+{
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	int			error;
+	int			nextents;
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*is_empty = 1;
+		return 0;
+	}
+
+	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
+	*is_empty = 0;
+	return 0;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ *
+ * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be
+ * at, or past the EOF.
+ */
+STATIC int
+xfs_bmap_isaeof(
+	struct xfs_bmalloca	*bma,
+	int			whichfork)
+{
+	struct xfs_bmbt_irec	rec;
+	int			is_empty;
+	int			error;
+
+	bma->aeof = 0;
+	error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
+				     &is_empty);
+	if (error || is_empty)
+		return error;
+
+	/*
+	 * Check if we are allocation or past the last extent, or at least into
+	 * the last delayed allocated extent.
+	 */
+	bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
+		(bma->offset >= rec.br_startoff &&
+		 isnullstartblock(rec.br_startblock));
+	return 0;
+}
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary.  All offsets are considered outside
+ * the end of file for an empty fork, so 1 is returned in *eof in that case.
+ */
+int
+xfs_bmap_eof(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		endoff,
+	int			whichfork,
+	int			*eof)
+{
+	struct xfs_bmbt_irec	rec;
+	int			error;
+
+	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
+	if (error || *eof)
+		return error;
+
+	*eof = endoff >= rec.br_startoff + rec.br_blockcount;
+	return 0;
+}
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file.  This is not based on i_size, it is based on the extent records.
+ * Returns 0 for local files, as they do not have extent records.
+ */
+int
+xfs_bmap_last_offset(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		*last_block,
+	int			whichfork)
+{
+	struct xfs_bmbt_irec	rec;
+	int			is_empty;
+	int			error;
+
+	*last_block = 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
+		return 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+	       return XFS_ERROR(EIO);
+
+	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
+	if (error || is_empty)
+		return error;
+
+	*last_block = rec.br_startoff + rec.br_blockcount;
+	return 0;
+}
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not.  For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ */
+int					/* 1=>1 block, 0=>otherwise */
+xfs_bmap_one_block(
+	xfs_inode_t	*ip,		/* incore inode */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_host_t *ep;	/* ptr to fork's extent */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	int		rval;		/* return value */
+	xfs_bmbt_irec_t	s;		/* internal version of extent */
+
+#ifndef DEBUG
+	if (whichfork == XFS_DATA_FORK)
+		return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
+#endif	/* !DEBUG */
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
+		return 0;
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		return 0;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ep = xfs_iext_get_ext(ifp, 0);
+	xfs_bmbt_get_all(ep, &s);
+	rval = s.br_startoff == 0 && s.br_blockcount == 1;
+	if (rval && whichfork == XFS_DATA_FORK)
+		ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
+	return rval;
+}
+
+/*
+ * Extent tree manipulation functions used during allocation.
+ */
+
+/*
  * Convert a delayed allocation to a real allocation.
  */
 STATIC int				/* error */
@@ -1852,6 +3483,10 @@ done:
 }
 
 /*
+ * Functions used in the extent read, allocate and remove paths
+ */
+
+/*
  * Adjust the size of the new extent based on di_extsize and rt extsize.
  */
 STATIC int
@@ -2624,1626 +4259,6 @@ xfs_bmap_alloc(
 }
 
 /*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
- */
-STATIC int				/* error */
-xfs_bmap_btree_to_extents(
-	xfs_trans_t		*tp,	/* transaction pointer */
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			*logflagsp, /* inode logging flags */
-	int			whichfork)  /* data or attr fork */
-{
-	/* REFERENCED */
-	struct xfs_btree_block	*cblock;/* child btree block */
-	xfs_fsblock_t		cbno;	/* child block number */
-	xfs_buf_t		*cbp;	/* child block's buffer */
-	int			error;	/* error return value */
-	xfs_ifork_t		*ifp;	/* inode fork data */
-	xfs_mount_t		*mp;	/* mount point structure */
-	__be64			*pp;	/* ptr to block address */
-	struct xfs_btree_block	*rblock;/* root btree block */
-
-	mp = ip->i_mount;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
-	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
-	rblock = ifp->if_broot;
-	ASSERT(be16_to_cpu(rblock->bb_level) == 1);
-	ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
-	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
-	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
-	cbno = be64_to_cpu(*pp);
-	*logflagsp = 0;
-#ifdef DEBUG
-	if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
-		return error;
-#endif
-	error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
-				&xfs_bmbt_buf_ops);
-	if (error)
-		return error;
-	cblock = XFS_BUF_TO_BLOCK(cbp);
-	if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
-		return error;
-	xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
-	ip->i_d.di_nblocks--;
-	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
-	xfs_trans_binval(tp, cbp);
-	if (cur->bc_bufs[0] == cbp)
-		cur->bc_bufs[0] = NULL;
-	xfs_iroot_realloc(ip, -1, whichfork);
-	ASSERT(ifp->if_broot == NULL);
-	ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
-	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
-	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
-	return 0;
-}
-
-/*
- * Called by xfs_bmapi to update file extent records and the btree
- * after removing space (or undoing a delayed allocation).
- */
-STATIC int				/* error */
-xfs_bmap_del_extent(
-	xfs_inode_t		*ip,	/* incore inode pointer */
-	xfs_trans_t		*tp,	/* current transaction pointer */
-	xfs_extnum_t		*idx,	/* extent number to update/delete */
-	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
-	xfs_btree_cur_t		*cur,	/* if null, not a btree */
-	xfs_bmbt_irec_t		*del,	/* data to remove from extents */
-	int			*logflagsp, /* inode logging flags */
-	int			whichfork) /* data or attr fork */
-{
-	xfs_filblks_t		da_new;	/* new delay-alloc indirect blocks */
-	xfs_filblks_t		da_old;	/* old delay-alloc indirect blocks */
-	xfs_fsblock_t		del_endblock=0;	/* first block past del */
-	xfs_fileoff_t		del_endoff;	/* first offset past del */
-	int			delay;	/* current block is delayed allocated */
-	int			do_fx;	/* free extent at end of routine */
-	xfs_bmbt_rec_host_t	*ep;	/* current extent entry pointer */
-	int			error;	/* error return value */
-	int			flags;	/* inode logging flags */
-	xfs_bmbt_irec_t		got;	/* current extent entry */
-	xfs_fileoff_t		got_endoff;	/* first offset past got */
-	int			i;	/* temp state */
-	xfs_ifork_t		*ifp;	/* inode fork pointer */
-	xfs_mount_t		*mp;	/* mount structure */
-	xfs_filblks_t		nblks;	/* quota/sb block count */
-	xfs_bmbt_irec_t		new;	/* new record to be inserted */
-	/* REFERENCED */
-	uint			qfield;	/* quota field to update */
-	xfs_filblks_t		temp;	/* for indirect length calculations */
-	xfs_filblks_t		temp2;	/* for indirect length calculations */
-	int			state = 0;
-
-	XFS_STATS_INC(xs_del_exlist);
-
-	if (whichfork == XFS_ATTR_FORK)
-		state |= BMAP_ATTRFORK;
-
-	mp = ip->i_mount;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
-		(uint)sizeof(xfs_bmbt_rec_t)));
-	ASSERT(del->br_blockcount > 0);
-	ep = xfs_iext_get_ext(ifp, *idx);
-	xfs_bmbt_get_all(ep, &got);
-	ASSERT(got.br_startoff <= del->br_startoff);
-	del_endoff = del->br_startoff + del->br_blockcount;
-	got_endoff = got.br_startoff + got.br_blockcount;
-	ASSERT(got_endoff >= del_endoff);
-	delay = isnullstartblock(got.br_startblock);
-	ASSERT(isnullstartblock(del->br_startblock) == delay);
-	flags = 0;
-	qfield = 0;
-	error = 0;
-	/*
-	 * If deleting a real allocation, must free up the disk space.
-	 */
-	if (!delay) {
-		flags = XFS_ILOG_CORE;
-		/*
-		 * Realtime allocation.  Free it and record di_nblocks update.
-		 */
-		if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
-			xfs_fsblock_t	bno;
-			xfs_filblks_t	len;
-
-			ASSERT(do_mod(del->br_blockcount,
-				      mp->m_sb.sb_rextsize) == 0);
-			ASSERT(do_mod(del->br_startblock,
-				      mp->m_sb.sb_rextsize) == 0);
-			bno = del->br_startblock;
-			len = del->br_blockcount;
-			do_div(bno, mp->m_sb.sb_rextsize);
-			do_div(len, mp->m_sb.sb_rextsize);
-			error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
-			if (error)
-				goto done;
-			do_fx = 0;
-			nblks = len * mp->m_sb.sb_rextsize;
-			qfield = XFS_TRANS_DQ_RTBCOUNT;
-		}
-		/*
-		 * Ordinary allocation.
-		 */
-		else {
-			do_fx = 1;
-			nblks = del->br_blockcount;
-			qfield = XFS_TRANS_DQ_BCOUNT;
-		}
-		/*
-		 * Set up del_endblock and cur for later.
-		 */
-		del_endblock = del->br_startblock + del->br_blockcount;
-		if (cur) {
-			if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
-					got.br_startblock, got.br_blockcount,
-					&i)))
-				goto done;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-		}
-		da_old = da_new = 0;
-	} else {
-		da_old = startblockval(got.br_startblock);
-		da_new = 0;
-		nblks = 0;
-		do_fx = 0;
-	}
-	/*
-	 * Set flag value to use in switch statement.
-	 * Left-contig is 2, right-contig is 1.
-	 */
-	switch (((got.br_startoff == del->br_startoff) << 1) |
-		(got_endoff == del_endoff)) {
-	case 3:
-		/*
-		 * Matches the whole extent.  Delete the entry.
-		 */
-		xfs_iext_remove(ip, *idx, 1,
-				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
-		--*idx;
-		if (delay)
-			break;
-
-		XFS_IFORK_NEXT_SET(ip, whichfork,
-			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
-		flags |= XFS_ILOG_CORE;
-		if (!cur) {
-			flags |= xfs_ilog_fext(whichfork);
-			break;
-		}
-		if ((error = xfs_btree_delete(cur, &i)))
-			goto done;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-		break;
-
-	case 2:
-		/*
-		 * Deleting the first part of the extent.
-		 */
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_startoff(ep, del_endoff);
-		temp = got.br_blockcount - del->br_blockcount;
-		xfs_bmbt_set_blockcount(ep, temp);
-		if (delay) {
-			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-				da_old);
-			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-			trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-			da_new = temp;
-			break;
-		}
-		xfs_bmbt_set_startblock(ep, del_endblock);
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-		if (!cur) {
-			flags |= xfs_ilog_fext(whichfork);
-			break;
-		}
-		if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
-				got.br_blockcount - del->br_blockcount,
-				got.br_state)))
-			goto done;
-		break;
-
-	case 1:
-		/*
-		 * Deleting the last part of the extent.
-		 */
-		temp = got.br_blockcount - del->br_blockcount;
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(ep, temp);
-		if (delay) {
-			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-				da_old);
-			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-			trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-			da_new = temp;
-			break;
-		}
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-		if (!cur) {
-			flags |= xfs_ilog_fext(whichfork);
-			break;
-		}
-		if ((error = xfs_bmbt_update(cur, got.br_startoff,
-				got.br_startblock,
-				got.br_blockcount - del->br_blockcount,
-				got.br_state)))
-			goto done;
-		break;
-
-	case 0:
-		/*
-		 * Deleting the middle of the extent.
-		 */
-		temp = del->br_startoff - got.br_startoff;
-		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(ep, temp);
-		new.br_startoff = del_endoff;
-		temp2 = got_endoff - del_endoff;
-		new.br_blockcount = temp2;
-		new.br_state = got.br_state;
-		if (!delay) {
-			new.br_startblock = del_endblock;
-			flags |= XFS_ILOG_CORE;
-			if (cur) {
-				if ((error = xfs_bmbt_update(cur,
-						got.br_startoff,
-						got.br_startblock, temp,
-						got.br_state)))
-					goto done;
-				if ((error = xfs_btree_increment(cur, 0, &i)))
-					goto done;
-				cur->bc_rec.b = new;
-				error = xfs_btree_insert(cur, &i);
-				if (error && error != ENOSPC)
-					goto done;
-				/*
-				 * If get no-space back from btree insert,
-				 * it tried a split, and we have a zero
-				 * block reservation.
-				 * Fix up our state and return the error.
-				 */
-				if (error == ENOSPC) {
-					/*
-					 * Reset the cursor, don't trust
-					 * it after any insert operation.
-					 */
-					if ((error = xfs_bmbt_lookup_eq(cur,
-							got.br_startoff,
-							got.br_startblock,
-							temp, &i)))
-						goto done;
-					XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-					/*
-					 * Update the btree record back
-					 * to the original value.
-					 */
-					if ((error = xfs_bmbt_update(cur,
-							got.br_startoff,
-							got.br_startblock,
-							got.br_blockcount,
-							got.br_state)))
-						goto done;
-					/*
-					 * Reset the extent record back
-					 * to the original value.
-					 */
-					xfs_bmbt_set_blockcount(ep,
-						got.br_blockcount);
-					flags = 0;
-					error = XFS_ERROR(ENOSPC);
-					goto done;
-				}
-				XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			} else
-				flags |= xfs_ilog_fext(whichfork);
-			XFS_IFORK_NEXT_SET(ip, whichfork,
-				XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
-		} else {
-			ASSERT(whichfork == XFS_DATA_FORK);
-			temp = xfs_bmap_worst_indlen(ip, temp);
-			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
-			temp2 = xfs_bmap_worst_indlen(ip, temp2);
-			new.br_startblock = nullstartblock((int)temp2);
-			da_new = temp + temp2;
-			while (da_new > da_old) {
-				if (temp) {
-					temp--;
-					da_new--;
-					xfs_bmbt_set_startblock(ep,
-						nullstartblock((int)temp));
-				}
-				if (da_new == da_old)
-					break;
-				if (temp2) {
-					temp2--;
-					da_new--;
-					new.br_startblock =
-						nullstartblock((int)temp2);
-				}
-			}
-		}
-		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
-		xfs_iext_insert(ip, *idx + 1, 1, &new, state);
-		++*idx;
-		break;
-	}
-	/*
-	 * If we need to, add to list of extents to delete.
-	 */
-	if (do_fx)
-		xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
-			mp);
-	/*
-	 * Adjust inode # blocks in the file.
-	 */
-	if (nblks)
-		ip->i_d.di_nblocks -= nblks;
-	/*
-	 * Adjust quota data.
-	 */
-	if (qfield)
-		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
-
-	/*
-	 * Account for change in delayed indirect blocks.
-	 * Nothing to do for disk quota accounting here.
-	 */
-	ASSERT(da_old >= da_new);
-	if (da_old > da_new) {
-		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-			(int64_t)(da_old - da_new), 0);
-	}
-done:
-	*logflagsp = flags;
-	return error;
-}
-
-/*
- * Remove the entry "free" from the free item list.  Prev points to the
- * previous entry, unless "free" is the head of the list.
- */
-STATIC void
-xfs_bmap_del_free(
-	xfs_bmap_free_t		*flist,	/* free item list header */
-	xfs_bmap_free_item_t	*prev,	/* previous item on list, if any */
-	xfs_bmap_free_item_t	*free)	/* list item to be freed */
-{
-	if (prev)
-		prev->xbfi_next = free->xbfi_next;
-	else
-		flist->xbf_first = free->xbfi_next;
-	flist->xbf_count--;
-	kmem_zone_free(xfs_bmap_free_item_zone, free);
-}
-
-/*
- * Convert an extents-format file into a btree-format file.
- * The new file will have a root block (in the inode) and a single child block.
- */
-STATIC int					/* error */
-xfs_bmap_extents_to_btree(
-	xfs_trans_t		*tp,		/* transaction pointer */
-	xfs_inode_t		*ip,		/* incore inode pointer */
-	xfs_fsblock_t		*firstblock,	/* first-block-allocated */
-	xfs_bmap_free_t		*flist,		/* blocks freed in xaction */
-	xfs_btree_cur_t		**curp,		/* cursor returned to caller */
-	int			wasdel,		/* converting a delayed alloc */
-	int			*logflagsp,	/* inode logging flags */
-	int			whichfork)	/* data or attr fork */
-{
-	struct xfs_btree_block	*ablock;	/* allocated (child) bt block */
-	xfs_buf_t		*abp;		/* buffer for ablock */
-	xfs_alloc_arg_t		args;		/* allocation arguments */
-	xfs_bmbt_rec_t		*arp;		/* child record pointer */
-	struct xfs_btree_block	*block;		/* btree root block */
-	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
-	xfs_bmbt_rec_host_t	*ep;		/* extent record pointer */
-	int			error;		/* error return value */
-	xfs_extnum_t		i, cnt;		/* extent record index */
-	xfs_ifork_t		*ifp;		/* inode fork pointer */
-	xfs_bmbt_key_t		*kp;		/* root block key pointer */
-	xfs_mount_t		*mp;		/* mount structure */
-	xfs_extnum_t		nextents;	/* number of file extents */
-	xfs_bmbt_ptr_t		*pp;		/* root block address pointer */
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
-
-	/*
-	 * Make space in the inode incore.
-	 */
-	xfs_iroot_realloc(ip, 1, whichfork);
-	ifp->if_flags |= XFS_IFBROOT;
-
-	/*
-	 * Fill in the root.
-	 */
-	block = ifp->if_broot;
-	block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
-	block->bb_level = cpu_to_be16(1);
-	block->bb_numrecs = cpu_to_be16(1);
-	block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
-	block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
-
-	/*
-	 * Need a cursor.  Can't allocate until bb_level is filled in.
-	 */
-	mp = ip->i_mount;
-	cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
-	cur->bc_private.b.firstblock = *firstblock;
-	cur->bc_private.b.flist = flist;
-	cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
-	/*
-	 * Convert to a btree with two levels, one record in root.
-	 */
-	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
-	memset(&args, 0, sizeof(args));
-	args.tp = tp;
-	args.mp = mp;
-	args.firstblock = *firstblock;
-	if (*firstblock == NULLFSBLOCK) {
-		args.type = XFS_ALLOCTYPE_START_BNO;
-		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
-	} else if (flist->xbf_low) {
-		args.type = XFS_ALLOCTYPE_START_BNO;
-		args.fsbno = *firstblock;
-	} else {
-		args.type = XFS_ALLOCTYPE_NEAR_BNO;
-		args.fsbno = *firstblock;
-	}
-	args.minlen = args.maxlen = args.prod = 1;
-	args.total = args.minleft = args.alignment = args.mod = args.isfl =
-		args.minalignslop = 0;
-	args.wasdel = wasdel;
-	*logflagsp = 0;
-	if ((error = xfs_alloc_vextent(&args))) {
-		xfs_iroot_realloc(ip, -1, whichfork);
-		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-		return error;
-	}
-	/*
-	 * Allocation can't fail, the space was reserved.
-	 */
-	ASSERT(args.fsbno != NULLFSBLOCK);
-	ASSERT(*firstblock == NULLFSBLOCK ||
-	       args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
-	       (flist->xbf_low &&
-		args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
-	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
-	cur->bc_private.b.allocated++;
-	ip->i_d.di_nblocks++;
-	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
-	abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
-	/*
-	 * Fill in the child block.
-	 */
-	abp->b_ops = &xfs_bmbt_buf_ops;
-	ablock = XFS_BUF_TO_BLOCK(abp);
-	ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
-	ablock->bb_level = 0;
-	ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
-	ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
-	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	for (cnt = i = 0; i < nextents; i++) {
-		ep = xfs_iext_get_ext(ifp, i);
-		if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
-			arp->l0 = cpu_to_be64(ep->l0);
-			arp->l1 = cpu_to_be64(ep->l1);
-			arp++; cnt++;
-		}
-	}
-	ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
-	xfs_btree_set_numrecs(ablock, cnt);
-
-	/*
-	 * Fill in the root key and pointer.
-	 */
-	kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
-	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
-	kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
-	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
-						be16_to_cpu(block->bb_level)));
-	*pp = cpu_to_be64(args.fsbno);
-
-	/*
-	 * Do all this logging at the end so that
-	 * the root is at the right level.
-	 */
-	xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
-	xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
-	ASSERT(*curp == NULL);
-	*curp = cur;
-	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
-	return 0;
-}
-
-/*
- * Calculate the default attribute fork offset for newly created inodes.
- */
-uint
-xfs_default_attroffset(
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	uint			offset;
-
-	if (mp->m_sb.sb_inodesize == 256) {
-		offset = XFS_LITINO(mp) -
-				XFS_BMDR_SPACE_CALC(MINABTPTRS);
-	} else {
-		offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
-	}
-
-	ASSERT(offset < XFS_LITINO(mp));
-	return offset;
-}
-
-/*
- * Helper routine to reset inode di_forkoff field when switching
- * attribute fork from local to extent format - we reset it where
- * possible to make space available for inline data fork extents.
- */
-STATIC void
-xfs_bmap_forkoff_reset(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip,
-	int		whichfork)
-{
-	if (whichfork == XFS_ATTR_FORK &&
-	    ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
-	    ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
-	    ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
-		uint	dfl_forkoff = xfs_default_attroffset(ip) >> 3;
-
-		if (dfl_forkoff > ip->i_d.di_forkoff)
-			ip->i_d.di_forkoff = dfl_forkoff;
-	}
-}
-
-/*
- * Convert a local file to an extents file.
- * This code is out of bounds for data forks of regular files,
- * since the file data needs to get logged so things will stay consistent.
- * (The bmap-level manipulations are ok, though).
- */
-STATIC int				/* error */
-xfs_bmap_local_to_extents(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */
-	xfs_extlen_t	total,		/* total blocks needed by transaction */
-	int		*logflagsp,	/* inode logging flags */
-	int		whichfork)	/* data or attr fork */
-{
-	int		error;		/* error return value */
-	int		flags;		/* logging flags returned */
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-
-	/*
-	 * We don't want to deal with the case of keeping inode data inline yet.
-	 * So sending the data fork of a regular inode is invalid.
-	 */
-	ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
-	flags = 0;
-	error = 0;
-	if (ifp->if_bytes) {
-		xfs_alloc_arg_t	args;	/* allocation arguments */
-		xfs_buf_t	*bp;	/* buffer for extent block */
-		xfs_bmbt_rec_host_t *ep;/* extent record pointer */
-
-		memset(&args, 0, sizeof(args));
-		args.tp = tp;
-		args.mp = ip->i_mount;
-		args.firstblock = *firstblock;
-		ASSERT((ifp->if_flags &
-			(XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
-		/*
-		 * Allocate a block.  We know we need only one, since the
-		 * file currently fits in an inode.
-		 */
-		if (*firstblock == NULLFSBLOCK) {
-			args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
-			args.type = XFS_ALLOCTYPE_START_BNO;
-		} else {
-			args.fsbno = *firstblock;
-			args.type = XFS_ALLOCTYPE_NEAR_BNO;
-		}
-		args.total = total;
-		args.mod = args.minleft = args.alignment = args.wasdel =
-			args.isfl = args.minalignslop = 0;
-		args.minlen = args.maxlen = args.prod = 1;
-		if ((error = xfs_alloc_vextent(&args)))
-			goto done;
-		/*
-		 * Can't fail, the space was reserved.
-		 */
-		ASSERT(args.fsbno != NULLFSBLOCK);
-		ASSERT(args.len == 1);
-		*firstblock = args.fsbno;
-		bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
-		bp->b_ops = &xfs_bmbt_buf_ops;
-		memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
-		xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
-		xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
-		xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
-		xfs_iext_add(ifp, 0, 1);
-		ep = xfs_iext_get_ext(ifp, 0);
-		xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
-		trace_xfs_bmap_post_update(ip, 0,
-				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
-				_THIS_IP_);
-		XFS_IFORK_NEXT_SET(ip, whichfork, 1);
-		ip->i_d.di_nblocks = 1;
-		xfs_trans_mod_dquot_byino(tp, ip,
-			XFS_TRANS_DQ_BCOUNT, 1L);
-		flags |= xfs_ilog_fext(whichfork);
-	} else {
-		ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
-		xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
-	}
-	ifp->if_flags &= ~XFS_IFINLINE;
-	ifp->if_flags |= XFS_IFEXTENTS;
-	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
-	flags |= XFS_ILOG_CORE;
-done:
-	*logflagsp = flags;
-	return error;
-}
-
-/*
- * Search the extent records for the entry containing block bno.
- * If bno lies in a hole, point to the next entry.  If bno lies
- * past eof, *eofp will be set, and *prevp will contain the last
- * entry (null if none).  Else, *lastxp will be set to the index
- * of the found entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t *		/* pointer to found extent entry */
-xfs_bmap_search_multi_extents(
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	xfs_fileoff_t	bno,		/* block number searched for */
-	int		*eofp,		/* out: end of file found */
-	xfs_extnum_t	*lastxp,	/* out: last extent index */
-	xfs_bmbt_irec_t	*gotp,		/* out: extent entry found */
-	xfs_bmbt_irec_t	*prevp)		/* out: previous extent entry found */
-{
-	xfs_bmbt_rec_host_t *ep;		/* extent record pointer */
-	xfs_extnum_t	lastx;		/* last extent index */
-
-	/*
-	 * Initialize the extent entry structure to catch access to
-	 * uninitialized br_startblock field.
-	 */
-	gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
-	gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
-	gotp->br_state = XFS_EXT_INVALID;
-#if XFS_BIG_BLKNOS
-	gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
-#else
-	gotp->br_startblock = 0xffffa5a5;
-#endif
-	prevp->br_startoff = NULLFILEOFF;
-
-	ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
-	if (lastx > 0) {
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
-	}
-	if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
-		xfs_bmbt_get_all(ep, gotp);
-		*eofp = 0;
-	} else {
-		if (lastx > 0) {
-			*gotp = *prevp;
-		}
-		*eofp = 1;
-		ep = NULL;
-	}
-	*lastxp = lastx;
-	return ep;
-}
-
-/*
- * Search the extents list for the inode, for the extent containing bno.
- * If bno lies in a hole, point to the next entry.  If bno lies past eof,
- * *eofp will be set, and *prevp will contain the last entry (null if none).
- * Else, *lastxp will be set to the index of the found
- * entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t *                 /* pointer to found extent entry */
-xfs_bmap_search_extents(
-	xfs_inode_t     *ip,            /* incore inode pointer */
-	xfs_fileoff_t   bno,            /* block number searched for */
-	int             fork,      	/* data or attr fork */
-	int             *eofp,          /* out: end of file found */
-	xfs_extnum_t    *lastxp,        /* out: last extent index */
-	xfs_bmbt_irec_t *gotp,          /* out: extent entry found */
-	xfs_bmbt_irec_t *prevp)         /* out: previous extent entry found */
-{
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-	xfs_bmbt_rec_host_t  *ep;            /* extent record pointer */
-
-	XFS_STATS_INC(xs_look_exlist);
-	ifp = XFS_IFORK_PTR(ip, fork);
-
-	ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
-
-	if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
-		     !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
-		xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
-				"Access to block zero in inode %llu "
-				"start_block: %llx start_off: %llx "
-				"blkcnt: %llx extent-state: %x lastx: %x\n",
-			(unsigned long long)ip->i_ino,
-			(unsigned long long)gotp->br_startblock,
-			(unsigned long long)gotp->br_startoff,
-			(unsigned long long)gotp->br_blockcount,
-			gotp->br_state, *lastxp);
-		*lastxp = NULLEXTNUM;
-		*eofp = 1;
-		return NULL;
-	}
-	return ep;
-}
-
-/*
- * Compute the worst-case number of indirect blocks that will be used
- * for ip's delayed extent of length "len".
- */
-STATIC xfs_filblks_t
-xfs_bmap_worst_indlen(
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_filblks_t	len)		/* delayed extent length */
-{
-	int		level;		/* btree level number */
-	int		maxrecs;	/* maximum record count at this level */
-	xfs_mount_t	*mp;		/* mount structure */
-	xfs_filblks_t	rval;		/* return value */
-
-	mp = ip->i_mount;
-	maxrecs = mp->m_bmap_dmxr[0];
-	for (level = 0, rval = 0;
-	     level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
-	     level++) {
-		len += maxrecs - 1;
-		do_div(len, maxrecs);
-		rval += len;
-		if (len == 1)
-			return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
-				level - 1;
-		if (level == 0)
-			maxrecs = mp->m_bmap_dmxr[1];
-	}
-	return rval;
-}
-
-/*
- * Convert inode from non-attributed to attributed.
- * Must not be in a transaction, ip must not be locked.
- */
-int						/* error code */
-xfs_bmap_add_attrfork(
-	xfs_inode_t		*ip,		/* incore inode pointer */
-	int			size,		/* space new attribute needs */
-	int			rsvd)		/* xact may use reserved blks */
-{
-	xfs_fsblock_t		firstblock;	/* 1st block/ag allocated */
-	xfs_bmap_free_t		flist;		/* freed extent records */
-	xfs_mount_t		*mp;		/* mount structure */
-	xfs_trans_t		*tp;		/* transaction pointer */
-	int			blks;		/* space reservation */
-	int			version = 1;	/* superblock attr version */
-	int			committed;	/* xaction was committed */
-	int			logflags;	/* logging flags */
-	int			error;		/* error return value */
-
-	ASSERT(XFS_IFORK_Q(ip) == 0);
-
-	mp = ip->i_mount;
-	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
-	tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
-	blks = XFS_ADDAFORK_SPACE_RES(mp);
-	if (rsvd)
-		tp->t_flags |= XFS_TRANS_RESERVE;
-	if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
-		goto error0;
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
-			XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
-			XFS_QMOPT_RES_REGBLKS);
-	if (error) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
-		return error;
-	}
-	if (XFS_IFORK_Q(ip))
-		goto error1;
-	if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
-		/*
-		 * For inodes coming from pre-6.2 filesystems.
-		 */
-		ASSERT(ip->i_d.di_aformat == 0);
-		ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
-	}
-	ASSERT(ip->i_d.di_anextents == 0);
-
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-	switch (ip->i_d.di_format) {
-	case XFS_DINODE_FMT_DEV:
-		ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
-		break;
-	case XFS_DINODE_FMT_UUID:
-		ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
-		break;
-	case XFS_DINODE_FMT_LOCAL:
-	case XFS_DINODE_FMT_EXTENTS:
-	case XFS_DINODE_FMT_BTREE:
-		ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
-		if (!ip->i_d.di_forkoff)
-			ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
-		else if (mp->m_flags & XFS_MOUNT_ATTR2)
-			version = 2;
-		break;
-	default:
-		ASSERT(0);
-		error = XFS_ERROR(EINVAL);
-		goto error1;
-	}
-
-	ASSERT(ip->i_afp == NULL);
-	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
-	ip->i_afp->if_flags = XFS_IFEXTENTS;
-	logflags = 0;
-	xfs_bmap_init(&flist, &firstblock);
-	switch (ip->i_d.di_format) {
-	case XFS_DINODE_FMT_LOCAL:
-		error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
-			&logflags);
-		break;
-	case XFS_DINODE_FMT_EXTENTS:
-		error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
-			&flist, &logflags);
-		break;
-	case XFS_DINODE_FMT_BTREE:
-		error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
-			&logflags);
-		break;
-	default:
-		error = 0;
-		break;
-	}
-	if (logflags)
-		xfs_trans_log_inode(tp, ip, logflags);
-	if (error)
-		goto error2;
-	if (!xfs_sb_version_hasattr(&mp->m_sb) ||
-	   (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
-		__int64_t sbfields = 0;
-
-		spin_lock(&mp->m_sb_lock);
-		if (!xfs_sb_version_hasattr(&mp->m_sb)) {
-			xfs_sb_version_addattr(&mp->m_sb);
-			sbfields |= XFS_SB_VERSIONNUM;
-		}
-		if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
-			xfs_sb_version_addattr2(&mp->m_sb);
-			sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
-		}
-		if (sbfields) {
-			spin_unlock(&mp->m_sb_lock);
-			xfs_mod_sb(tp, sbfields);
-		} else
-			spin_unlock(&mp->m_sb_lock);
-	}
-
-	error = xfs_bmap_finish(&tp, &flist, &committed);
-	if (error)
-		goto error2;
-	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-error2:
-	xfs_bmap_cancel(&flist);
-error1:
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-error0:
-	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-	return error;
-}
-
-/*
- * Add the extent to the list of extents to be free at transaction end.
- * The list is maintained sorted (by block number).
- */
-/* ARGSUSED */
-void
-xfs_bmap_add_free(
-	xfs_fsblock_t		bno,		/* fs block number of extent */
-	xfs_filblks_t		len,		/* length of extent */
-	xfs_bmap_free_t		*flist,		/* list of extents */
-	xfs_mount_t		*mp)		/* mount point structure */
-{
-	xfs_bmap_free_item_t	*cur;		/* current (next) element */
-	xfs_bmap_free_item_t	*new;		/* new element */
-	xfs_bmap_free_item_t	*prev;		/* previous element */
-#ifdef DEBUG
-	xfs_agnumber_t		agno;
-	xfs_agblock_t		agbno;
-
-	ASSERT(bno != NULLFSBLOCK);
-	ASSERT(len > 0);
-	ASSERT(len <= MAXEXTLEN);
-	ASSERT(!isnullstartblock(bno));
-	agno = XFS_FSB_TO_AGNO(mp, bno);
-	agbno = XFS_FSB_TO_AGBNO(mp, bno);
-	ASSERT(agno < mp->m_sb.sb_agcount);
-	ASSERT(agbno < mp->m_sb.sb_agblocks);
-	ASSERT(len < mp->m_sb.sb_agblocks);
-	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
-#endif
-	ASSERT(xfs_bmap_free_item_zone != NULL);
-	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
-	new->xbfi_startblock = bno;
-	new->xbfi_blockcount = (xfs_extlen_t)len;
-	for (prev = NULL, cur = flist->xbf_first;
-	     cur != NULL;
-	     prev = cur, cur = cur->xbfi_next) {
-		if (cur->xbfi_startblock >= bno)
-			break;
-	}
-	if (prev)
-		prev->xbfi_next = new;
-	else
-		flist->xbf_first = new;
-	new->xbfi_next = cur;
-	flist->xbf_count++;
-}
-
-/*
- * Compute and fill in the value of the maximum depth of a bmap btree
- * in this filesystem.  Done once, during mount.
- */
-void
-xfs_bmap_compute_maxlevels(
-	xfs_mount_t	*mp,		/* file system mount structure */
-	int		whichfork)	/* data or attr fork */
-{
-	int		level;		/* btree level */
-	uint		maxblocks;	/* max blocks at this level */
-	uint		maxleafents;	/* max leaf entries possible */
-	int		maxrootrecs;	/* max records in root block */
-	int		minleafrecs;	/* min records in leaf block */
-	int		minnoderecs;	/* min records in node block */
-	int		sz;		/* root block size */
-
-	/*
-	 * The maximum number of extents in a file, hence the maximum
-	 * number of leaf entries, is controlled by the type of di_nextents
-	 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
-	 * (a signed 16-bit number, xfs_aextnum_t).
-	 *
-	 * Note that we can no longer assume that if we are in ATTR1 that
-	 * the fork offset of all the inodes will be
-	 * (xfs_default_attroffset(ip) >> 3) because we could have mounted
-	 * with ATTR2 and then mounted back with ATTR1, keeping the
-	 * di_forkoff's fixed but probably at various positions. Therefore,
-	 * for both ATTR1 and ATTR2 we have to assume the worst case scenario
-	 * of a minimum size available.
-	 */
-	if (whichfork == XFS_DATA_FORK) {
-		maxleafents = MAXEXTNUM;
-		sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
-	} else {
-		maxleafents = MAXAEXTNUM;
-		sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
-	}
-	maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
-	minleafrecs = mp->m_bmap_dmnr[0];
-	minnoderecs = mp->m_bmap_dmnr[1];
-	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
-	for (level = 1; maxblocks > 1; level++) {
-		if (maxblocks <= maxrootrecs)
-			maxblocks = 1;
-		else
-			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
-	}
-	mp->m_bm_maxlevels[whichfork] = level;
-}
-
-/*
- * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
- * caller.  Frees all the extents that need freeing, which must be done
- * last due to locking considerations.  We never free any extents in
- * the first transaction.
- *
- * Return 1 if the given transaction was committed and a new one
- * started, and 0 otherwise in the committed parameter.
- */
-int						/* error */
-xfs_bmap_finish(
-	xfs_trans_t		**tp,		/* transaction pointer addr */
-	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
-	int			*committed)	/* xact committed or not */
-{
-	xfs_efd_log_item_t	*efd;		/* extent free data */
-	xfs_efi_log_item_t	*efi;		/* extent free intention */
-	int			error;		/* error return value */
-	xfs_bmap_free_item_t	*free;		/* free extent item */
-	unsigned int		logres;		/* new log reservation */
-	unsigned int		logcount;	/* new log count */
-	xfs_mount_t		*mp;		/* filesystem mount structure */
-	xfs_bmap_free_item_t	*next;		/* next item on free list */
-	xfs_trans_t		*ntp;		/* new transaction pointer */
-
-	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
-	if (flist->xbf_count == 0) {
-		*committed = 0;
-		return 0;
-	}
-	ntp = *tp;
-	efi = xfs_trans_get_efi(ntp, flist->xbf_count);
-	for (free = flist->xbf_first; free; free = free->xbfi_next)
-		xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
-			free->xbfi_blockcount);
-	logres = ntp->t_log_res;
-	logcount = ntp->t_log_count;
-	ntp = xfs_trans_dup(*tp);
-	error = xfs_trans_commit(*tp, 0);
-	*tp = ntp;
-	*committed = 1;
-	/*
-	 * We have a new transaction, so we should return committed=1,
-	 * even though we're returning an error.
-	 */
-	if (error)
-		return error;
-
-	/*
-	 * transaction commit worked ok so we can drop the extra ticket
-	 * reference that we gained in xfs_trans_dup()
-	 */
-	xfs_log_ticket_put(ntp->t_ticket);
-
-	if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
-			logcount)))
-		return error;
-	efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
-	for (free = flist->xbf_first; free != NULL; free = next) {
-		next = free->xbfi_next;
-		if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
-				free->xbfi_blockcount))) {
-			/*
-			 * The bmap free list will be cleaned up at a
-			 * higher level.  The EFI will be canceled when
-			 * this transaction is aborted.
-			 * Need to force shutdown here to make sure it
-			 * happens, since this transaction may not be
-			 * dirty yet.
-			 */
-			mp = ntp->t_mountp;
-			if (!XFS_FORCED_SHUTDOWN(mp))
-				xfs_force_shutdown(mp,
-						   (error == EFSCORRUPTED) ?
-						   SHUTDOWN_CORRUPT_INCORE :
-						   SHUTDOWN_META_IO_ERROR);
-			return error;
-		}
-		xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
-			free->xbfi_blockcount);
-		xfs_bmap_del_free(flist, NULL, free);
-	}
-	return 0;
-}
-
-/*
- * Free up any items left in the list.
- */
-void
-xfs_bmap_cancel(
-	xfs_bmap_free_t		*flist)	/* list of bmap_free_items */
-{
-	xfs_bmap_free_item_t	*free;	/* free list item */
-	xfs_bmap_free_item_t	*next;
-
-	if (flist->xbf_count == 0)
-		return;
-	ASSERT(flist->xbf_first != NULL);
-	for (free = flist->xbf_first; free; free = next) {
-		next = free->xbfi_next;
-		xfs_bmap_del_free(flist, NULL, free);
-	}
-	ASSERT(flist->xbf_count == 0);
-}
-
-/*
- * Returns the file-relative block number of the first unused block(s)
- * in the file with at least "len" logically contiguous blocks free.
- * This is the lowest-address hole if the file has holes, else the first block
- * past the end of file.
- * Return 0 if the file is currently local (in-inode).
- */
-int						/* error */
-xfs_bmap_first_unused(
-	xfs_trans_t	*tp,			/* transaction pointer */
-	xfs_inode_t	*ip,			/* incore inode */
-	xfs_extlen_t	len,			/* size of hole to find */
-	xfs_fileoff_t	*first_unused,		/* unused block */
-	int		whichfork)		/* data or attr fork */
-{
-	int		error;			/* error return value */
-	int		idx;			/* extent record index */
-	xfs_ifork_t	*ifp;			/* inode fork pointer */
-	xfs_fileoff_t	lastaddr;		/* last block number seen */
-	xfs_fileoff_t	lowest;			/* lowest useful block */
-	xfs_fileoff_t	max;			/* starting useful block */
-	xfs_fileoff_t	off;			/* offset for this block */
-	xfs_extnum_t	nextents;		/* number of extent entries */
-
-	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
-	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
-	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
-	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
-		*first_unused = 0;
-		return 0;
-	}
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
-	    (error = xfs_iread_extents(tp, ip, whichfork)))
-		return error;
-	lowest = *first_unused;
-	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
-		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
-		off = xfs_bmbt_get_startoff(ep);
-		/*
-		 * See if the hole before this extent will work.
-		 */
-		if (off >= lowest + len && off - max >= len) {
-			*first_unused = max;
-			return 0;
-		}
-		lastaddr = off + xfs_bmbt_get_blockcount(ep);
-		max = XFS_FILEOFF_MAX(lastaddr, lowest);
-	}
-	*first_unused = max;
-	return 0;
-}
-
-/*
- * Returns the file-relative block number of the last block + 1 before
- * last_block (input value) in the file.
- * This is not based on i_size, it is based on the extent records.
- * Returns 0 for local files, as they do not have extent records.
- */
-int						/* error */
-xfs_bmap_last_before(
-	xfs_trans_t	*tp,			/* transaction pointer */
-	xfs_inode_t	*ip,			/* incore inode */
-	xfs_fileoff_t	*last_block,		/* last block */
-	int		whichfork)		/* data or attr fork */
-{
-	xfs_fileoff_t	bno;			/* input file offset */
-	int		eof;			/* hit end of file */
-	xfs_bmbt_rec_host_t *ep;		/* pointer to last extent */
-	int		error;			/* error return value */
-	xfs_bmbt_irec_t	got;			/* current extent value */
-	xfs_ifork_t	*ifp;			/* inode fork pointer */
-	xfs_extnum_t	lastx;			/* last extent used */
-	xfs_bmbt_irec_t	prev;			/* previous extent value */
-
-	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
-	       return XFS_ERROR(EIO);
-	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
-		*last_block = 0;
-		return 0;
-	}
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
-	    (error = xfs_iread_extents(tp, ip, whichfork)))
-		return error;
-	bno = *last_block - 1;
-	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
-		&prev);
-	if (eof || xfs_bmbt_get_startoff(ep) > bno) {
-		if (prev.br_startoff == NULLFILEOFF)
-			*last_block = 0;
-		else
-			*last_block = prev.br_startoff + prev.br_blockcount;
-	}
-	/*
-	 * Otherwise *last_block is already the right answer.
-	 */
-	return 0;
-}
-
-STATIC int
-xfs_bmap_last_extent(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	int			whichfork,
-	struct xfs_bmbt_irec	*rec,
-	int			*is_empty)
-{
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
-	int			error;
-	int			nextents;
-
-	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-		error = xfs_iread_extents(tp, ip, whichfork);
-		if (error)
-			return error;
-	}
-
-	nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
-	if (nextents == 0) {
-		*is_empty = 1;
-		return 0;
-	}
-
-	xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec);
-	*is_empty = 0;
-	return 0;
-}
-
-/*
- * Check the last inode extent to determine whether this allocation will result
- * in blocks being allocated at the end of the file. When we allocate new data
- * blocks at the end of the file which do not start at the previous data block,
- * we will try to align the new blocks at stripe unit boundaries.
- *
- * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be
- * at, or past the EOF.
- */
-STATIC int
-xfs_bmap_isaeof(
-	struct xfs_bmalloca	*bma,
-	int			whichfork)
-{
-	struct xfs_bmbt_irec	rec;
-	int			is_empty;
-	int			error;
-
-	bma->aeof = 0;
-	error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
-				     &is_empty);
-	if (error || is_empty)
-		return error;
-
-	/*
-	 * Check if we are allocation or past the last extent, or at least into
-	 * the last delayed allocated extent.
-	 */
-	bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount ||
-		(bma->offset >= rec.br_startoff &&
-		 isnullstartblock(rec.br_startblock));
-	return 0;
-}
-
-/*
- * Check if the endoff is outside the last extent. If so the caller will grow
- * the allocation to a stripe unit boundary.  All offsets are considered outside
- * the end of file for an empty fork, so 1 is returned in *eof in that case.
- */
-int
-xfs_bmap_eof(
-	struct xfs_inode	*ip,
-	xfs_fileoff_t		endoff,
-	int			whichfork,
-	int			*eof)
-{
-	struct xfs_bmbt_irec	rec;
-	int			error;
-
-	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
-	if (error || *eof)
-		return error;
-
-	*eof = endoff >= rec.br_startoff + rec.br_blockcount;
-	return 0;
-}
-
-/*
- * Returns the file-relative block number of the first block past eof in
- * the file.  This is not based on i_size, it is based on the extent records.
- * Returns 0 for local files, as they do not have extent records.
- */
-int
-xfs_bmap_last_offset(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	xfs_fileoff_t		*last_block,
-	int			whichfork)
-{
-	struct xfs_bmbt_irec	rec;
-	int			is_empty;
-	int			error;
-
-	*last_block = 0;
-
-	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
-		return 0;
-
-	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
-	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-	       return XFS_ERROR(EIO);
-
-	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
-	if (error || is_empty)
-		return error;
-
-	*last_block = rec.br_startoff + rec.br_blockcount;
-	return 0;
-}
-
-/*
- * Returns whether the selected fork of the inode has exactly one
- * block or not.  For the data fork we check this matches di_size,
- * implying the file's range is 0..bsize-1.
- */
-int					/* 1=>1 block, 0=>otherwise */
-xfs_bmap_one_block(
-	xfs_inode_t	*ip,		/* incore inode */
-	int		whichfork)	/* data or attr fork */
-{
-	xfs_bmbt_rec_host_t *ep;	/* ptr to fork's extent */
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-	int		rval;		/* return value */
-	xfs_bmbt_irec_t	s;		/* internal version of extent */
-
-#ifndef DEBUG
-	if (whichfork == XFS_DATA_FORK)
-		return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
-#endif	/* !DEBUG */
-	if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
-		return 0;
-	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-		return 0;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
-	ep = xfs_iext_get_ext(ifp, 0);
-	xfs_bmbt_get_all(ep, &s);
-	rval = s.br_startoff == 0 && s.br_blockcount == 1;
-	if (rval && whichfork == XFS_DATA_FORK)
-		ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
-	return rval;
-}
-
-STATIC int
-xfs_bmap_sanity_check(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp,
-	int			level)
-{
-	struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
-
-	if (block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) ||
-	    be16_to_cpu(block->bb_level) != level ||
-	    be16_to_cpu(block->bb_numrecs) == 0 ||
-	    be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
-		return 0;
-	return 1;
-}
-
-/*
- * Read in the extents to if_extents.
- * All inode fields are set up by caller, we just traverse the btree
- * and copy the records in. If the file system cannot contain unwritten
- * extents, the records are checked for no "state" flags.
- */
-int					/* error */
-xfs_bmap_read_extents(
-	xfs_trans_t		*tp,	/* transaction pointer */
-	xfs_inode_t		*ip,	/* incore inode */
-	int			whichfork) /* data or attr fork */
-{
-	struct xfs_btree_block	*block;	/* current btree block */
-	xfs_fsblock_t		bno;	/* block # of "block" */
-	xfs_buf_t		*bp;	/* buffer for "block" */
-	int			error;	/* error return value */
-	xfs_exntfmt_t		exntf;	/* XFS_EXTFMT_NOSTATE, if checking */
-	xfs_extnum_t		i, j;	/* index into the extents list */
-	xfs_ifork_t		*ifp;	/* fork structure */
-	int			level;	/* btree level, for checking */
-	xfs_mount_t		*mp;	/* file system mount structure */
-	__be64			*pp;	/* pointer to block address */
-	/* REFERENCED */
-	xfs_extnum_t		room;	/* number of entries there's room for */
-
-	bno = NULLFSBLOCK;
-	mp = ip->i_mount;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
-					XFS_EXTFMT_INODE(ip);
-	block = ifp->if_broot;
-	/*
-	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
-	 */
-	level = be16_to_cpu(block->bb_level);
-	ASSERT(level > 0);
-	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
-	bno = be64_to_cpu(*pp);
-	ASSERT(bno != NULLDFSBNO);
-	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
-	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-	/*
-	 * Go down the tree until leaf level is reached, following the first
-	 * pointer (leftmost) at each level.
-	 */
-	while (level-- > 0) {
-		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
-		if (error)
-			return error;
-		block = XFS_BUF_TO_BLOCK(bp);
-		XFS_WANT_CORRUPTED_GOTO(
-			xfs_bmap_sanity_check(mp, bp, level),
-			error0);
-		if (level == 0)
-			break;
-		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
-		bno = be64_to_cpu(*pp);
-		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
-		xfs_trans_brelse(tp, bp);
-	}
-	/*
-	 * Here with bp and block set to the leftmost leaf node in the tree.
-	 */
-	room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-	i = 0;
-	/*
-	 * Loop over all leaf nodes.  Copy information to the extent records.
-	 */
-	for (;;) {
-		xfs_bmbt_rec_t	*frp;
-		xfs_fsblock_t	nextbno;
-		xfs_extnum_t	num_recs;
-		xfs_extnum_t	start;
-
-		num_recs = xfs_btree_get_numrecs(block);
-		if (unlikely(i + num_recs > room)) {
-			ASSERT(i + num_recs <= room);
-			xfs_warn(ip->i_mount,
-				"corrupt dinode %Lu, (btree extents).",
-				(unsigned long long) ip->i_ino);
-			XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
-				XFS_ERRLEVEL_LOW, ip->i_mount, block);
-			goto error0;
-		}
-		XFS_WANT_CORRUPTED_GOTO(
-			xfs_bmap_sanity_check(mp, bp, 0),
-			error0);
-		/*
-		 * Read-ahead the next leaf block, if any.
-		 */
-		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
-		if (nextbno != NULLFSBLOCK)
-			xfs_btree_reada_bufl(mp, nextbno, 1,
-					     &xfs_bmbt_buf_ops);
-		/*
-		 * Copy records into the extent records.
-		 */
-		frp = XFS_BMBT_REC_ADDR(mp, block, 1);
-		start = i;
-		for (j = 0; j < num_recs; j++, i++, frp++) {
-			xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
-			trp->l0 = be64_to_cpu(frp->l0);
-			trp->l1 = be64_to_cpu(frp->l1);
-		}
-		if (exntf == XFS_EXTFMT_NOSTATE) {
-			/*
-			 * Check all attribute bmap btree records and
-			 * any "older" data bmap btree records for a
-			 * set bit in the "extent flag" position.
-			 */
-			if (unlikely(xfs_check_nostate_extents(ifp,
-					start, num_recs))) {
-				XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
-						 XFS_ERRLEVEL_LOW,
-						 ip->i_mount);
-				goto error0;
-			}
-		}
-		xfs_trans_brelse(tp, bp);
-		bno = nextbno;
-		/*
-		 * If we've reached the end, stop.
-		 */
-		if (bno == NULLFSBLOCK)
-			break;
-		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
-		if (error)
-			return error;
-		block = XFS_BUF_TO_BLOCK(bp);
-	}
-	ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
-	ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
-	XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
-	return 0;
-error0:
-	xfs_trans_brelse(tp, bp);
-	return XFS_ERROR(EFSCORRUPTED);
-}
-
-#ifdef DEBUG
-/*
- * Add bmap trace insert entries for all the contents of the extent records.
- */
-void
-xfs_bmap_trace_exlist(
-	xfs_inode_t	*ip,		/* incore inode pointer */
-	xfs_extnum_t	cnt,		/* count of entries in the list */
-	int		whichfork,	/* data or attr fork */
-	unsigned long	caller_ip)
-{
-	xfs_extnum_t	idx;		/* extent record index */
-	xfs_ifork_t	*ifp;		/* inode fork pointer */
-	int		state = 0;
-
-	if (whichfork == XFS_ATTR_FORK)
-		state |= BMAP_ATTRFORK;
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
-	for (idx = 0; idx < cnt; idx++)
-		trace_xfs_extlist(ip, idx, whichfork, caller_ip);
-}
-
-/*
- * Validate that the bmbt_irecs being returned from bmapi are valid
- * given the callers original parameters.  Specifically check the
- * ranges of the returned irecs to ensure that they only extent beyond
- * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
- */
-STATIC void
-xfs_bmap_validate_ret(
-	xfs_fileoff_t		bno,
-	xfs_filblks_t		len,
-	int			flags,
-	xfs_bmbt_irec_t		*mval,
-	int			nmap,
-	int			ret_nmap)
-{
-	int			i;		/* index to map values */
-
-	ASSERT(ret_nmap <= nmap);
-
-	for (i = 0; i < ret_nmap; i++) {
-		ASSERT(mval[i].br_blockcount > 0);
-		if (!(flags & XFS_BMAPI_ENTIRE)) {
-			ASSERT(mval[i].br_startoff >= bno);
-			ASSERT(mval[i].br_blockcount <= len);
-			ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
-			       bno + len);
-		} else {
-			ASSERT(mval[i].br_startoff < bno + len);
-			ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
-			       bno);
-		}
-		ASSERT(i == 0 ||
-		       mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
-		       mval[i].br_startoff);
-		ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
-		       mval[i].br_startblock != HOLESTARTBLOCK);
-		ASSERT(mval[i].br_state == XFS_EXT_NORM ||
-		       mval[i].br_state == XFS_EXT_UNWRITTEN);
-	}
-}
-#endif /* DEBUG */
-
-
-/*
  * Trim the returned map to the required bounds
  */
 STATIC void
@@ -4919,8 +4934,32 @@ xfs_bmapi_write(
 	XFS_STATS_INC(xs_blk_mapw);
 
 	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		/*
+		 * XXX (dgc): This assumes we are only called for inodes that
+		 * contain content neutral data in local format. Anything that
+		 * contains caller-specific data in local format that needs
+		 * transformation to move to a block format needs to do the
+		 * conversion to extent format itself.
+		 *
+		 * Directory data forks and attribute forks handle this
+		 * themselves, but with the addition of metadata verifiers every
+		 * data fork in local format now contains caller specific data
+		 * and as such conversion through this function is likely to be
+		 * broken.
+		 *
+		 * The only likely user of this branch is for remote symlinks,
+		 * but we cannot overwrite the data fork contents of the symlink
+		 * (EEXIST occurs higher up the stack) and so it will never go
+		 * from local format to extent format here. Hence I don't think
+		 * this branch is ever executed intentionally and we should
+		 * consider removing it and asserting that xfs_bmapi_write()
+		 * cannot be called directly on local format forks. i.e. callers
+		 * are completely responsible for local to extent format
+		 * conversion, not xfs_bmapi_write().
+		 */
 		error = xfs_bmap_local_to_extents(tp, ip, firstblock, total,
-						  &bma.logflags, whichfork);
+					&bma.logflags, whichfork,
+					xfs_bmap_local_to_extents_init_fn);
 		if (error)
 			goto error0;
 	}
@@ -5083,6 +5122,328 @@ error0:
 }
 
 /*
+ * Called by xfs_bmapi to update file extent records and the btree
+ * after removing space (or undoing a delayed allocation).
+ */
+STATIC int				/* error */
+xfs_bmap_del_extent(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_trans_t		*tp,	/* current transaction pointer */
+	xfs_extnum_t		*idx,	/* extent number to update/delete */
+	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*del,	/* data to remove from extents */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork) /* data or attr fork */
+{
+	xfs_filblks_t		da_new;	/* new delay-alloc indirect blocks */
+	xfs_filblks_t		da_old;	/* old delay-alloc indirect blocks */
+	xfs_fsblock_t		del_endblock=0;	/* first block past del */
+	xfs_fileoff_t		del_endoff;	/* first offset past del */
+	int			delay;	/* current block is delayed allocated */
+	int			do_fx;	/* free extent at end of routine */
+	xfs_bmbt_rec_host_t	*ep;	/* current extent entry pointer */
+	int			error;	/* error return value */
+	int			flags;	/* inode logging flags */
+	xfs_bmbt_irec_t		got;	/* current extent entry */
+	xfs_fileoff_t		got_endoff;	/* first offset past got */
+	int			i;	/* temp state */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_mount_t		*mp;	/* mount structure */
+	xfs_filblks_t		nblks;	/* quota/sb block count */
+	xfs_bmbt_irec_t		new;	/* new record to be inserted */
+	/* REFERENCED */
+	uint			qfield;	/* quota field to update */
+	xfs_filblks_t		temp;	/* for indirect length calculations */
+	xfs_filblks_t		temp2;	/* for indirect length calculations */
+	int			state = 0;
+
+	XFS_STATS_INC(xs_del_exlist);
+
+	if (whichfork == XFS_ATTR_FORK)
+		state |= BMAP_ATTRFORK;
+
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
+		(uint)sizeof(xfs_bmbt_rec_t)));
+	ASSERT(del->br_blockcount > 0);
+	ep = xfs_iext_get_ext(ifp, *idx);
+	xfs_bmbt_get_all(ep, &got);
+	ASSERT(got.br_startoff <= del->br_startoff);
+	del_endoff = del->br_startoff + del->br_blockcount;
+	got_endoff = got.br_startoff + got.br_blockcount;
+	ASSERT(got_endoff >= del_endoff);
+	delay = isnullstartblock(got.br_startblock);
+	ASSERT(isnullstartblock(del->br_startblock) == delay);
+	flags = 0;
+	qfield = 0;
+	error = 0;
+	/*
+	 * If deleting a real allocation, must free up the disk space.
+	 */
+	if (!delay) {
+		flags = XFS_ILOG_CORE;
+		/*
+		 * Realtime allocation.  Free it and record di_nblocks update.
+		 */
+		if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+			xfs_fsblock_t	bno;
+			xfs_filblks_t	len;
+
+			ASSERT(do_mod(del->br_blockcount,
+				      mp->m_sb.sb_rextsize) == 0);
+			ASSERT(do_mod(del->br_startblock,
+				      mp->m_sb.sb_rextsize) == 0);
+			bno = del->br_startblock;
+			len = del->br_blockcount;
+			do_div(bno, mp->m_sb.sb_rextsize);
+			do_div(len, mp->m_sb.sb_rextsize);
+			error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len);
+			if (error)
+				goto done;
+			do_fx = 0;
+			nblks = len * mp->m_sb.sb_rextsize;
+			qfield = XFS_TRANS_DQ_RTBCOUNT;
+		}
+		/*
+		 * Ordinary allocation.
+		 */
+		else {
+			do_fx = 1;
+			nblks = del->br_blockcount;
+			qfield = XFS_TRANS_DQ_BCOUNT;
+		}
+		/*
+		 * Set up del_endblock and cur for later.
+		 */
+		del_endblock = del->br_startblock + del->br_blockcount;
+		if (cur) {
+			if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+					got.br_startblock, got.br_blockcount,
+					&i)))
+				goto done;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		}
+		da_old = da_new = 0;
+	} else {
+		da_old = startblockval(got.br_startblock);
+		da_new = 0;
+		nblks = 0;
+		do_fx = 0;
+	}
+	/*
+	 * Set flag value to use in switch statement.
+	 * Left-contig is 2, right-contig is 1.
+	 */
+	switch (((got.br_startoff == del->br_startoff) << 1) |
+		(got_endoff == del_endoff)) {
+	case 3:
+		/*
+		 * Matches the whole extent.  Delete the entry.
+		 */
+		xfs_iext_remove(ip, *idx, 1,
+				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
+		--*idx;
+		if (delay)
+			break;
+
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+		flags |= XFS_ILOG_CORE;
+		if (!cur) {
+			flags |= xfs_ilog_fext(whichfork);
+			break;
+		}
+		if ((error = xfs_btree_delete(cur, &i)))
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+		break;
+
+	case 2:
+		/*
+		 * Deleting the first part of the extent.
+		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_startoff(ep, del_endoff);
+		temp = got.br_blockcount - del->br_blockcount;
+		xfs_bmbt_set_blockcount(ep, temp);
+		if (delay) {
+			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+				da_old);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+			da_new = temp;
+			break;
+		}
+		xfs_bmbt_set_startblock(ep, del_endblock);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		if (!cur) {
+			flags |= xfs_ilog_fext(whichfork);
+			break;
+		}
+		if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
+				got.br_blockcount - del->br_blockcount,
+				got.br_state)))
+			goto done;
+		break;
+
+	case 1:
+		/*
+		 * Deleting the last part of the extent.
+		 */
+		temp = got.br_blockcount - del->br_blockcount;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		if (delay) {
+			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+				da_old);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+			da_new = temp;
+			break;
+		}
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		if (!cur) {
+			flags |= xfs_ilog_fext(whichfork);
+			break;
+		}
+		if ((error = xfs_bmbt_update(cur, got.br_startoff,
+				got.br_startblock,
+				got.br_blockcount - del->br_blockcount,
+				got.br_state)))
+			goto done;
+		break;
+
+	case 0:
+		/*
+		 * Deleting the middle of the extent.
+		 */
+		temp = del->br_startoff - got.br_startoff;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		new.br_startoff = del_endoff;
+		temp2 = got_endoff - del_endoff;
+		new.br_blockcount = temp2;
+		new.br_state = got.br_state;
+		if (!delay) {
+			new.br_startblock = del_endblock;
+			flags |= XFS_ILOG_CORE;
+			if (cur) {
+				if ((error = xfs_bmbt_update(cur,
+						got.br_startoff,
+						got.br_startblock, temp,
+						got.br_state)))
+					goto done;
+				if ((error = xfs_btree_increment(cur, 0, &i)))
+					goto done;
+				cur->bc_rec.b = new;
+				error = xfs_btree_insert(cur, &i);
+				if (error && error != ENOSPC)
+					goto done;
+				/*
+				 * If get no-space back from btree insert,
+				 * it tried a split, and we have a zero
+				 * block reservation.
+				 * Fix up our state and return the error.
+				 */
+				if (error == ENOSPC) {
+					/*
+					 * Reset the cursor, don't trust
+					 * it after any insert operation.
+					 */
+					if ((error = xfs_bmbt_lookup_eq(cur,
+							got.br_startoff,
+							got.br_startblock,
+							temp, &i)))
+						goto done;
+					XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+					/*
+					 * Update the btree record back
+					 * to the original value.
+					 */
+					if ((error = xfs_bmbt_update(cur,
+							got.br_startoff,
+							got.br_startblock,
+							got.br_blockcount,
+							got.br_state)))
+						goto done;
+					/*
+					 * Reset the extent record back
+					 * to the original value.
+					 */
+					xfs_bmbt_set_blockcount(ep,
+						got.br_blockcount);
+					flags = 0;
+					error = XFS_ERROR(ENOSPC);
+					goto done;
+				}
+				XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			} else
+				flags |= xfs_ilog_fext(whichfork);
+			XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+		} else {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			temp = xfs_bmap_worst_indlen(ip, temp);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			temp2 = xfs_bmap_worst_indlen(ip, temp2);
+			new.br_startblock = nullstartblock((int)temp2);
+			da_new = temp + temp2;
+			while (da_new > da_old) {
+				if (temp) {
+					temp--;
+					da_new--;
+					xfs_bmbt_set_startblock(ep,
+						nullstartblock((int)temp));
+				}
+				if (da_new == da_old)
+					break;
+				if (temp2) {
+					temp2--;
+					da_new--;
+					new.br_startblock =
+						nullstartblock((int)temp2);
+				}
+			}
+		}
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+		xfs_iext_insert(ip, *idx + 1, 1, &new, state);
+		++*idx;
+		break;
+	}
+	/*
+	 * If we need to, add to list of extents to delete.
+	 */
+	if (do_fx)
+		xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
+			mp);
+	/*
+	 * Adjust inode # blocks in the file.
+	 */
+	if (nblks)
+		ip->i_d.di_nblocks -= nblks;
+	/*
+	 * Adjust quota data.
+	 */
+	if (qfield)
+		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
+
+	/*
+	 * Account for change in delayed indirect blocks.
+	 * Nothing to do for disk quota accounting here.
+	 */
+	ASSERT(da_old >= da_new);
+	if (da_old > da_new) {
+		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
+			(int64_t)(da_old - da_new), 0);
+	}
+done:
+	*logflagsp = flags;
+	return error;
+}
+
+/*
  * Unmap (remove) blocks from a file.
  * If nexts is nonzero then the number of extents to remove is limited to
  * that value.  If not all extents in the block range can be removed then
@@ -5743,416 +6104,6 @@ xfs_getbmap(
 	return error;
 }
 
-#ifdef DEBUG
-STATIC struct xfs_buf *
-xfs_bmap_get_bp(
-	struct xfs_btree_cur	*cur,
-	xfs_fsblock_t		bno)
-{
-	struct xfs_log_item_desc *lidp;
-	int			i;
-
-	if (!cur)
-		return NULL;
-
-	for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
-		if (!cur->bc_bufs[i])
-			break;
-		if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
-			return cur->bc_bufs[i];
-	}
-
-	/* Chase down all the log items to see if the bp is there */
-	list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
-		struct xfs_buf_log_item	*bip;
-		bip = (struct xfs_buf_log_item *)lidp->lid_item;
-		if (bip->bli_item.li_type == XFS_LI_BUF &&
-		    XFS_BUF_ADDR(bip->bli_buf) == bno)
-			return bip->bli_buf;
-	}
-
-	return NULL;
-}
-
-STATIC void
-xfs_check_block(
-	struct xfs_btree_block	*block,
-	xfs_mount_t		*mp,
-	int			root,
-	short			sz)
-{
-	int			i, j, dmxr;
-	__be64			*pp, *thispa;	/* pointer to block address */
-	xfs_bmbt_key_t		*prevp, *keyp;
-
-	ASSERT(be16_to_cpu(block->bb_level) > 0);
-
-	prevp = NULL;
-	for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
-		dmxr = mp->m_bmap_dmxr[0];
-		keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
-
-		if (prevp) {
-			ASSERT(be64_to_cpu(prevp->br_startoff) <
-			       be64_to_cpu(keyp->br_startoff));
-		}
-		prevp = keyp;
-
-		/*
-		 * Compare the block numbers to see if there are dups.
-		 */
-		if (root)
-			pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
-		else
-			pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
-
-		for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
-			if (root)
-				thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
-			else
-				thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
-			if (*thispa == *pp) {
-				xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
-					__func__, j, i,
-					(unsigned long long)be64_to_cpu(*thispa));
-				panic("%s: ptrs are equal in node\n",
-					__func__);
-			}
-		}
-	}
-}
-
-/*
- * Check that the extents for the inode ip are in the right order in all
- * btree leaves.
- */
-
-STATIC void
-xfs_bmap_check_leaf_extents(
-	xfs_btree_cur_t		*cur,	/* btree cursor or null */
-	xfs_inode_t		*ip,		/* incore inode pointer */
-	int			whichfork)	/* data or attr fork */
-{
-	struct xfs_btree_block	*block;	/* current btree block */
-	xfs_fsblock_t		bno;	/* block # of "block" */
-	xfs_buf_t		*bp;	/* buffer for "block" */
-	int			error;	/* error return value */
-	xfs_extnum_t		i=0, j;	/* index into the extents list */
-	xfs_ifork_t		*ifp;	/* fork structure */
-	int			level;	/* btree level, for checking */
-	xfs_mount_t		*mp;	/* file system mount structure */
-	__be64			*pp;	/* pointer to block address */
-	xfs_bmbt_rec_t		*ep;	/* pointer to current extent */
-	xfs_bmbt_rec_t		last = {0, 0}; /* last extent in prev block */
-	xfs_bmbt_rec_t		*nextp;	/* pointer to next extent */
-	int			bp_release = 0;
-
-	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
-		return;
-	}
-
-	bno = NULLFSBLOCK;
-	mp = ip->i_mount;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	block = ifp->if_broot;
-	/*
-	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
-	 */
-	level = be16_to_cpu(block->bb_level);
-	ASSERT(level > 0);
-	xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
-	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
-	bno = be64_to_cpu(*pp);
-
-	ASSERT(bno != NULLDFSBNO);
-	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
-	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-
-	/*
-	 * Go down the tree until leaf level is reached, following the first
-	 * pointer (leftmost) at each level.
-	 */
-	while (level-- > 0) {
-		/* See if buf is in cur first */
-		bp_release = 0;
-		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
-		if (!bp) {
-			bp_release = 1;
-			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
-						XFS_BMAP_BTREE_REF,
-						&xfs_bmbt_buf_ops);
-			if (error)
-				goto error_norelse;
-		}
-		block = XFS_BUF_TO_BLOCK(bp);
-		XFS_WANT_CORRUPTED_GOTO(
-			xfs_bmap_sanity_check(mp, bp, level),
-			error0);
-		if (level == 0)
-			break;
-
-		/*
-		 * Check this block for basic sanity (increasing keys and
-		 * no duplicate blocks).
-		 */
-
-		xfs_check_block(block, mp, 0, 0);
-		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
-		bno = be64_to_cpu(*pp);
-		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
-		if (bp_release) {
-			bp_release = 0;
-			xfs_trans_brelse(NULL, bp);
-		}
-	}
-
-	/*
-	 * Here with bp and block set to the leftmost leaf node in the tree.
-	 */
-	i = 0;
-
-	/*
-	 * Loop over all leaf nodes checking that all extents are in the right order.
-	 */
-	for (;;) {
-		xfs_fsblock_t	nextbno;
-		xfs_extnum_t	num_recs;
-
-
-		num_recs = xfs_btree_get_numrecs(block);
-
-		/*
-		 * Read-ahead the next leaf block, if any.
-		 */
-
-		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
-
-		/*
-		 * Check all the extents to make sure they are OK.
-		 * If we had a previous block, the last entry should
-		 * conform with the first entry in this one.
-		 */
-
-		ep = XFS_BMBT_REC_ADDR(mp, block, 1);
-		if (i) {
-			ASSERT(xfs_bmbt_disk_get_startoff(&last) +
-			       xfs_bmbt_disk_get_blockcount(&last) <=
-			       xfs_bmbt_disk_get_startoff(ep));
-		}
-		for (j = 1; j < num_recs; j++) {
-			nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
-			ASSERT(xfs_bmbt_disk_get_startoff(ep) +
-			       xfs_bmbt_disk_get_blockcount(ep) <=
-			       xfs_bmbt_disk_get_startoff(nextp));
-			ep = nextp;
-		}
-
-		last = *ep;
-		i += num_recs;
-		if (bp_release) {
-			bp_release = 0;
-			xfs_trans_brelse(NULL, bp);
-		}
-		bno = nextbno;
-		/*
-		 * If we've reached the end, stop.
-		 */
-		if (bno == NULLFSBLOCK)
-			break;
-
-		bp_release = 0;
-		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
-		if (!bp) {
-			bp_release = 1;
-			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
-						XFS_BMAP_BTREE_REF,
-						&xfs_bmbt_buf_ops);
-			if (error)
-				goto error_norelse;
-		}
-		block = XFS_BUF_TO_BLOCK(bp);
-	}
-	if (bp_release) {
-		bp_release = 0;
-		xfs_trans_brelse(NULL, bp);
-	}
-	return;
-
-error0:
-	xfs_warn(mp, "%s: at error0", __func__);
-	if (bp_release)
-		xfs_trans_brelse(NULL, bp);
-error_norelse:
-	xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
-		__func__, i);
-	panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
-	return;
-}
-#endif
-
-/*
- * Count fsblocks of the given fork.
- */
-int						/* error */
-xfs_bmap_count_blocks(
-	xfs_trans_t		*tp,		/* transaction pointer */
-	xfs_inode_t		*ip,		/* incore inode */
-	int			whichfork,	/* data or attr fork */
-	int			*count)		/* out: count of blocks */
-{
-	struct xfs_btree_block	*block;	/* current btree block */
-	xfs_fsblock_t		bno;	/* block # of "block" */
-	xfs_ifork_t		*ifp;	/* fork structure */
-	int			level;	/* btree level, for checking */
-	xfs_mount_t		*mp;	/* file system mount structure */
-	__be64			*pp;	/* pointer to block address */
-
-	bno = NULLFSBLOCK;
-	mp = ip->i_mount;
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
-		xfs_bmap_count_leaves(ifp, 0,
-			ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
-			count);
-		return 0;
-	}
-
-	/*
-	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
-	 */
-	block = ifp->if_broot;
-	level = be16_to_cpu(block->bb_level);
-	ASSERT(level > 0);
-	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
-	bno = be64_to_cpu(*pp);
-	ASSERT(bno != NULLDFSBNO);
-	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
-	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-
-	if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
-		XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
-				 mp);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
-
-	return 0;
-}
-
-/*
- * Recursively walks each level of a btree
- * to count total fsblocks is use.
- */
-STATIC int                                     /* error */
-xfs_bmap_count_tree(
-	xfs_mount_t     *mp,            /* file system mount point */
-	xfs_trans_t     *tp,            /* transaction pointer */
-	xfs_ifork_t	*ifp,		/* inode fork pointer */
-	xfs_fsblock_t   blockno,	/* file system block number */
-	int             levelin,	/* level in btree */
-	int		*count)		/* Count of blocks */
-{
-	int			error;
-	xfs_buf_t		*bp, *nbp;
-	int			level = levelin;
-	__be64			*pp;
-	xfs_fsblock_t           bno = blockno;
-	xfs_fsblock_t		nextbno;
-	struct xfs_btree_block	*block, *nextblock;
-	int			numrecs;
-
-	error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
-						&xfs_bmbt_buf_ops);
-	if (error)
-		return error;
-	*count += 1;
-	block = XFS_BUF_TO_BLOCK(bp);
-
-	if (--level) {
-		/* Not at node above leaves, count this level of nodes */
-		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
-		while (nextbno != NULLFSBLOCK) {
-			error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
-						XFS_BMAP_BTREE_REF,
-						&xfs_bmbt_buf_ops);
-			if (error)
-				return error;
-			*count += 1;
-			nextblock = XFS_BUF_TO_BLOCK(nbp);
-			nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
-			xfs_trans_brelse(tp, nbp);
-		}
-
-		/* Dive to the next level */
-		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
-		bno = be64_to_cpu(*pp);
-		if (unlikely((error =
-		     xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
-			xfs_trans_brelse(tp, bp);
-			XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
-					 XFS_ERRLEVEL_LOW, mp);
-			return XFS_ERROR(EFSCORRUPTED);
-		}
-		xfs_trans_brelse(tp, bp);
-	} else {
-		/* count all level 1 nodes and their leaves */
-		for (;;) {
-			nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
-			numrecs = be16_to_cpu(block->bb_numrecs);
-			xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
-			xfs_trans_brelse(tp, bp);
-			if (nextbno == NULLFSBLOCK)
-				break;
-			bno = nextbno;
-			error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-						XFS_BMAP_BTREE_REF,
-						&xfs_bmbt_buf_ops);
-			if (error)
-				return error;
-			*count += 1;
-			block = XFS_BUF_TO_BLOCK(bp);
-		}
-	}
-	return 0;
-}
-
-/*
- * Count leaf blocks given a range of extent records.
- */
-STATIC void
-xfs_bmap_count_leaves(
-	xfs_ifork_t		*ifp,
-	xfs_extnum_t		idx,
-	int			numrecs,
-	int			*count)
-{
-	int		b;
-
-	for (b = 0; b < numrecs; b++) {
-		xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
-		*count += xfs_bmbt_get_blockcount(frp);
-	}
-}
-
-/*
- * Count leaf blocks given a range of extent records originally
- * in btree format.
- */
-STATIC void
-xfs_bmap_disk_count_leaves(
-	struct xfs_mount	*mp,
-	struct xfs_btree_block	*block,
-	int			numrecs,
-	int			*count)
-{
-	int		b;
-	xfs_bmbt_rec_t	*frp;
-
-	for (b = 1; b <= numrecs; b++) {
-		frp = XFS_BMBT_REC_ADDR(mp, block, b);
-		*count += xfs_bmbt_disk_get_blockcount(frp);
-	}
-}
-
 /*
  * dead simple method of punching delalyed allocation blocks from a range in
  * the inode. Walks a block at a time so will be slow, but is only executed in
@@ -6227,16 +6178,3 @@ next_block:
 
 	return error;
 }
-
-/*
- * Convert the given file system block to a disk block.  We have to treat it
- * differently based on whether the file is a real time file or not, because the
- * bmap code does.
- */
-xfs_daddr_t
-xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
-{
-	return (XFS_IS_REALTIME_INODE(ip) ? \
-		 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
-		 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
-}
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 061b45cbe61..0c61a22be6f 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -37,6 +37,7 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_trace.h"
+#include "xfs_cksum.h"
 
 /*
  * Determine the extent state.
@@ -59,24 +60,31 @@ xfs_extent_state(
  */
 void
 xfs_bmdr_to_bmbt(
-	struct xfs_mount	*mp,
+	struct xfs_inode	*ip,
 	xfs_bmdr_block_t	*dblock,
 	int			dblocklen,
 	struct xfs_btree_block	*rblock,
 	int			rblocklen)
 {
+	struct xfs_mount	*mp = ip->i_mount;
 	int			dmxr;
 	xfs_bmbt_key_t		*fkp;
 	__be64			*fpp;
 	xfs_bmbt_key_t		*tkp;
 	__be64			*tpp;
 
-	rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+	else
+		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS);
+
 	rblock->bb_level = dblock->bb_level;
 	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
 	rblock->bb_numrecs = dblock->bb_numrecs;
-	rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
-	rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
 	dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
 	fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
 	tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
@@ -424,7 +432,13 @@ xfs_bmbt_to_bmdr(
 	xfs_bmbt_key_t		*tkp;
 	__be64			*tpp;
 
-	ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
+		ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
+		ASSERT(rblock->bb_u.l.bb_blkno ==
+		       cpu_to_be64(XFS_BUF_DADDR_NULL));
+	} else
+		ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
 	ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO));
 	ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO));
 	ASSERT(rblock->bb_level != 0);
@@ -708,59 +722,89 @@ xfs_bmbt_key_diff(
 				      cur->bc_rec.b.br_startoff;
 }
 
-static void
+static int
 xfs_bmbt_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 	unsigned int		level;
-	int			lblock_ok; /* block passes checks */
 
-	/* magic number and level verification.
+	switch (block->bb_magic) {
+	case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			return false;
+		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
+			return false;
+		/*
+		 * XXX: need a better way of verifying the owner here. Right now
+		 * just make sure there has been one set.
+		 */
+		if (be64_to_cpu(block->bb_u.l.bb_owner) == 0)
+			return false;
+		/* fall through */
+	case cpu_to_be32(XFS_BMAP_MAGIC):
+		break;
+	default:
+		return false;
+	}
+
+	/*
+	 * numrecs and level verification.
 	 *
-	 * We don't know waht fork we belong to, so just verify that the level
+	 * We don't know what fork we belong to, so just verify that the level
 	 * is less than the maximum of the two. Later checks will be more
 	 * precise.
 	 */
 	level = be16_to_cpu(block->bb_level);
-	lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) &&
-		    level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]);
-
-	/* numrecs verification */
-	lblock_ok = lblock_ok &&
-		be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0];
+	if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
+		return false;
+	if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+		return false;
 
 	/* sibling pointer verification */
-	lblock_ok = lblock_ok &&
-		block->bb_u.l.bb_leftsib &&
-		(block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
-		 XFS_FSB_SANITY_CHECK(mp,
-			be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
-		block->bb_u.l.bb_rightsib &&
-		(block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
-		 XFS_FSB_SANITY_CHECK(mp,
-			be64_to_cpu(block->bb_u.l.bb_rightsib)));
-
-	if (!lblock_ok) {
-		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
+	if (!block->bb_u.l.bb_leftsib ||
+	    (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) &&
+	     !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
+		return false;
+	if (!block->bb_u.l.bb_rightsib ||
+	    (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) &&
+	     !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
+		return false;
+
+	return true;
+
 }
 
 static void
 xfs_bmbt_read_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_bmbt_verify(bp);
+	if (!(xfs_btree_lblock_verify_crc(bp) &&
+	      xfs_bmbt_verify(bp))) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+				     bp->b_target->bt_mount, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+
 }
 
 static void
 xfs_bmbt_write_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_bmbt_verify(bp);
+	if (!xfs_bmbt_verify(bp)) {
+		xfs_warn(bp->b_target->bt_mount, "bmbt daddr 0x%llx failed", bp->b_bn);
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+				     bp->b_target->bt_mount, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
+	xfs_btree_lblock_calc_crc(bp);
 }
 
 const struct xfs_buf_ops xfs_bmbt_buf_ops = {
@@ -769,7 +813,7 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = {
 };
 
 
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 STATIC int
 xfs_bmbt_keys_inorder(
 	struct xfs_btree_cur	*cur,
@@ -809,7 +853,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
 	.init_ptr_from_cur	= xfs_bmbt_init_ptr_from_cur,
 	.key_diff		= xfs_bmbt_key_diff,
 	.buf_ops		= &xfs_bmbt_buf_ops,
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 	.keys_inorder		= xfs_bmbt_keys_inorder,
 	.recs_inorder		= xfs_bmbt_recs_inorder,
 #endif
@@ -838,6 +882,8 @@ xfs_bmbt_init_cursor(
 
 	cur->bc_ops = &xfs_bmbt_ops;
 	cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
 
 	cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
 	cur->bc_private.b.ip = ip;
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 88469ca0869..1b726d62694 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -18,7 +18,8 @@
 #ifndef __XFS_BMAP_BTREE_H__
 #define __XFS_BMAP_BTREE_H__
 
-#define XFS_BMAP_MAGIC	0x424d4150	/* 'BMAP' */
+#define XFS_BMAP_MAGIC		0x424d4150	/* 'BMAP' */
+#define XFS_BMAP_CRC_MAGIC	0x424d4133	/* 'BMA3' */
 
 struct xfs_btree_cur;
 struct xfs_btree_block;
@@ -136,10 +137,10 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
 
 /*
  * Btree block header size depends on a superblock flag.
- *
- * (not quite yet, but soon)
  */
-#define XFS_BMBT_BLOCK_LEN(mp)	XFS_BTREE_LBLOCK_LEN
+#define XFS_BMBT_BLOCK_LEN(mp) \
+	(xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+		XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)
 
 #define XFS_BMBT_REC_ADDR(mp, block, index) \
 	((xfs_bmbt_rec_t *) \
@@ -186,15 +187,17 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
 #define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
 	XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
 
-#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
-	(int)(XFS_BTREE_LBLOCK_LEN + \
+#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
+	(int)(XFS_BMBT_BLOCK_LEN(mp) + \
 	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
 
-#define XFS_BMAP_BROOT_SPACE(bb) \
-	(XFS_BMAP_BROOT_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
+#define XFS_BMAP_BROOT_SPACE(mp, bb) \
+	(XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
 #define XFS_BMDR_SPACE_CALC(nrecs) \
 	(int)(sizeof(xfs_bmdr_block_t) + \
 	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
+#define XFS_BMAP_BMDR_SPACE(bb) \
+	(XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
 
 /*
  * Maximum number of bmap btree levels.
@@ -204,7 +207,7 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
 /*
  * Prototypes for xfs_bmap.c to call.
  */
-extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int,
+extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
 			struct xfs_btree_block *, int);
 extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
 extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index db010408d70..0903960410a 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -30,9 +30,11 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
+#include "xfs_buf_item.h"
 #include "xfs_btree.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
+#include "xfs_cksum.h"
 
 /*
  * Cursor allocation zone.
@@ -42,9 +44,13 @@ kmem_zone_t	*xfs_btree_cur_zone;
 /*
  * Btree magic numbers.
  */
-const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
-	XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
+static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
+	{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC },
+	{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
+	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC }
 };
+#define xfs_btree_magic(cur) \
+	xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
 
 
 STATIC int				/* error (0 or EFSCORRUPTED) */
@@ -54,30 +60,38 @@ xfs_btree_check_lblock(
 	int			level,	/* level of the btree block */
 	struct xfs_buf		*bp)	/* buffer for block, if any */
 {
-	int			lblock_ok; /* block passes checks */
+	int			lblock_ok = 1; /* block passes checks */
 	struct xfs_mount	*mp;	/* file system mount point */
 
 	mp = cur->bc_mp;
-	lblock_ok =
-		be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		lblock_ok = lblock_ok &&
+			uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
+			block->bb_u.l.bb_blkno == cpu_to_be64(
+				bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+	}
+
+	lblock_ok = lblock_ok &&
+		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
 		be16_to_cpu(block->bb_level) == level &&
 		be16_to_cpu(block->bb_numrecs) <=
 			cur->bc_ops->get_maxrecs(cur, level) &&
 		block->bb_u.l.bb_leftsib &&
 		(block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
 		 XFS_FSB_SANITY_CHECK(mp,
-		 	be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+			be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
 		block->bb_u.l.bb_rightsib &&
 		(block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
 		 XFS_FSB_SANITY_CHECK(mp,
-		 	be64_to_cpu(block->bb_u.l.bb_rightsib)));
+			be64_to_cpu(block->bb_u.l.bb_rightsib)));
+
 	if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
 			XFS_ERRTAG_BTREE_CHECK_LBLOCK,
 			XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
 		if (bp)
 			trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW,
-				 mp);
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 	return 0;
@@ -90,16 +104,26 @@ xfs_btree_check_sblock(
 	int			level,	/* level of the btree block */
 	struct xfs_buf		*bp)	/* buffer containing block */
 {
+	struct xfs_mount	*mp;	/* file system mount point */
 	struct xfs_buf		*agbp;	/* buffer for ag. freespace struct */
 	struct xfs_agf		*agf;	/* ag. freespace structure */
 	xfs_agblock_t		agflen;	/* native ag. freespace length */
-	int			sblock_ok; /* block passes checks */
+	int			sblock_ok = 1; /* block passes checks */
 
+	mp = cur->bc_mp;
 	agbp = cur->bc_private.a.agbp;
 	agf = XFS_BUF_TO_AGF(agbp);
 	agflen = be32_to_cpu(agf->agf_length);
-	sblock_ok =
-		be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		sblock_ok = sblock_ok &&
+			uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
+			block->bb_u.s.bb_blkno == cpu_to_be64(
+				bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
+	}
+
+	sblock_ok = sblock_ok &&
+		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
 		be16_to_cpu(block->bb_level) == level &&
 		be16_to_cpu(block->bb_numrecs) <=
 			cur->bc_ops->get_maxrecs(cur, level) &&
@@ -109,13 +133,13 @@ xfs_btree_check_sblock(
 		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
 		 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
 		block->bb_u.s.bb_rightsib;
-	if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
+
+	if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
 			XFS_ERRTAG_BTREE_CHECK_SBLOCK,
 			XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
 		if (bp)
 			trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
-			XFS_ERRLEVEL_LOW, cur->bc_mp, block);
+		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 	return 0;
@@ -194,6 +218,72 @@ xfs_btree_check_ptr(
 #endif
 
 /*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * long-form btree header.
+ *
+ * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modifcation was that made
+ * it to disk.
+ */
+void
+xfs_btree_lblock_calc_crc(
+	struct xfs_buf		*bp)
+{
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+		return;
+	if (bip)
+		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+			 XFS_BTREE_LBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_lblock_verify_crc(
+	struct xfs_buf		*bp)
+{
+	if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+		return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+					XFS_BTREE_LBLOCK_CRC_OFF);
+	return true;
+}
+
+/*
+ * Calculate CRC on the whole btree block and stuff it into the
+ * short-form btree header.
+ *
+ * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
+ * it into the buffer so recovery knows what the last modifcation was that made
+ * it to disk.
+ */
+void
+xfs_btree_sblock_calc_crc(
+	struct xfs_buf		*bp)
+{
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+		return;
+	if (bip)
+		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+			 XFS_BTREE_SBLOCK_CRC_OFF);
+}
+
+bool
+xfs_btree_sblock_verify_crc(
+	struct xfs_buf		*bp)
+{
+	if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+		return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+					XFS_BTREE_SBLOCK_CRC_OFF);
+	return true;
+}
+
+/*
  * Delete the btree cursor.
  */
 void
@@ -277,10 +367,8 @@ xfs_btree_dup_cursor(
 				*ncur = NULL;
 				return error;
 			}
-			new->bc_bufs[i] = bp;
-			ASSERT(!xfs_buf_geterror(bp));
-		} else
-			new->bc_bufs[i] = NULL;
+		}
+		new->bc_bufs[i] = bp;
 	}
 	*ncur = new;
 	return 0;
@@ -321,9 +409,14 @@ xfs_btree_dup_cursor(
  */
 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
 {
-	return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
-		XFS_BTREE_LBLOCK_LEN :
-		XFS_BTREE_SBLOCK_LEN;
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+			return XFS_BTREE_LBLOCK_CRC_LEN;
+		return XFS_BTREE_LBLOCK_LEN;
+	}
+	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+		return XFS_BTREE_SBLOCK_CRC_LEN;
+	return XFS_BTREE_SBLOCK_LEN;
 }
 
 /*
@@ -863,43 +956,85 @@ xfs_btree_set_sibling(
 }
 
 void
+xfs_btree_init_block_int(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*buf,
+	xfs_daddr_t		blkno,
+	__u32			magic,
+	__u16			level,
+	__u16			numrecs,
+	__u64			owner,
+	unsigned int		flags)
+{
+	buf->bb_magic = cpu_to_be32(magic);
+	buf->bb_level = cpu_to_be16(level);
+	buf->bb_numrecs = cpu_to_be16(numrecs);
+
+	if (flags & XFS_BTREE_LONG_PTRS) {
+		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+		if (flags & XFS_BTREE_CRC_BLOCKS) {
+			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
+			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
+			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
+			buf->bb_u.l.bb_pad = 0;
+		}
+	} else {
+		/* owner is a 32 bit value on short blocks */
+		__u32 __owner = (__u32)owner;
+
+		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+		if (flags & XFS_BTREE_CRC_BLOCKS) {
+			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
+			buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
+			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+		}
+	}
+}
+
+void
 xfs_btree_init_block(
 	struct xfs_mount *mp,
 	struct xfs_buf	*bp,
 	__u32		magic,
 	__u16		level,
 	__u16		numrecs,
+	__u64		owner,
 	unsigned int	flags)
 {
-	struct xfs_btree_block	*new = XFS_BUF_TO_BLOCK(bp);
-
-	new->bb_magic = cpu_to_be32(magic);
-	new->bb_level = cpu_to_be16(level);
-	new->bb_numrecs = cpu_to_be16(numrecs);
-
-	if (flags & XFS_BTREE_LONG_PTRS) {
-		new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
-		new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
-	} else {
-		new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-		new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-	}
+	xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+				 magic, level, numrecs, owner, flags);
 }
 
 STATIC void
 xfs_btree_init_block_cur(
 	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp,
 	int			level,
-	int			numrecs,
-	struct xfs_buf		*bp)
+	int			numrecs)
 {
-	xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum],
-			       level, numrecs, cur->bc_flags);
+	__u64 owner;
+
+	/*
+	 * we can pull the owner from the cursor right now as the different
+	 * owners align directly with the pointer size of the btree. This may
+	 * change in future, but is safe for current users of the generic btree
+	 * code.
+	 */
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		owner = cur->bc_private.b.ip->i_ino;
+	else
+		owner = cur->bc_private.a.agno;
+
+	xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
+				 xfs_btree_magic(cur), level, numrecs,
+				 owner, cur->bc_flags);
 }
 
 /*
  * Return true if ptr is the last record in the btree and
- * we need to track updateѕ to this record.  The decision
+ * we need to track updates to this record.  The decision
  * will be further refined in the update_lastrec method.
  */
 STATIC int
@@ -1147,6 +1282,7 @@ xfs_btree_log_keys(
 	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
 
 	if (bp) {
+		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
 		xfs_trans_log_buf(cur->bc_tp, bp,
 				  xfs_btree_key_offset(cur, first),
 				  xfs_btree_key_offset(cur, last + 1) - 1);
@@ -1171,6 +1307,7 @@ xfs_btree_log_recs(
 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
 	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
 
+	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
 	xfs_trans_log_buf(cur->bc_tp, bp,
 			  xfs_btree_rec_offset(cur, first),
 			  xfs_btree_rec_offset(cur, last + 1) - 1);
@@ -1195,6 +1332,7 @@ xfs_btree_log_ptrs(
 		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 		int			level = xfs_btree_get_level(block);
 
+		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
 		xfs_trans_log_buf(cur->bc_tp, bp,
 				xfs_btree_ptr_offset(cur, first, level),
 				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
@@ -1223,7 +1361,12 @@ xfs_btree_log_block(
 		offsetof(struct xfs_btree_block, bb_numrecs),
 		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
 		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
-		XFS_BTREE_SBLOCK_LEN
+		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
+		XFS_BTREE_SBLOCK_CRC_LEN
 	};
 	static const short	loffsets[] = {	/* table of offsets (long) */
 		offsetof(struct xfs_btree_block, bb_magic),
@@ -1231,17 +1374,40 @@ xfs_btree_log_block(
 		offsetof(struct xfs_btree_block, bb_numrecs),
 		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
 		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
-		XFS_BTREE_LBLOCK_LEN
+		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
+		XFS_BTREE_LBLOCK_CRC_LEN
 	};
 
 	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
 	XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
 
 	if (bp) {
+		int nbits;
+
+		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+			/*
+			 * We don't log the CRC when updating a btree
+			 * block but instead recreate it during log
+			 * recovery.  As the log buffers have checksums
+			 * of their own this is safe and avoids logging a crc
+			 * update in a lot of places.
+			 */
+			if (fields == XFS_BB_ALL_BITS)
+				fields = XFS_BB_ALL_BITS_CRC;
+			nbits = XFS_BB_NUM_BITS_CRC;
+		} else {
+			nbits = XFS_BB_NUM_BITS;
+		}
 		xfs_btree_offsets(fields,
 				  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
 					loffsets : soffsets,
-				  XFS_BB_NUM_BITS, &first, &last);
+				  nbits, &first, &last);
+		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
 		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
 	} else {
 		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
@@ -2204,7 +2370,7 @@ xfs_btree_split(
 		goto error0;
 
 	/* Fill in the btree header for the new right block. */
-	xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp);
+	xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
 
 	/*
 	 * Split the entries between the old and the new block evenly.
@@ -2378,7 +2544,17 @@ xfs_btree_new_iroot(
 	if (error)
 		goto error0;
 
+	/*
+	 * we can't just memcpy() the root in for CRC enabled btree blocks.
+	 * In that case have to also ensure the blkno remains correct
+	 */
 	memcpy(cblock, block, xfs_btree_block_len(cur));
+	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+		if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+			cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
+		else
+			cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
+	}
 
 	be16_add_cpu(&block->bb_level, 1);
 	xfs_btree_set_numrecs(block, 1);
@@ -2513,7 +2689,7 @@ xfs_btree_new_root(
 		nptr = 2;
 	}
 	/* Fill in the new block's btree header and log it. */
-	xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp);
+	xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
 	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
 	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
 			!xfs_btree_ptr_is_null(cur, &rptr));
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index f932897194e..55e3c7cc3c3 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -42,11 +42,15 @@ extern kmem_zone_t	*xfs_btree_cur_zone;
  * Generic btree header.
  *
  * This is a combination of the actual format used on disk for short and long
- * format btrees.  The first three fields are shared by both format, but
- * the pointers are different and should be used with care.
+ * format btrees.  The first three fields are shared by both format, but the
+ * pointers are different and should be used with care.
  *
- * To get the size of the actual short or long form headers please use
- * the size macros below.  Never use sizeof(xfs_btree_block).
+ * To get the size of the actual short or long form headers please use the size
+ * macros below.  Never use sizeof(xfs_btree_block).
+ *
+ * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
+ * with the crc feature bit, and all accesses to them must be conditional on
+ * that flag.
  */
 struct xfs_btree_block {
 	__be32		bb_magic;	/* magic number for block type */
@@ -56,10 +60,23 @@ struct xfs_btree_block {
 		struct {
 			__be32		bb_leftsib;
 			__be32		bb_rightsib;
+
+			__be64		bb_blkno;
+			__be64		bb_lsn;
+			uuid_t		bb_uuid;
+			__be32		bb_owner;
+			__le32		bb_crc;
 		} s;			/* short form pointers */
 		struct	{
 			__be64		bb_leftsib;
 			__be64		bb_rightsib;
+
+			__be64		bb_blkno;
+			__be64		bb_lsn;
+			uuid_t		bb_uuid;
+			__be64		bb_owner;
+			__le32		bb_crc;
+			__be32		bb_pad; /* padding for alignment */
 		} l;			/* long form pointers */
 	} bb_u;				/* rest */
 };
@@ -67,6 +84,16 @@ struct xfs_btree_block {
 #define XFS_BTREE_SBLOCK_LEN	16	/* size of a short form block */
 #define XFS_BTREE_LBLOCK_LEN	24	/* size of a long form block */
 
+/* sizes of CRC enabled btree blocks */
+#define XFS_BTREE_SBLOCK_CRC_LEN	(XFS_BTREE_SBLOCK_LEN + 40)
+#define XFS_BTREE_LBLOCK_CRC_LEN	(XFS_BTREE_LBLOCK_LEN + 48)
+
+
+#define XFS_BTREE_SBLOCK_CRC_OFF \
+	offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
+#define XFS_BTREE_LBLOCK_CRC_OFF \
+	offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
+
 
 /*
  * Generic key, ptr and record wrapper structures.
@@ -101,13 +128,11 @@ union xfs_btree_rec {
 #define	XFS_BB_NUMRECS		0x04
 #define	XFS_BB_LEFTSIB		0x08
 #define	XFS_BB_RIGHTSIB		0x10
+#define	XFS_BB_BLKNO		0x20
 #define	XFS_BB_NUM_BITS		5
 #define	XFS_BB_ALL_BITS		((1 << XFS_BB_NUM_BITS) - 1)
-
-/*
- * Magic numbers for btree blocks.
- */
-extern const __uint32_t	xfs_magics[];
+#define	XFS_BB_NUM_BITS_CRC	8
+#define	XFS_BB_ALL_BITS_CRC	((1 << XFS_BB_NUM_BITS_CRC) - 1)
 
 /*
  * Generic stats interface
@@ -190,7 +215,7 @@ struct xfs_btree_ops {
 
 	const struct xfs_buf_ops	*buf_ops;
 
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 	/* check that k1 is lower than k2 */
 	int	(*keys_inorder)(struct xfs_btree_cur *cur,
 				union xfs_btree_key *k1,
@@ -256,6 +281,7 @@ typedef struct xfs_btree_cur
 #define XFS_BTREE_LONG_PTRS		(1<<0)	/* pointers are 64bits long */
 #define XFS_BTREE_ROOT_IN_INODE		(1<<1)	/* root may be variable size */
 #define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
+#define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
 
 
 #define	XFS_BTREE_NOERROR	0
@@ -393,8 +419,20 @@ xfs_btree_init_block(
 	__u32		magic,
 	__u16		level,
 	__u16		numrecs,
+	__u64		owner,
 	unsigned int	flags);
 
+void
+xfs_btree_init_block_int(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*buf,
+	xfs_daddr_t		blkno,
+	__u32			magic,
+	__u16			level,
+	__u16			numrecs,
+	__u64			owner,
+	unsigned int		flags);
+
 /*
  * Common btree core entry points.
  */
@@ -408,6 +446,14 @@ int xfs_btree_delete(struct xfs_btree_cur *, int *);
 int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
 
 /*
+ * btree block CRC helpers
+ */
+void xfs_btree_lblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
+void xfs_btree_sblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
+
+/*
  * Internal btree helpers also used by xfs_bmap.c.
  */
 void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index fbbb9eb92e3..1b2472a46e4 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -513,6 +513,7 @@ _xfs_buf_find(
 		xfs_alert(btp->bt_mount,
 			  "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
 			  __func__, blkno, eofs);
+		WARN_ON(1);
 		return NULL;
 	}
 
@@ -951,8 +952,6 @@ xfs_buf_trylock(
 	locked = down_trylock(&bp->b_sema) == 0;
 	if (locked)
 		XB_SET_OWNER(bp);
-	else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
-		xfs_log_force(bp->b_target->bt_mount, 0);
 
 	trace_xfs_buf_trylock(bp, _RET_IP_);
 	return locked;
@@ -1024,7 +1023,9 @@ xfs_buf_iodone_work(
 	bool			read = !!(bp->b_flags & XBF_READ);
 
 	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
-	if (read && bp->b_ops)
+
+	/* only validate buffers that were read without errors */
+	if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
 		bp->b_ops->verify_read(bp);
 
 	if (bp->b_iodone)
@@ -1336,6 +1337,12 @@ _xfs_buf_ioapply(
 	int		size;
 	int		i;
 
+	/*
+	 * Make sure we capture only current IO errors rather than stale errors
+	 * left over from previous use of the buffer (e.g. failed readahead).
+	 */
+	bp->b_error = 0;
+
 	if (bp->b_flags & XBF_WRITE) {
 		if (bp->b_flags & XBF_SYNCIO)
 			rw = WRITE_SYNC;
@@ -1643,7 +1650,7 @@ xfs_alloc_buftarg(
 {
 	xfs_buftarg_t		*btp;
 
-	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
+	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
 
 	btp->bt_mount = mp;
 	btp->bt_dev =  bdev->bd_dev;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 3f9949fee39..bfc4e0c26fd 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -37,109 +37,6 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
 	return container_of(lip, struct xfs_buf_log_item, bli_item);
 }
 
-
-#ifdef XFS_TRANS_DEBUG
-/*
- * This function uses an alternate strategy for tracking the bytes
- * that the user requests to be logged.  This can then be used
- * in conjunction with the bli_orig array in the buf log item to
- * catch bugs in our callers' code.
- *
- * We also double check the bits set in xfs_buf_item_log using a
- * simple algorithm to check that every byte is accounted for.
- */
-STATIC void
-xfs_buf_item_log_debug(
-	xfs_buf_log_item_t	*bip,
-	uint			first,
-	uint			last)
-{
-	uint	x;
-	uint	byte;
-	uint	nbytes;
-	uint	chunk_num;
-	uint	word_num;
-	uint	bit_num;
-	uint	bit_set;
-	uint	*wordp;
-
-	ASSERT(bip->bli_logged != NULL);
-	byte = first;
-	nbytes = last - first + 1;
-	bfset(bip->bli_logged, first, nbytes);
-	for (x = 0; x < nbytes; x++) {
-		chunk_num = byte >> XFS_BLF_SHIFT;
-		word_num = chunk_num >> BIT_TO_WORD_SHIFT;
-		bit_num = chunk_num & (NBWORD - 1);
-		wordp = &(bip->__bli_format.blf_data_map[word_num]);
-		bit_set = *wordp & (1 << bit_num);
-		ASSERT(bit_set);
-		byte++;
-	}
-}
-
-/*
- * This function is called when we flush something into a buffer without
- * logging it.  This happens for things like inodes which are logged
- * separately from the buffer.
- */
-void
-xfs_buf_item_flush_log_debug(
-	xfs_buf_t	*bp,
-	uint		first,
-	uint		last)
-{
-	xfs_buf_log_item_t	*bip = bp->b_fspriv;
-	uint			nbytes;
-
-	if (bip == NULL || (bip->bli_item.li_type != XFS_LI_BUF))
-		return;
-
-	ASSERT(bip->bli_logged != NULL);
-	nbytes = last - first + 1;
-	bfset(bip->bli_logged, first, nbytes);
-}
-
-/*
- * This function is called to verify that our callers have logged
- * all the bytes that they changed.
- *
- * It does this by comparing the original copy of the buffer stored in
- * the buf log item's bli_orig array to the current copy of the buffer
- * and ensuring that all bytes which mismatch are set in the bli_logged
- * array of the buf log item.
- */
-STATIC void
-xfs_buf_item_log_check(
-	xfs_buf_log_item_t	*bip)
-{
-	char		*orig;
-	char		*buffer;
-	int		x;
-	xfs_buf_t	*bp;
-
-	ASSERT(bip->bli_orig != NULL);
-	ASSERT(bip->bli_logged != NULL);
-
-	bp = bip->bli_buf;
-	ASSERT(bp->b_length > 0);
-	ASSERT(bp->b_addr != NULL);
-	orig = bip->bli_orig;
-	buffer = bp->b_addr;
-	for (x = 0; x < BBTOB(bp->b_length); x++) {
-		if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
-			xfs_emerg(bp->b_mount,
-				"%s: bip %x buffer %x orig %x index %d",
-				__func__, bip, bp, orig, x);
-			ASSERT(0);
-		}
-	}
-}
-#else
-#define		xfs_buf_item_log_debug(x,y,z)
-#define		xfs_buf_item_log_check(x)
-#endif
-
 STATIC void	xfs_buf_do_callbacks(struct xfs_buf *bp);
 
 /*
@@ -243,6 +140,16 @@ xfs_buf_item_size(
 
 	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
 
+	if (bip->bli_flags & XFS_BLI_ORDERED) {
+		/*
+		 * The buffer has been logged just to order it.
+		 * It is not being included in the transaction
+		 * commit, so no vectors are used at all.
+		 */
+		trace_xfs_buf_item_size_ordered(bip);
+		return XFS_LOG_VEC_ORDERED;
+	}
+
 	/*
 	 * the vector count is based on the number of buffer vectors we have
 	 * dirty bits in. This will only be greater than one when we have a
@@ -315,6 +222,7 @@ xfs_buf_item_format_segment(
 		goto out;
 	}
 
+
 	/*
 	 * Fill in an iovec for each set of contiguous chunks.
 	 */
@@ -365,12 +273,7 @@ xfs_buf_item_format_segment(
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 			vecp->i_len = nbits * XFS_BLF_CHUNK;
 			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
-/*
- * You would think we need to bump the nvecs here too, but we do not
- * this number is used by recovery, and it gets confused by the boundary
- * split here
- *			nvecs++;
- */
+			nvecs++;
 			vecp++;
 			first_bit = next_bit;
 			last_bit = next_bit;
@@ -407,18 +310,36 @@ xfs_buf_item_format(
 
 	/*
 	 * If it is an inode buffer, transfer the in-memory state to the
-	 * format flags and clear the in-memory state. We do not transfer
+	 * format flags and clear the in-memory state.
+	 *
+	 * For buffer based inode allocation, we do not transfer
 	 * this state if the inode buffer allocation has not yet been committed
 	 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
 	 * correct replay of the inode allocation.
+	 *
+	 * For icreate item based inode allocation, the buffers aren't written
+	 * to the journal during allocation, and hence we should always tag the
+	 * buffer as an inode buffer so that the correct unlinked list replay
+	 * occurs during recovery.
 	 */
 	if (bip->bli_flags & XFS_BLI_INODE_BUF) {
-		if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+		if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
+		    !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
 		      xfs_log_item_in_current_chkpt(lip)))
 			bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
 		bip->bli_flags &= ~XFS_BLI_INODE_BUF;
 	}
 
+	if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
+							XFS_BLI_ORDERED) {
+		/*
+		 * The buffer has been logged just to order it.  It is not being
+		 * included in the transaction commit, so don't format it.
+		 */
+		trace_xfs_buf_item_format_ordered(bip);
+		return;
+	}
+
 	for (i = 0; i < bip->bli_format_count; i++) {
 		vecp = xfs_buf_item_format_segment(bip, vecp, offset,
 						&bip->bli_formats[i]);
@@ -429,7 +350,6 @@ xfs_buf_item_format(
 	 * Check to make sure everything is consistent.
 	 */
 	trace_xfs_buf_item_format(bip);
-	xfs_buf_item_log_check(bip);
 }
 
 /*
@@ -449,6 +369,7 @@ xfs_buf_item_pin(
 
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+	       (bip->bli_flags & XFS_BLI_ORDERED) ||
 	       (bip->bli_flags & XFS_BLI_STALE));
 
 	trace_xfs_buf_item_pin(bip);
@@ -573,8 +494,18 @@ xfs_buf_item_push(
 
 	if (xfs_buf_ispinned(bp))
 		return XFS_ITEM_PINNED;
-	if (!xfs_buf_trylock(bp))
+	if (!xfs_buf_trylock(bp)) {
+		/*
+		 * If we have just raced with a buffer being pinned and it has
+		 * been marked stale, we could end up stalling until someone else
+		 * issues a log force to unpin the stale buffer. Check for the
+		 * race condition here so xfsaild recognizes the buffer is pinned
+		 * and queues a log force to move it along.
+		 */
+		if (xfs_buf_ispinned(bp))
+			return XFS_ITEM_PINNED;
 		return XFS_ITEM_LOCKED;
+	}
 
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
 
@@ -611,8 +542,9 @@ xfs_buf_item_unlock(
 {
 	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
 	struct xfs_buf		*bp = bip->bli_buf;
-	int			aborted, clean, i;
-	uint			hold;
+	bool			clean;
+	bool			aborted;
+	int			flags;
 
 	/* Clear the buffer's association with this transaction. */
 	bp->b_transp = NULL;
@@ -623,23 +555,21 @@ xfs_buf_item_unlock(
 	 * (cancelled) buffers at unpin time, but we'll never go through the
 	 * pin/unpin cycle if we abort inside commit.
 	 */
-	aborted = (lip->li_flags & XFS_LI_ABORTED) != 0;
-
+	aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
 	/*
-	 * Before possibly freeing the buf item, determine if we should
-	 * release the buffer at the end of this routine.
+	 * Before possibly freeing the buf item, copy the per-transaction state
+	 * so we can reference it safely later after clearing it from the
+	 * buffer log item.
 	 */
-	hold = bip->bli_flags & XFS_BLI_HOLD;
-
-	/* Clear the per transaction state. */
-	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
+	flags = bip->bli_flags;
+	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
 
 	/*
 	 * If the buf item is marked stale, then don't do anything.  We'll
 	 * unlock the buffer and free the buf item when the buffer is unpinned
 	 * for the last time.
 	 */
-	if (bip->bli_flags & XFS_BLI_STALE) {
+	if (flags & XFS_BLI_STALE) {
 		trace_xfs_buf_item_unlock_stale(bip);
 		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
 		if (!aborted) {
@@ -656,13 +586,19 @@ xfs_buf_item_unlock(
 	 * be the only reference to the buf item, so we free it anyway
 	 * regardless of whether it is dirty or not. A dirty abort implies a
 	 * shutdown, anyway.
+	 *
+	 * Ordered buffers are dirty but may have no recorded changes, so ensure
+	 * we only release clean items here.
 	 */
-	clean = 1;
-	for (i = 0; i < bip->bli_format_count; i++) {
-		if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
-			     bip->bli_formats[i].blf_map_size)) {
-			clean = 0;
-			break;
+	clean = (flags & XFS_BLI_DIRTY) ? false : true;
+	if (clean) {
+		int i;
+		for (i = 0; i < bip->bli_format_count; i++) {
+			if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+				     bip->bli_formats[i].blf_map_size)) {
+				clean = false;
+				break;
+			}
 		}
 	}
 	if (clean)
@@ -675,7 +611,7 @@ xfs_buf_item_unlock(
 	} else
 		atomic_dec(&bip->bli_refcount);
 
-	if (!hold)
+	if (!(flags & XFS_BLI_HOLD))
 		xfs_buf_relse(bp);
 }
 
@@ -923,8 +859,6 @@ xfs_buf_item_log_segment(
 		mask = (1 << end_bit) - 1;
 		*wordp |= mask;
 	}
-
-	xfs_buf_item_log_debug(bip, first, last);
 }
 
 /*
@@ -943,12 +877,6 @@ xfs_buf_item_log(
 	struct xfs_buf		*bp = bip->bli_buf;
 
 	/*
-	 * Mark the item as having some dirty data for
-	 * quick reference in xfs_buf_item_dirty.
-	 */
-	bip->bli_flags |= XFS_BLI_DIRTY;
-
-	/*
 	 * walk each buffer segment and mark them dirty appropriately.
 	 */
 	start = 0;
@@ -974,7 +902,7 @@ xfs_buf_item_log(
 
 
 /*
- * Return 1 if the buffer has some data that has been logged (at any
+ * Return 1 if the buffer has been logged or ordered in a transaction (at any
  * point, not just the current transaction) and 0 if not.
  */
 uint
@@ -1008,11 +936,11 @@ void
 xfs_buf_item_relse(
 	xfs_buf_t	*bp)
 {
-	xfs_buf_log_item_t	*bip;
+	xfs_buf_log_item_t	*bip = bp->b_fspriv;
 
 	trace_xfs_buf_item_relse(bp, _RET_IP_);
+	ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
 
-	bip = bp->b_fspriv;
 	bp->b_fspriv = bip->bli_item.li_bio_list;
 	if (bp->b_fspriv == NULL)
 		bp->b_iodone = NULL;
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 16def435944..0f1c247dc68 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -24,19 +24,20 @@ extern kmem_zone_t	*xfs_buf_item_zone;
  * This flag indicates that the buffer contains on disk inodes
  * and requires special recovery handling.
  */
-#define	XFS_BLF_INODE_BUF	0x1
+#define	XFS_BLF_INODE_BUF	(1<<0)
 /*
  * This flag indicates that the buffer should not be replayed
  * during recovery because its blocks are being freed.
  */
-#define	XFS_BLF_CANCEL		0x2
+#define	XFS_BLF_CANCEL		(1<<1)
+
 /*
  * This flag indicates that the buffer contains on disk
  * user or group dquots and may require special recovery handling.
  */
-#define	XFS_BLF_UDQUOT_BUF	0x4
-#define XFS_BLF_PDQUOT_BUF	0x8
-#define	XFS_BLF_GDQUOT_BUF	0x10
+#define	XFS_BLF_UDQUOT_BUF	(1<<2)
+#define XFS_BLF_PDQUOT_BUF	(1<<3)
+#define	XFS_BLF_GDQUOT_BUF	(1<<4)
 
 #define	XFS_BLF_CHUNK		128
 #define	XFS_BLF_SHIFT		7
@@ -61,6 +62,55 @@ typedef struct xfs_buf_log_format {
 } xfs_buf_log_format_t;
 
 /*
+ * All buffers now need to tell recovery where the magic number
+ * is so that it can verify and calculate the CRCs on the buffer correctly
+ * once the changes have been replayed into the buffer.
+ *
+ * The type value is held in the upper 5 bits of the blf_flags field, which is
+ * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
+ */
+#define XFS_BLFT_BITS	5
+#define XFS_BLFT_SHIFT	11
+#define XFS_BLFT_MASK	(((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
+
+enum xfs_blft {
+	XFS_BLFT_UNKNOWN_BUF = 0,
+	XFS_BLFT_UDQUOT_BUF,
+	XFS_BLFT_PDQUOT_BUF,
+	XFS_BLFT_GDQUOT_BUF,
+	XFS_BLFT_BTREE_BUF,
+	XFS_BLFT_AGF_BUF,
+	XFS_BLFT_AGFL_BUF,
+	XFS_BLFT_AGI_BUF,
+	XFS_BLFT_DINO_BUF,
+	XFS_BLFT_SYMLINK_BUF,
+	XFS_BLFT_DIR_BLOCK_BUF,
+	XFS_BLFT_DIR_DATA_BUF,
+	XFS_BLFT_DIR_FREE_BUF,
+	XFS_BLFT_DIR_LEAF1_BUF,
+	XFS_BLFT_DIR_LEAFN_BUF,
+	XFS_BLFT_DA_NODE_BUF,
+	XFS_BLFT_ATTR_LEAF_BUF,
+	XFS_BLFT_ATTR_RMT_BUF,
+	XFS_BLFT_SB_BUF,
+	XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
+};
+
+static inline void
+xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
+{
+	ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
+	blf->blf_flags &= ~XFS_BLFT_MASK;
+	blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
+}
+
+static inline __uint16_t
+xfs_blft_from_flags(struct xfs_buf_log_format *blf)
+{
+	return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
+}
+
+/*
  * buf log item flags
  */
 #define	XFS_BLI_HOLD		0x01
@@ -70,6 +120,7 @@ typedef struct xfs_buf_log_format {
 #define	XFS_BLI_INODE_ALLOC_BUF	0x10
 #define XFS_BLI_STALE_INODE	0x20
 #define	XFS_BLI_INODE_BUF	0x40
+#define	XFS_BLI_ORDERED		0x80
 
 #define XFS_BLI_FLAGS \
 	{ XFS_BLI_HOLD,		"HOLD" }, \
@@ -78,7 +129,8 @@ typedef struct xfs_buf_log_format {
 	{ XFS_BLI_LOGGED,	"LOGGED" }, \
 	{ XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
 	{ XFS_BLI_STALE_INODE,	"STALE_INODE" }, \
-	{ XFS_BLI_INODE_BUF,	"INODE_BUF" }
+	{ XFS_BLI_INODE_BUF,	"INODE_BUF" }, \
+	{ XFS_BLI_ORDERED,	"ORDERED" }
 
 
 #ifdef __KERNEL__
@@ -98,10 +150,6 @@ typedef struct xfs_buf_log_item {
 	unsigned int		bli_flags;	/* misc flags */
 	unsigned int		bli_recur;	/* lock recursion count */
 	atomic_t		bli_refcount;	/* cnt of tp refs */
-#ifdef XFS_TRANS_DEBUG
-	char			*bli_orig;	/* original buffer copy */
-	char			*bli_logged;	/* bytes logged (bitmap) */
-#endif
 	int			bli_format_count;	/* count of headers */
 	struct xfs_buf_log_format *bli_formats;	/* array of in-log header ptrs */
 	struct xfs_buf_log_format __bli_format;	/* embedded in-log header */
@@ -117,15 +165,9 @@ void	xfs_buf_attach_iodone(struct xfs_buf *,
 void	xfs_buf_iodone_callbacks(struct xfs_buf *);
 void	xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
 
-#ifdef XFS_TRANS_DEBUG
-void
-xfs_buf_item_flush_log_debug(
-	struct xfs_buf *bp,
-	uint	first,
-	uint	last);
-#else
-#define	xfs_buf_item_flush_log_debug(bp, first, last)
-#endif
+void	xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
+			       enum xfs_blft);
+void	xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp);
 
 #endif	/* __KERNEL__ */
 
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 4d7696a0241..0b8b2a13cd2 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -38,6 +39,8 @@
 #include "xfs_attr_leaf.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
 
 /*
  * xfs_da_btree.c
@@ -52,69 +55,195 @@
 /*
  * Routines used for growing the Btree.
  */
-STATIC int xfs_da_root_split(xfs_da_state_t *state,
+STATIC int xfs_da3_root_split(xfs_da_state_t *state,
 					    xfs_da_state_blk_t *existing_root,
 					    xfs_da_state_blk_t *new_child);
-STATIC int xfs_da_node_split(xfs_da_state_t *state,
+STATIC int xfs_da3_node_split(xfs_da_state_t *state,
 					    xfs_da_state_blk_t *existing_blk,
 					    xfs_da_state_blk_t *split_blk,
 					    xfs_da_state_blk_t *blk_to_add,
 					    int treelevel,
 					    int *result);
-STATIC void xfs_da_node_rebalance(xfs_da_state_t *state,
+STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
 					 xfs_da_state_blk_t *node_blk_1,
 					 xfs_da_state_blk_t *node_blk_2);
-STATIC void xfs_da_node_add(xfs_da_state_t *state,
+STATIC void xfs_da3_node_add(xfs_da_state_t *state,
 				   xfs_da_state_blk_t *old_node_blk,
 				   xfs_da_state_blk_t *new_node_blk);
 
 /*
  * Routines used for shrinking the Btree.
  */
-STATIC int xfs_da_root_join(xfs_da_state_t *state,
+STATIC int xfs_da3_root_join(xfs_da_state_t *state,
 					   xfs_da_state_blk_t *root_blk);
-STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval);
-STATIC void xfs_da_node_remove(xfs_da_state_t *state,
+STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
+STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
 					      xfs_da_state_blk_t *drop_blk);
-STATIC void xfs_da_node_unbalance(xfs_da_state_t *state,
+STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
 					 xfs_da_state_blk_t *src_node_blk,
 					 xfs_da_state_blk_t *dst_node_blk);
 
 /*
  * Utility routines.
  */
-STATIC uint	xfs_da_node_lasthash(struct xfs_buf *bp, int *count);
-STATIC int	xfs_da_node_order(struct xfs_buf *node1_bp,
-				  struct xfs_buf *node2_bp);
-STATIC int	xfs_da_blk_unlink(xfs_da_state_t *state,
+STATIC int	xfs_da3_blk_unlink(xfs_da_state_t *state,
 				  xfs_da_state_blk_t *drop_blk,
 				  xfs_da_state_blk_t *save_blk);
-STATIC void	xfs_da_state_kill_altpath(xfs_da_state_t *state);
 
-static void
-xfs_da_node_verify(
+
+kmem_zone_t *xfs_da_state_zone;	/* anchor for state struct zone */
+
+/*
+ * Allocate a dir-state structure.
+ * We don't put them on the stack since they're large.
+ */
+xfs_da_state_t *
+xfs_da_state_alloc(void)
+{
+	return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
+}
+
+/*
+ * Kill the altpath contents of a da-state structure.
+ */
+STATIC void
+xfs_da_state_kill_altpath(xfs_da_state_t *state)
+{
+	int	i;
+
+	for (i = 0; i < state->altpath.active; i++)
+		state->altpath.blk[i].bp = NULL;
+	state->altpath.active = 0;
+}
+
+/*
+ * Free a da-state structure.
+ */
+void
+xfs_da_state_free(xfs_da_state_t *state)
+{
+	xfs_da_state_kill_altpath(state);
+#ifdef DEBUG
+	memset((char *)state, 0, sizeof(*state));
+#endif /* DEBUG */
+	kmem_zone_free(xfs_da_state_zone, state);
+}
+
+void
+xfs_da3_node_hdr_from_disk(
+	struct xfs_da3_icnode_hdr	*to,
+	struct xfs_da_intnode		*from)
+{
+	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+	       from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+
+	if (from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+		struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
+
+		to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+		to->back = be32_to_cpu(hdr3->info.hdr.back);
+		to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+		to->count = be16_to_cpu(hdr3->__count);
+		to->level = be16_to_cpu(hdr3->__level);
+		return;
+	}
+	to->forw = be32_to_cpu(from->hdr.info.forw);
+	to->back = be32_to_cpu(from->hdr.info.back);
+	to->magic = be16_to_cpu(from->hdr.info.magic);
+	to->count = be16_to_cpu(from->hdr.__count);
+	to->level = be16_to_cpu(from->hdr.__level);
+}
+
+void
+xfs_da3_node_hdr_to_disk(
+	struct xfs_da_intnode		*to,
+	struct xfs_da3_icnode_hdr	*from)
+{
+	ASSERT(from->magic == XFS_DA_NODE_MAGIC ||
+	       from->magic == XFS_DA3_NODE_MAGIC);
+
+	if (from->magic == XFS_DA3_NODE_MAGIC) {
+		struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
+
+		hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+		hdr3->info.hdr.back = cpu_to_be32(from->back);
+		hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+		hdr3->__count = cpu_to_be16(from->count);
+		hdr3->__level = cpu_to_be16(from->level);
+		return;
+	}
+	to->hdr.info.forw = cpu_to_be32(from->forw);
+	to->hdr.info.back = cpu_to_be32(from->back);
+	to->hdr.info.magic = cpu_to_be16(from->magic);
+	to->hdr.__count = cpu_to_be16(from->count);
+	to->hdr.__level = cpu_to_be16(from->level);
+}
+
+static bool
+xfs_da3_node_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_da_node_hdr *hdr = bp->b_addr;
-	int			block_ok = 0;
-
-	block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC);
-	block_ok = block_ok &&
-			be16_to_cpu(hdr->level) > 0 &&
-			be16_to_cpu(hdr->count) > 0 ;
-	if (!block_ok) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	struct xfs_da_intnode	*hdr = bp->b_addr;
+	struct xfs_da3_icnode_hdr ichdr;
+
+	xfs_da3_node_hdr_from_disk(&ichdr, hdr);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+		if (ichdr.magic != XFS_DA3_NODE_MAGIC)
+			return false;
+
+		if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (ichdr.magic != XFS_DA_NODE_MAGIC)
+			return false;
 	}
+	if (ichdr.level == 0)
+		return false;
+	if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
+		return false;
+	if (ichdr.count == 0)
+		return false;
 
+	/*
+	 * we don't know if the node is for and attribute or directory tree,
+	 * so only fail if the count is outside both bounds
+	 */
+	if (ichdr.count > mp->m_dir_node_ents &&
+	    ichdr.count > mp->m_attr_node_ents)
+		return false;
+
+	/* XXX: hash order check? */
+
+	return true;
 }
 
 static void
-xfs_da_node_write_verify(
+xfs_da3_node_write_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_da_node_verify(bp);
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+	if (!xfs_da3_node_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DA3_NODE_CRC_OFF);
 }
 
 /*
@@ -124,40 +253,48 @@ xfs_da_node_write_verify(
  * format of the block being read.
  */
 static void
-xfs_da_node_read_verify(
+xfs_da3_node_read_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 	struct xfs_da_blkinfo	*info = bp->b_addr;
 
 	switch (be16_to_cpu(info->magic)) {
+		case XFS_DA3_NODE_MAGIC:
+			if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+					      XFS_DA3_NODE_CRC_OFF))
+				break;
+			/* fall through */
 		case XFS_DA_NODE_MAGIC:
-			xfs_da_node_verify(bp);
-			break;
+			if (!xfs_da3_node_verify(bp))
+				break;
+			return;
 		case XFS_ATTR_LEAF_MAGIC:
-			bp->b_ops = &xfs_attr_leaf_buf_ops;
+		case XFS_ATTR3_LEAF_MAGIC:
+			bp->b_ops = &xfs_attr3_leaf_buf_ops;
 			bp->b_ops->verify_read(bp);
 			return;
 		case XFS_DIR2_LEAFN_MAGIC:
-			bp->b_ops = &xfs_dir2_leafn_buf_ops;
+		case XFS_DIR3_LEAFN_MAGIC:
+			bp->b_ops = &xfs_dir3_leafn_buf_ops;
 			bp->b_ops->verify_read(bp);
 			return;
 		default:
-			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
-					     mp, info);
-			xfs_buf_ioerror(bp, EFSCORRUPTED);
 			break;
 	}
+
+	/* corrupt block */
+	XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+	xfs_buf_ioerror(bp, EFSCORRUPTED);
 }
 
-const struct xfs_buf_ops xfs_da_node_buf_ops = {
-	.verify_read = xfs_da_node_read_verify,
-	.verify_write = xfs_da_node_write_verify,
+const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+	.verify_read = xfs_da3_node_read_verify,
+	.verify_write = xfs_da3_node_write_verify,
 };
 
-
 int
-xfs_da_node_read(
+xfs_da3_node_read(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*dp,
 	xfs_dablk_t		bno,
@@ -165,8 +302,35 @@ xfs_da_node_read(
 	struct xfs_buf		**bpp,
 	int			which_fork)
 {
-	return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
-					which_fork, &xfs_da_node_buf_ops);
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+					which_fork, &xfs_da3_node_buf_ops);
+	if (!err && tp) {
+		struct xfs_da_blkinfo	*info = (*bpp)->b_addr;
+		int			type;
+
+		switch (be16_to_cpu(info->magic)) {
+		case XFS_DA_NODE_MAGIC:
+		case XFS_DA3_NODE_MAGIC:
+			type = XFS_BLFT_DA_NODE_BUF;
+			break;
+		case XFS_ATTR_LEAF_MAGIC:
+		case XFS_ATTR3_LEAF_MAGIC:
+			type = XFS_BLFT_ATTR_LEAF_BUF;
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+		case XFS_DIR3_LEAFN_MAGIC:
+			type = XFS_BLFT_DIR_LEAFN_BUF;
+			break;
+		default:
+			type = 0;
+			ASSERT(0);
+			break;
+		}
+		xfs_trans_buf_set_type(tp, *bpp, type);
+	}
+	return err;
 }
 
 /*========================================================================
@@ -177,33 +341,46 @@ xfs_da_node_read(
  * Create the initial contents of an intermediate node.
  */
 int
-xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
-				 struct xfs_buf **bpp, int whichfork)
+xfs_da3_node_create(
+	struct xfs_da_args	*args,
+	xfs_dablk_t		blkno,
+	int			level,
+	struct xfs_buf		**bpp,
+	int			whichfork)
 {
-	xfs_da_intnode_t *node;
-	struct xfs_buf *bp;
-	int error;
-	xfs_trans_t *tp;
+	struct xfs_da_intnode	*node;
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_da3_icnode_hdr ichdr = {0};
+	struct xfs_buf		*bp;
+	int			error;
 
 	trace_xfs_da_node_create(args);
+	ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
 
-	tp = args->trans;
 	error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
 	if (error)
 		return(error);
-	ASSERT(bp != NULL);
+	bp->b_ops = &xfs_da3_node_buf_ops;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
 	node = bp->b_addr;
-	node->hdr.info.forw = 0;
-	node->hdr.info.back = 0;
-	node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC);
-	node->hdr.info.pad = 0;
-	node->hdr.count = 0;
-	node->hdr.level = cpu_to_be16(level);
 
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+		ichdr.magic = XFS_DA3_NODE_MAGIC;
+		hdr3->info.blkno = cpu_to_be64(bp->b_bn);
+		hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
+		uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
+	} else {
+		ichdr.magic = XFS_DA_NODE_MAGIC;
+	}
+	ichdr.level = level;
+
+	xfs_da3_node_hdr_to_disk(node, &ichdr);
 	xfs_trans_log_buf(tp, bp,
-		XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+		XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));
 
-	bp->b_ops = &xfs_da_node_buf_ops;
 	*bpp = bp;
 	return(0);
 }
@@ -213,12 +390,18 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
  * intermediate nodes, rebalance, etc.
  */
 int							/* error */
-xfs_da_split(xfs_da_state_t *state)
+xfs_da3_split(
+	struct xfs_da_state	*state)
 {
-	xfs_da_state_blk_t *oldblk, *newblk, *addblk;
-	xfs_da_intnode_t *node;
-	struct xfs_buf *bp;
-	int max, action, error, i;
+	struct xfs_da_state_blk	*oldblk;
+	struct xfs_da_state_blk	*newblk;
+	struct xfs_da_state_blk	*addblk;
+	struct xfs_da_intnode	*node;
+	struct xfs_buf		*bp;
+	int			max;
+	int			action;
+	int			error;
+	int			i;
 
 	trace_xfs_da_split(state->args);
 
@@ -246,7 +429,7 @@ xfs_da_split(xfs_da_state_t *state)
 		 */
 		switch (oldblk->magic) {
 		case XFS_ATTR_LEAF_MAGIC:
-			error = xfs_attr_leaf_split(state, oldblk, newblk);
+			error = xfs_attr3_leaf_split(state, oldblk, newblk);
 			if ((error != 0) && (error != ENOSPC)) {
 				return(error);	/* GROT: attr is inconsistent */
 			}
@@ -261,12 +444,12 @@ xfs_da_split(xfs_da_state_t *state)
 			if (state->inleaf) {
 				state->extraafter = 0;	/* before newblk */
 				trace_xfs_attr_leaf_split_before(state->args);
-				error = xfs_attr_leaf_split(state, oldblk,
+				error = xfs_attr3_leaf_split(state, oldblk,
 							    &state->extrablk);
 			} else {
 				state->extraafter = 1;	/* after newblk */
 				trace_xfs_attr_leaf_split_after(state->args);
-				error = xfs_attr_leaf_split(state, newblk,
+				error = xfs_attr3_leaf_split(state, newblk,
 							    &state->extrablk);
 			}
 			if (error)
@@ -280,7 +463,7 @@ xfs_da_split(xfs_da_state_t *state)
 			addblk = newblk;
 			break;
 		case XFS_DA_NODE_MAGIC:
-			error = xfs_da_node_split(state, oldblk, newblk, addblk,
+			error = xfs_da3_node_split(state, oldblk, newblk, addblk,
 							 max - i, &action);
 			addblk->bp = NULL;
 			if (error)
@@ -298,7 +481,7 @@ xfs_da_split(xfs_da_state_t *state)
 		/*
 		 * Update the btree to show the new hashval for this child.
 		 */
-		xfs_da_fixhashpath(state, &state->path);
+		xfs_da3_fixhashpath(state, &state->path);
 	}
 	if (!addblk)
 		return(0);
@@ -308,7 +491,7 @@ xfs_da_split(xfs_da_state_t *state)
 	 */
 	ASSERT(state->path.active == 0);
 	oldblk = &state->path.blk[0];
-	error = xfs_da_root_split(state, oldblk, addblk);
+	error = xfs_da3_root_split(state, oldblk, addblk);
 	if (error) {
 		addblk->bp = NULL;
 		return(error);	/* GROT: dir is inconsistent */
@@ -319,8 +502,12 @@ xfs_da_split(xfs_da_state_t *state)
 	 * just got bumped because of the addition of a new root node.
 	 * There might be three blocks involved if a double split occurred,
 	 * and the original block 0 could be at any position in the list.
+	 *
+	 * Note: the magic numbers and sibling pointers are in the same
+	 * physical place for both v2 and v3 headers (by design). Hence it
+	 * doesn't matter which version of the xfs_da_intnode structure we use
+	 * here as the result will be the same using either structure.
 	 */
-
 	node = oldblk->bp->b_addr;
 	if (node->hdr.info.forw) {
 		if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
@@ -359,18 +546,25 @@ xfs_da_split(xfs_da_state_t *state)
  * the EOF, extending the inode in process.
  */
 STATIC int						/* error */
-xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
-				 xfs_da_state_blk_t *blk2)
+xfs_da3_root_split(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*blk1,
+	struct xfs_da_state_blk	*blk2)
 {
-	xfs_da_intnode_t *node, *oldroot;
-	xfs_da_args_t *args;
-	xfs_dablk_t blkno;
-	struct xfs_buf *bp;
-	int error, size;
-	xfs_inode_t *dp;
-	xfs_trans_t *tp;
-	xfs_mount_t *mp;
-	xfs_dir2_leaf_t *leaf;
+	struct xfs_da_intnode	*node;
+	struct xfs_da_intnode	*oldroot;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr nodehdr;
+	struct xfs_da_args	*args;
+	struct xfs_buf		*bp;
+	struct xfs_inode	*dp;
+	struct xfs_trans	*tp;
+	struct xfs_mount	*mp;
+	struct xfs_dir2_leaf	*leaf;
+	xfs_dablk_t		blkno;
+	int			level;
+	int			error;
+	int			size;
 
 	trace_xfs_da_root_split(state->args);
 
@@ -379,29 +573,65 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	 * to a free space somewhere.
 	 */
 	args = state->args;
-	ASSERT(args != NULL);
 	error = xfs_da_grow_inode(args, &blkno);
 	if (error)
-		return(error);
+		return error;
+
 	dp = args->dp;
 	tp = args->trans;
 	mp = state->mp;
 	error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
 	if (error)
-		return(error);
-	ASSERT(bp != NULL);
+		return error;
 	node = bp->b_addr;
 	oldroot = blk1->bp->b_addr;
-	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)) {
-		size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] -
-			     (char *)oldroot);
+	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+	    oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+		struct xfs_da3_icnode_hdr nodehdr;
+
+		xfs_da3_node_hdr_from_disk(&nodehdr, oldroot);
+		btree = xfs_da3_node_tree_p(oldroot);
+		size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot);
+		level = nodehdr.level;
+
+		/*
+		 * we are about to copy oldroot to bp, so set up the type
+		 * of bp while we know exactly what it will be.
+		 */
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
 	} else {
-		ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+		struct xfs_dir3_icleaf_hdr leafhdr;
+		struct xfs_dir2_leaf_entry *ents;
+
 		leaf = (xfs_dir2_leaf_t *)oldroot;
-		size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] -
-			     (char *)leaf);
+		xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+		ents = xfs_dir3_leaf_ents_p(leaf);
+
+		ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+		       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+		size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
+		level = 0;
+
+		/*
+		 * we are about to copy oldroot to bp, so set up the type
+		 * of bp while we know exactly what it will be.
+		 */
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
 	}
+
+	/*
+	 * we can copy most of the information in the node from one block to
+	 * another, but for CRC enabled headers we have to make sure that the
+	 * block specific identifiers are kept intact. We update the buffer
+	 * directly for this.
+	 */
 	memcpy(node, oldroot, size);
+	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+	    oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+		struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
+
+		node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
+	}
 	xfs_trans_log_buf(tp, bp, 0, size - 1);
 
 	bp->b_ops = blk1->bp->b_ops;
@@ -411,20 +641,25 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	/*
 	 * Set up the new root node.
 	 */
-	error = xfs_da_node_create(args,
+	error = xfs_da3_node_create(args,
 		(args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0,
-		be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork);
+		level + 1, &bp, args->whichfork);
 	if (error)
-		return(error);
+		return error;
+
 	node = bp->b_addr;
-	node->btree[0].hashval = cpu_to_be32(blk1->hashval);
-	node->btree[0].before = cpu_to_be32(blk1->blkno);
-	node->btree[1].hashval = cpu_to_be32(blk2->hashval);
-	node->btree[1].before = cpu_to_be32(blk2->blkno);
-	node->hdr.count = cpu_to_be16(2);
+	xfs_da3_node_hdr_from_disk(&nodehdr, node);
+	btree = xfs_da3_node_tree_p(node);
+	btree[0].hashval = cpu_to_be32(blk1->hashval);
+	btree[0].before = cpu_to_be32(blk1->blkno);
+	btree[1].hashval = cpu_to_be32(blk2->hashval);
+	btree[1].before = cpu_to_be32(blk2->blkno);
+	nodehdr.count = 2;
+	xfs_da3_node_hdr_to_disk(node, &nodehdr);
 
 #ifdef DEBUG
-	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) {
+	if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	    oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
 		ASSERT(blk1->blkno >= mp->m_dirleafblk &&
 		       blk1->blkno < mp->m_dirfreeblk);
 		ASSERT(blk2->blkno >= mp->m_dirleafblk &&
@@ -434,30 +669,34 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 
 	/* Header is already logged by xfs_da_node_create */
 	xfs_trans_log_buf(tp, bp,
-		XFS_DA_LOGRANGE(node, node->btree,
-			sizeof(xfs_da_node_entry_t) * 2));
+		XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
 
-	return(0);
+	return 0;
 }
 
 /*
  * Split the node, rebalance, then add the new entry.
  */
 STATIC int						/* error */
-xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
-				 xfs_da_state_blk_t *newblk,
-				 xfs_da_state_blk_t *addblk,
-				 int treelevel, int *result)
+xfs_da3_node_split(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*oldblk,
+	struct xfs_da_state_blk	*newblk,
+	struct xfs_da_state_blk	*addblk,
+	int			treelevel,
+	int			*result)
 {
-	xfs_da_intnode_t *node;
-	xfs_dablk_t blkno;
-	int newcount, error;
-	int useextra;
+	struct xfs_da_intnode	*node;
+	struct xfs_da3_icnode_hdr nodehdr;
+	xfs_dablk_t		blkno;
+	int			newcount;
+	int			error;
+	int			useextra;
 
 	trace_xfs_da_node_split(state->args);
 
 	node = oldblk->bp->b_addr;
-	ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+	xfs_da3_node_hdr_from_disk(&nodehdr, node);
 
 	/*
 	 * With V2 dirs the extra block is data or freespace.
@@ -467,7 +706,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	/*
 	 * Do we have to split the node?
 	 */
-	if ((be16_to_cpu(node->hdr.count) + newcount) > state->node_ents) {
+	if (nodehdr.count + newcount > state->node_ents) {
 		/*
 		 * Allocate a new node, add to the doubly linked chain of
 		 * nodes, then move some of our excess entries into it.
@@ -476,14 +715,14 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 		if (error)
 			return(error);	/* GROT: dir is inconsistent */
 
-		error = xfs_da_node_create(state->args, blkno, treelevel,
+		error = xfs_da3_node_create(state->args, blkno, treelevel,
 					   &newblk->bp, state->args->whichfork);
 		if (error)
 			return(error);	/* GROT: dir is inconsistent */
 		newblk->blkno = blkno;
 		newblk->magic = XFS_DA_NODE_MAGIC;
-		xfs_da_node_rebalance(state, oldblk, newblk);
-		error = xfs_da_blk_link(state, oldblk, newblk);
+		xfs_da3_node_rebalance(state, oldblk, newblk);
+		error = xfs_da3_blk_link(state, oldblk, newblk);
 		if (error)
 			return(error);
 		*result = 1;
@@ -495,7 +734,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	 * Insert the new entry(s) into the correct block
 	 * (updating last hashval in the process).
 	 *
-	 * xfs_da_node_add() inserts BEFORE the given index,
+	 * xfs_da3_node_add() inserts BEFORE the given index,
 	 * and as a result of using node_lookup_int() we always
 	 * point to a valid entry (not after one), but a split
 	 * operation always results in a new block whose hashvals
@@ -504,22 +743,23 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	 * If we had double-split op below us, then add the extra block too.
 	 */
 	node = oldblk->bp->b_addr;
-	if (oldblk->index <= be16_to_cpu(node->hdr.count)) {
+	xfs_da3_node_hdr_from_disk(&nodehdr, node);
+	if (oldblk->index <= nodehdr.count) {
 		oldblk->index++;
-		xfs_da_node_add(state, oldblk, addblk);
+		xfs_da3_node_add(state, oldblk, addblk);
 		if (useextra) {
 			if (state->extraafter)
 				oldblk->index++;
-			xfs_da_node_add(state, oldblk, &state->extrablk);
+			xfs_da3_node_add(state, oldblk, &state->extrablk);
 			state->extravalid = 0;
 		}
 	} else {
 		newblk->index++;
-		xfs_da_node_add(state, newblk, addblk);
+		xfs_da3_node_add(state, newblk, addblk);
 		if (useextra) {
 			if (state->extraafter)
 				newblk->index++;
-			xfs_da_node_add(state, newblk, &state->extrablk);
+			xfs_da3_node_add(state, newblk, &state->extrablk);
 			state->extravalid = 0;
 		}
 	}
@@ -534,33 +774,53 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
  * NOTE: if blk2 is empty, then it will get the upper half of blk1.
  */
 STATIC void
-xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
-				     xfs_da_state_blk_t *blk2)
+xfs_da3_node_rebalance(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*blk1,
+	struct xfs_da_state_blk	*blk2)
 {
-	xfs_da_intnode_t *node1, *node2, *tmpnode;
-	xfs_da_node_entry_t *btree_s, *btree_d;
-	int count, tmp;
-	xfs_trans_t *tp;
+	struct xfs_da_intnode	*node1;
+	struct xfs_da_intnode	*node2;
+	struct xfs_da_intnode	*tmpnode;
+	struct xfs_da_node_entry *btree1;
+	struct xfs_da_node_entry *btree2;
+	struct xfs_da_node_entry *btree_s;
+	struct xfs_da_node_entry *btree_d;
+	struct xfs_da3_icnode_hdr nodehdr1;
+	struct xfs_da3_icnode_hdr nodehdr2;
+	struct xfs_trans	*tp;
+	int			count;
+	int			tmp;
+	int			swap = 0;
 
 	trace_xfs_da_node_rebalance(state->args);
 
 	node1 = blk1->bp->b_addr;
 	node2 = blk2->bp->b_addr;
+	xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
+	xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
+	btree1 = xfs_da3_node_tree_p(node1);
+	btree2 = xfs_da3_node_tree_p(node2);
+
 	/*
 	 * Figure out how many entries need to move, and in which direction.
 	 * Swap the nodes around if that makes it simpler.
 	 */
-	if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) &&
-	    ((be32_to_cpu(node2->btree[0].hashval) < be32_to_cpu(node1->btree[0].hashval)) ||
-	     (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) <
-	      be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) {
+	if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
+	    ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+	     (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
+			be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
 		tmpnode = node1;
 		node1 = node2;
 		node2 = tmpnode;
+		xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
+		xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
+		btree1 = xfs_da3_node_tree_p(node1);
+		btree2 = xfs_da3_node_tree_p(node2);
+		swap = 1;
 	}
-	ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-	ASSERT(node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-	count = (be16_to_cpu(node1->hdr.count) - be16_to_cpu(node2->hdr.count)) / 2;
+
+	count = (nodehdr1.count - nodehdr2.count) / 2;
 	if (count == 0)
 		return;
 	tp = state->args->trans;
@@ -571,10 +831,11 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		/*
 		 * Move elements in node2 up to make a hole.
 		 */
-		if ((tmp = be16_to_cpu(node2->hdr.count)) > 0) {
+		tmp = nodehdr2.count;
+		if (tmp > 0) {
 			tmp *= (uint)sizeof(xfs_da_node_entry_t);
-			btree_s = &node2->btree[0];
-			btree_d = &node2->btree[count];
+			btree_s = &btree2[0];
+			btree_d = &btree2[count];
 			memmove(btree_d, btree_s, tmp);
 		}
 
@@ -582,12 +843,12 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		 * Move the req'd B-tree elements from high in node1 to
 		 * low in node2.
 		 */
-		be16_add_cpu(&node2->hdr.count, count);
+		nodehdr2.count += count;
 		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
-		btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count];
-		btree_d = &node2->btree[0];
+		btree_s = &btree1[nodehdr1.count - count];
+		btree_d = &btree2[0];
 		memcpy(btree_d, btree_s, tmp);
-		be16_add_cpu(&node1->hdr.count, -count);
+		nodehdr1.count -= count;
 	} else {
 		/*
 		 * Move the req'd B-tree elements from low in node2 to
@@ -595,49 +856,60 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		 */
 		count = -count;
 		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
-		btree_s = &node2->btree[0];
-		btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)];
+		btree_s = &btree2[0];
+		btree_d = &btree1[nodehdr1.count];
 		memcpy(btree_d, btree_s, tmp);
-		be16_add_cpu(&node1->hdr.count, count);
+		nodehdr1.count += count;
+
 		xfs_trans_log_buf(tp, blk1->bp,
 			XFS_DA_LOGRANGE(node1, btree_d, tmp));
 
 		/*
 		 * Move elements in node2 down to fill the hole.
 		 */
-		tmp  = be16_to_cpu(node2->hdr.count) - count;
+		tmp  = nodehdr2.count - count;
 		tmp *= (uint)sizeof(xfs_da_node_entry_t);
-		btree_s = &node2->btree[count];
-		btree_d = &node2->btree[0];
+		btree_s = &btree2[count];
+		btree_d = &btree2[0];
 		memmove(btree_d, btree_s, tmp);
-		be16_add_cpu(&node2->hdr.count, -count);
+		nodehdr2.count -= count;
 	}
 
 	/*
 	 * Log header of node 1 and all current bits of node 2.
 	 */
+	xfs_da3_node_hdr_to_disk(node1, &nodehdr1);
 	xfs_trans_log_buf(tp, blk1->bp,
-		XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr)));
+		XFS_DA_LOGRANGE(node1, &node1->hdr,
+				xfs_da3_node_hdr_size(node1)));
+
+	xfs_da3_node_hdr_to_disk(node2, &nodehdr2);
 	xfs_trans_log_buf(tp, blk2->bp,
 		XFS_DA_LOGRANGE(node2, &node2->hdr,
-			sizeof(node2->hdr) +
-			sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count)));
+				xfs_da3_node_hdr_size(node2) +
+				(sizeof(btree2[0]) * nodehdr2.count)));
 
 	/*
 	 * Record the last hashval from each block for upward propagation.
 	 * (note: don't use the swapped node pointers)
 	 */
-	node1 = blk1->bp->b_addr;
-	node2 = blk2->bp->b_addr;
-	blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval);
-	blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval);
+	if (swap) {
+		node1 = blk1->bp->b_addr;
+		node2 = blk2->bp->b_addr;
+		xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
+		xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
+		btree1 = xfs_da3_node_tree_p(node1);
+		btree2 = xfs_da3_node_tree_p(node2);
+	}
+	blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
+	blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
 
 	/*
 	 * Adjust the expected index for insertion.
 	 */
-	if (blk1->index >= be16_to_cpu(node1->hdr.count)) {
-		blk2->index = blk1->index - be16_to_cpu(node1->hdr.count);
-		blk1->index = be16_to_cpu(node1->hdr.count) + 1;	/* make it invalid */
+	if (blk1->index >= nodehdr1.count) {
+		blk2->index = blk1->index - nodehdr1.count;
+		blk1->index = nodehdr1.count + 1;	/* make it invalid */
 	}
 }
 
@@ -645,18 +917,23 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
  * Add a new entry to an intermediate node.
  */
 STATIC void
-xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
-			       xfs_da_state_blk_t *newblk)
+xfs_da3_node_add(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*oldblk,
+	struct xfs_da_state_blk	*newblk)
 {
-	xfs_da_intnode_t *node;
-	xfs_da_node_entry_t *btree;
-	int tmp;
+	struct xfs_da_intnode	*node;
+	struct xfs_da3_icnode_hdr nodehdr;
+	struct xfs_da_node_entry *btree;
+	int			tmp;
 
 	trace_xfs_da_node_add(state->args);
 
 	node = oldblk->bp->b_addr;
-	ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-	ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
+	xfs_da3_node_hdr_from_disk(&nodehdr, node);
+	btree = xfs_da3_node_tree_p(node);
+
+	ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
 	ASSERT(newblk->blkno != 0);
 	if (state->args->whichfork == XFS_DATA_FORK)
 		ASSERT(newblk->blkno >= state->mp->m_dirleafblk &&
@@ -666,23 +943,25 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	 * We may need to make some room before we insert the new node.
 	 */
 	tmp = 0;
-	btree = &node->btree[ oldblk->index ];
-	if (oldblk->index < be16_to_cpu(node->hdr.count)) {
-		tmp = (be16_to_cpu(node->hdr.count) - oldblk->index) * (uint)sizeof(*btree);
-		memmove(btree + 1, btree, tmp);
+	if (oldblk->index < nodehdr.count) {
+		tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
+		memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
 	}
-	btree->hashval = cpu_to_be32(newblk->hashval);
-	btree->before = cpu_to_be32(newblk->blkno);
+	btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
+	btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
 	xfs_trans_log_buf(state->args->trans, oldblk->bp,
-		XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree)));
-	be16_add_cpu(&node->hdr.count, 1);
+		XFS_DA_LOGRANGE(node, &btree[oldblk->index],
+				tmp + sizeof(*btree)));
+
+	nodehdr.count += 1;
+	xfs_da3_node_hdr_to_disk(node, &nodehdr);
 	xfs_trans_log_buf(state->args->trans, oldblk->bp,
-		XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+		XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));
 
 	/*
 	 * Copy the last hash value from the oldblk to propagate upwards.
 	 */
-	oldblk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1 ].hashval);
+	oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
 }
 
 /*========================================================================
@@ -694,14 +973,16 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
  * possibly deallocating that block, etc...
  */
 int
-xfs_da_join(xfs_da_state_t *state)
+xfs_da3_join(
+	struct xfs_da_state	*state)
 {
-	xfs_da_state_blk_t *drop_blk, *save_blk;
-	int action, error;
+	struct xfs_da_state_blk	*drop_blk;
+	struct xfs_da_state_blk	*save_blk;
+	int			action = 0;
+	int			error;
 
 	trace_xfs_da_join(state->args);
 
-	action = 0;
 	drop_blk = &state->path.blk[ state->path.active-1 ];
 	save_blk = &state->altpath.blk[ state->path.active-1 ];
 	ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
@@ -722,12 +1003,12 @@ xfs_da_join(xfs_da_state_t *state)
 		 */
 		switch (drop_blk->magic) {
 		case XFS_ATTR_LEAF_MAGIC:
-			error = xfs_attr_leaf_toosmall(state, &action);
+			error = xfs_attr3_leaf_toosmall(state, &action);
 			if (error)
 				return(error);
 			if (action == 0)
 				return(0);
-			xfs_attr_leaf_unbalance(state, drop_blk, save_blk);
+			xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
 			break;
 		case XFS_DIR2_LEAFN_MAGIC:
 			error = xfs_dir2_leafn_toosmall(state, &action);
@@ -742,18 +1023,18 @@ xfs_da_join(xfs_da_state_t *state)
 			 * Remove the offending node, fixup hashvals,
 			 * check for a toosmall neighbor.
 			 */
-			xfs_da_node_remove(state, drop_blk);
-			xfs_da_fixhashpath(state, &state->path);
-			error = xfs_da_node_toosmall(state, &action);
+			xfs_da3_node_remove(state, drop_blk);
+			xfs_da3_fixhashpath(state, &state->path);
+			error = xfs_da3_node_toosmall(state, &action);
 			if (error)
 				return(error);
 			if (action == 0)
 				return 0;
-			xfs_da_node_unbalance(state, drop_blk, save_blk);
+			xfs_da3_node_unbalance(state, drop_blk, save_blk);
 			break;
 		}
-		xfs_da_fixhashpath(state, &state->altpath);
-		error = xfs_da_blk_unlink(state, drop_blk, save_blk);
+		xfs_da3_fixhashpath(state, &state->altpath);
+		error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
 		xfs_da_state_kill_altpath(state);
 		if (error)
 			return(error);
@@ -768,9 +1049,9 @@ xfs_da_join(xfs_da_state_t *state)
 	 * we only have one entry in the root, make the child block
 	 * the new root.
 	 */
-	xfs_da_node_remove(state, drop_blk);
-	xfs_da_fixhashpath(state, &state->path);
-	error = xfs_da_root_join(state, &state->path.blk[0]);
+	xfs_da3_node_remove(state, drop_blk);
+	xfs_da3_fixhashpath(state, &state->path);
+	error = xfs_da3_root_join(state, &state->path.blk[0]);
 	return(error);
 }
 
@@ -782,9 +1063,13 @@ xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
 
 	if (level == 1) {
 		ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-		       magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-	} else
-		ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+		       magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+		       magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+		       magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+	} else {
+		ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+		       magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+	}
 	ASSERT(!blkinfo->forw);
 	ASSERT(!blkinfo->back);
 }
@@ -797,52 +1082,61 @@ xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
  * the old root to block 0 as the new root node.
  */
 STATIC int
-xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
+xfs_da3_root_join(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*root_blk)
 {
-	xfs_da_intnode_t *oldroot;
-	xfs_da_args_t *args;
-	xfs_dablk_t child;
-	struct xfs_buf *bp;
-	int error;
+	struct xfs_da_intnode	*oldroot;
+	struct xfs_da_args	*args;
+	xfs_dablk_t		child;
+	struct xfs_buf		*bp;
+	struct xfs_da3_icnode_hdr oldroothdr;
+	struct xfs_da_node_entry *btree;
+	int			error;
 
 	trace_xfs_da_root_join(state->args);
 
-	args = state->args;
-	ASSERT(args != NULL);
 	ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
+
+	args = state->args;
 	oldroot = root_blk->bp->b_addr;
-	ASSERT(oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-	ASSERT(!oldroot->hdr.info.forw);
-	ASSERT(!oldroot->hdr.info.back);
+	xfs_da3_node_hdr_from_disk(&oldroothdr, oldroot);
+	ASSERT(oldroothdr.forw == 0);
+	ASSERT(oldroothdr.back == 0);
 
 	/*
 	 * If the root has more than one child, then don't do anything.
 	 */
-	if (be16_to_cpu(oldroot->hdr.count) > 1)
-		return(0);
+	if (oldroothdr.count > 1)
+		return 0;
 
 	/*
 	 * Read in the (only) child block, then copy those bytes into
 	 * the root block's buffer and free the original child block.
 	 */
-	child = be32_to_cpu(oldroot->btree[0].before);
+	btree = xfs_da3_node_tree_p(oldroot);
+	child = be32_to_cpu(btree[0].before);
 	ASSERT(child != 0);
-	error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp,
+	error = xfs_da3_node_read(args->trans, args->dp, child, -1, &bp,
 					     args->whichfork);
 	if (error)
-		return(error);
-	ASSERT(bp != NULL);
-	xfs_da_blkinfo_onlychild_validate(bp->b_addr,
-					be16_to_cpu(oldroot->hdr.level));
+		return error;
+	xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
 
 	/*
 	 * This could be copying a leaf back into the root block in the case of
 	 * there only being a single leaf block left in the tree. Hence we have
 	 * to update the b_ops pointer as well to match the buffer type change
-	 * that could occur.
+	 * that could occur. For dir3 blocks we also need to update the block
+	 * number in the buffer header.
 	 */
 	memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
 	root_blk->bp->b_ops = bp->b_ops;
+	xfs_trans_buf_copy_type(root_blk->bp, bp);
+	if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
+		struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
+		da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
+	}
 	xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
 	error = xfs_da_shrink_inode(args, child, bp);
 	return(error);
@@ -858,14 +1152,21 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
  * If nothing can be done, return 0.
  */
 STATIC int
-xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
+xfs_da3_node_toosmall(
+	struct xfs_da_state	*state,
+	int			*action)
 {
-	xfs_da_intnode_t *node;
-	xfs_da_state_blk_t *blk;
-	xfs_da_blkinfo_t *info;
-	int count, forward, error, retval, i;
-	xfs_dablk_t blkno;
-	struct xfs_buf *bp;
+	struct xfs_da_intnode	*node;
+	struct xfs_da_state_blk	*blk;
+	struct xfs_da_blkinfo	*info;
+	xfs_dablk_t		blkno;
+	struct xfs_buf		*bp;
+	struct xfs_da3_icnode_hdr nodehdr;
+	int			count;
+	int			forward;
+	int			error;
+	int			retval;
+	int			i;
 
 	trace_xfs_da_node_toosmall(state->args);
 
@@ -876,10 +1177,9 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
 	 */
 	blk = &state->path.blk[ state->path.active-1 ];
 	info = blk->bp->b_addr;
-	ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
 	node = (xfs_da_intnode_t *)info;
-	count = be16_to_cpu(node->hdr.count);
-	if (count > (state->node_ents >> 1)) {
+	xfs_da3_node_hdr_from_disk(&nodehdr, node);
+	if (nodehdr.count > (state->node_ents >> 1)) {
 		*action = 0;	/* blk over 50%, don't try to join */
 		return(0);	/* blk over 50%, don't try to join */
 	}
@@ -890,14 +1190,14 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
 	 * coalesce it with a sibling block.  We choose (arbitrarily)
 	 * to merge with the forward block unless it is NULL.
 	 */
-	if (count == 0) {
+	if (nodehdr.count == 0) {
 		/*
 		 * Make altpath point to the block we want to keep and
 		 * path point to the block we want to drop (this one).
 		 */
 		forward = (info->forw != 0);
 		memcpy(&state->altpath, &state->path, sizeof(state->path));
-		error = xfs_da_path_shift(state, &state->altpath, forward,
+		error = xfs_da3_path_shift(state, &state->altpath, forward,
 						 0, &retval);
 		if (error)
 			return(error);
@@ -916,35 +1216,34 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
 	 * We prefer coalescing with the lower numbered sibling so as
 	 * to shrink a directory over time.
 	 */
+	count  = state->node_ents;
+	count -= state->node_ents >> 2;
+	count -= nodehdr.count;
+
 	/* start with smaller blk num */
-	forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back));
+	forward = nodehdr.forw < nodehdr.back;
 	for (i = 0; i < 2; forward = !forward, i++) {
 		if (forward)
-			blkno = be32_to_cpu(info->forw);
+			blkno = nodehdr.forw;
 		else
-			blkno = be32_to_cpu(info->back);
+			blkno = nodehdr.back;
 		if (blkno == 0)
 			continue;
-		error = xfs_da_node_read(state->args->trans, state->args->dp,
+		error = xfs_da3_node_read(state->args->trans, state->args->dp,
 					blkno, -1, &bp, state->args->whichfork);
 		if (error)
 			return(error);
-		ASSERT(bp != NULL);
 
-		node = (xfs_da_intnode_t *)info;
-		count  = state->node_ents;
-		count -= state->node_ents >> 2;
-		count -= be16_to_cpu(node->hdr.count);
 		node = bp->b_addr;
-		ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-		count -= be16_to_cpu(node->hdr.count);
+		xfs_da3_node_hdr_from_disk(&nodehdr, node);
 		xfs_trans_brelse(state->args->trans, bp);
-		if (count >= 0)
+
+		if (count - nodehdr.count >= 0)
 			break;	/* fits with at least 25% to spare */
 	}
 	if (i >= 2) {
 		*action = 0;
-		return(0);
+		return 0;
 	}
 
 	/*
@@ -953,28 +1252,42 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
 	 */
 	memcpy(&state->altpath, &state->path, sizeof(state->path));
 	if (blkno < blk->blkno) {
-		error = xfs_da_path_shift(state, &state->altpath, forward,
+		error = xfs_da3_path_shift(state, &state->altpath, forward,
 						 0, &retval);
-		if (error) {
-			return(error);
-		}
-		if (retval) {
-			*action = 0;
-			return(0);
-		}
 	} else {
-		error = xfs_da_path_shift(state, &state->path, forward,
+		error = xfs_da3_path_shift(state, &state->path, forward,
 						 0, &retval);
-		if (error) {
-			return(error);
-		}
-		if (retval) {
-			*action = 0;
-			return(0);
-		}
+	}
+	if (error)
+		return error;
+	if (retval) {
+		*action = 0;
+		return 0;
 	}
 	*action = 1;
-	return(0);
+	return 0;
+}
+
+/*
+ * Pick up the last hashvalue from an intermediate node.
+ */
+STATIC uint
+xfs_da3_node_lasthash(
+	struct xfs_buf		*bp,
+	int			*count)
+{
+	struct xfs_da_intnode	 *node;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr nodehdr;
+
+	node = bp->b_addr;
+	xfs_da3_node_hdr_from_disk(&nodehdr, node);
+	if (count)
+		*count = nodehdr.count;
+	if (!nodehdr.count)
+		return 0;
+	btree = xfs_da3_node_tree_p(node);
+	return be32_to_cpu(btree[nodehdr.count - 1].hashval);
 }
 
 /*
@@ -982,13 +1295,16 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
  * when we stop making changes, return.
  */
 void
-xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
+xfs_da3_fixhashpath(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_path *path)
 {
-	xfs_da_state_blk_t *blk;
-	xfs_da_intnode_t *node;
-	xfs_da_node_entry_t *btree;
-	xfs_dahash_t lasthash=0;
-	int level, count;
+	struct xfs_da_state_blk	*blk;
+	struct xfs_da_intnode	*node;
+	struct xfs_da_node_entry *btree;
+	xfs_dahash_t		lasthash=0;
+	int			level;
+	int			count;
 
 	trace_xfs_da_fixhashpath(state->args);
 
@@ -1006,23 +1322,26 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
 			return;
 		break;
 	case XFS_DA_NODE_MAGIC:
-		lasthash = xfs_da_node_lasthash(blk->bp, &count);
+		lasthash = xfs_da3_node_lasthash(blk->bp, &count);
 		if (count == 0)
 			return;
 		break;
 	}
 	for (blk--, level--; level >= 0; blk--, level--) {
+		struct xfs_da3_icnode_hdr nodehdr;
+
 		node = blk->bp->b_addr;
-		ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-		btree = &node->btree[ blk->index ];
+		xfs_da3_node_hdr_from_disk(&nodehdr, node);
+		btree = xfs_da3_node_tree_p(node);
 		if (be32_to_cpu(btree->hashval) == lasthash)
 			break;
 		blk->hashval = lasthash;
-		btree->hashval = cpu_to_be32(lasthash);
+		btree[blk->index].hashval = cpu_to_be32(lasthash);
 		xfs_trans_log_buf(state->args->trans, blk->bp,
-				  XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
+				  XFS_DA_LOGRANGE(node, &btree[blk->index],
+						  sizeof(*btree)));
 
-		lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
+		lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
 	}
 }
 
@@ -1030,104 +1349,120 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
  * Remove an entry from an intermediate node.
  */
 STATIC void
-xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
+xfs_da3_node_remove(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*drop_blk)
 {
-	xfs_da_intnode_t *node;
-	xfs_da_node_entry_t *btree;
-	int tmp;
+	struct xfs_da_intnode	*node;
+	struct xfs_da3_icnode_hdr nodehdr;
+	struct xfs_da_node_entry *btree;
+	int			index;
+	int			tmp;
 
 	trace_xfs_da_node_remove(state->args);
 
 	node = drop_blk->bp->b_addr;
-	ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count));
+	xfs_da3_node_hdr_from_disk(&nodehdr, node);
+	ASSERT(drop_blk->index < nodehdr.count);
 	ASSERT(drop_blk->index >= 0);
 
 	/*
 	 * Copy over the offending entry, or just zero it out.
 	 */
-	btree = &node->btree[drop_blk->index];
-	if (drop_blk->index < (be16_to_cpu(node->hdr.count)-1)) {
-		tmp  = be16_to_cpu(node->hdr.count) - drop_blk->index - 1;
+	index = drop_blk->index;
+	btree = xfs_da3_node_tree_p(node);
+	if (index < nodehdr.count - 1) {
+		tmp  = nodehdr.count - index - 1;
 		tmp *= (uint)sizeof(xfs_da_node_entry_t);
-		memmove(btree, btree + 1, tmp);
+		memmove(&btree[index], &btree[index + 1], tmp);
 		xfs_trans_log_buf(state->args->trans, drop_blk->bp,
-		    XFS_DA_LOGRANGE(node, btree, tmp));
-		btree = &node->btree[be16_to_cpu(node->hdr.count)-1];
+		    XFS_DA_LOGRANGE(node, &btree[index], tmp));
+		index = nodehdr.count - 1;
 	}
-	memset((char *)btree, 0, sizeof(xfs_da_node_entry_t));
+	memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
 	xfs_trans_log_buf(state->args->trans, drop_blk->bp,
-	    XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
-	be16_add_cpu(&node->hdr.count, -1);
+	    XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
+	nodehdr.count -= 1;
+	xfs_da3_node_hdr_to_disk(node, &nodehdr);
 	xfs_trans_log_buf(state->args->trans, drop_blk->bp,
-	    XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+	    XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));
 
 	/*
 	 * Copy the last hash value from the block to propagate upwards.
 	 */
-	btree--;
-	drop_blk->hashval = be32_to_cpu(btree->hashval);
+	drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
 }
 
 /*
- * Unbalance the btree elements between two intermediate nodes,
+ * Unbalance the elements between two intermediate nodes,
  * move all Btree elements from one node into another.
  */
 STATIC void
-xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
-				     xfs_da_state_blk_t *save_blk)
+xfs_da3_node_unbalance(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*drop_blk,
+	struct xfs_da_state_blk	*save_blk)
 {
-	xfs_da_intnode_t *drop_node, *save_node;
-	xfs_da_node_entry_t *btree;
-	int tmp;
-	xfs_trans_t *tp;
+	struct xfs_da_intnode	*drop_node;
+	struct xfs_da_intnode	*save_node;
+	struct xfs_da_node_entry *drop_btree;
+	struct xfs_da_node_entry *save_btree;
+	struct xfs_da3_icnode_hdr drop_hdr;
+	struct xfs_da3_icnode_hdr save_hdr;
+	struct xfs_trans	*tp;
+	int			sindex;
+	int			tmp;
 
 	trace_xfs_da_node_unbalance(state->args);
 
 	drop_node = drop_blk->bp->b_addr;
 	save_node = save_blk->bp->b_addr;
-	ASSERT(drop_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-	ASSERT(save_node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+	xfs_da3_node_hdr_from_disk(&drop_hdr, drop_node);
+	xfs_da3_node_hdr_from_disk(&save_hdr, save_node);
+	drop_btree = xfs_da3_node_tree_p(drop_node);
+	save_btree = xfs_da3_node_tree_p(save_node);
 	tp = state->args->trans;
 
 	/*
 	 * If the dying block has lower hashvals, then move all the
 	 * elements in the remaining block up to make a hole.
 	 */
-	if ((be32_to_cpu(drop_node->btree[0].hashval) < be32_to_cpu(save_node->btree[ 0 ].hashval)) ||
-	    (be32_to_cpu(drop_node->btree[be16_to_cpu(drop_node->hdr.count)-1].hashval) <
-	     be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval)))
-	{
-		btree = &save_node->btree[be16_to_cpu(drop_node->hdr.count)];
-		tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t);
-		memmove(btree, &save_node->btree[0], tmp);
-		btree = &save_node->btree[0];
+	if ((be32_to_cpu(drop_btree[0].hashval) <
+			be32_to_cpu(save_btree[0].hashval)) ||
+	    (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
+			be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
+		/* XXX: check this - is memmove dst correct? */
+		tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
+		memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);
+
+		sindex = 0;
 		xfs_trans_log_buf(tp, save_blk->bp,
-			XFS_DA_LOGRANGE(save_node, btree,
-				(be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) *
-				sizeof(xfs_da_node_entry_t)));
+			XFS_DA_LOGRANGE(save_node, &save_btree[0],
+				(save_hdr.count + drop_hdr.count) *
+						sizeof(xfs_da_node_entry_t)));
 	} else {
-		btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)];
+		sindex = save_hdr.count;
 		xfs_trans_log_buf(tp, save_blk->bp,
-			XFS_DA_LOGRANGE(save_node, btree,
-				be16_to_cpu(drop_node->hdr.count) *
-				sizeof(xfs_da_node_entry_t)));
+			XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
+				drop_hdr.count * sizeof(xfs_da_node_entry_t)));
 	}
 
 	/*
 	 * Move all the B-tree elements from drop_blk to save_blk.
 	 */
-	tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t);
-	memcpy(btree, &drop_node->btree[0], tmp);
-	be16_add_cpu(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count));
+	tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
+	memcpy(&save_btree[sindex], &drop_btree[0], tmp);
+	save_hdr.count += drop_hdr.count;
 
+	xfs_da3_node_hdr_to_disk(save_node, &save_hdr);
 	xfs_trans_log_buf(tp, save_blk->bp,
 		XFS_DA_LOGRANGE(save_node, &save_node->hdr,
-			sizeof(save_node->hdr)));
+				xfs_da3_node_hdr_size(save_node)));
 
 	/*
 	 * Save the last hashval in the remaining block for upward propagation.
 	 */
-	save_blk->hashval = be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval);
+	save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
 }
 
 /*========================================================================
@@ -1146,16 +1481,24 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
  * pruned depth-first tree search.
  */
 int							/* error */
-xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
+xfs_da3_node_lookup_int(
+	struct xfs_da_state	*state,
+	int			*result)
 {
-	xfs_da_state_blk_t *blk;
-	xfs_da_blkinfo_t *curr;
-	xfs_da_intnode_t *node;
-	xfs_da_node_entry_t *btree;
-	xfs_dablk_t blkno;
-	int probe, span, max, error, retval;
-	xfs_dahash_t hashval, btreehashval;
-	xfs_da_args_t *args;
+	struct xfs_da_state_blk	*blk;
+	struct xfs_da_blkinfo	*curr;
+	struct xfs_da_intnode	*node;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr nodehdr;
+	struct xfs_da_args	*args;
+	xfs_dablk_t		blkno;
+	xfs_dahash_t		hashval;
+	xfs_dahash_t		btreehashval;
+	int			probe;
+	int			span;
+	int			max;
+	int			error;
+	int			retval;
 
 	args = state->args;
 
@@ -1171,7 +1514,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 		 * Read the next node down in the tree.
 		 */
 		blk->blkno = blkno;
-		error = xfs_da_node_read(args->trans, args->dp, blkno,
+		error = xfs_da3_node_read(args->trans, args->dp, blkno,
 					-1, &blk->bp, args->whichfork);
 		if (error) {
 			blk->blkno = 0;
@@ -1180,66 +1523,75 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 		}
 		curr = blk->bp->b_addr;
 		blk->magic = be16_to_cpu(curr->magic);
-		ASSERT(blk->magic == XFS_DA_NODE_MAGIC ||
-		       blk->magic == XFS_DIR2_LEAFN_MAGIC ||
-		       blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+		if (blk->magic == XFS_ATTR_LEAF_MAGIC ||
+		    blk->magic == XFS_ATTR3_LEAF_MAGIC) {
+			blk->magic = XFS_ATTR_LEAF_MAGIC;
+			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+			break;
+		}
+
+		if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+		    blk->magic == XFS_DIR3_LEAFN_MAGIC) {
+			blk->magic = XFS_DIR2_LEAFN_MAGIC;
+			blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
+			break;
+		}
+
+		blk->magic = XFS_DA_NODE_MAGIC;
+
 
 		/*
 		 * Search an intermediate node for a match.
 		 */
-		if (blk->magic == XFS_DA_NODE_MAGIC) {
-			node = blk->bp->b_addr;
-			max = be16_to_cpu(node->hdr.count);
-			blk->hashval = be32_to_cpu(node->btree[max-1].hashval);
+		node = blk->bp->b_addr;
+		xfs_da3_node_hdr_from_disk(&nodehdr, node);
+		btree = xfs_da3_node_tree_p(node);
 
-			/*
-			 * Binary search.  (note: small blocks will skip loop)
-			 */
-			probe = span = max / 2;
-			hashval = args->hashval;
-			for (btree = &node->btree[probe]; span > 4;
-				   btree = &node->btree[probe]) {
-				span /= 2;
-				btreehashval = be32_to_cpu(btree->hashval);
-				if (btreehashval < hashval)
-					probe += span;
-				else if (btreehashval > hashval)
-					probe -= span;
-				else
-					break;
-			}
-			ASSERT((probe >= 0) && (probe < max));
-			ASSERT((span <= 4) || (be32_to_cpu(btree->hashval) == hashval));
+		max = nodehdr.count;
+		blk->hashval = be32_to_cpu(btree[max - 1].hashval);
 
-			/*
-			 * Since we may have duplicate hashval's, find the first
-			 * matching hashval in the node.
-			 */
-			while ((probe > 0) && (be32_to_cpu(btree->hashval) >= hashval)) {
-				btree--;
-				probe--;
-			}
-			while ((probe < max) && (be32_to_cpu(btree->hashval) < hashval)) {
-				btree++;
-				probe++;
-			}
+		/*
+		 * Binary search.  (note: small blocks will skip loop)
+		 */
+		probe = span = max / 2;
+		hashval = args->hashval;
+		while (span > 4) {
+			span /= 2;
+			btreehashval = be32_to_cpu(btree[probe].hashval);
+			if (btreehashval < hashval)
+				probe += span;
+			else if (btreehashval > hashval)
+				probe -= span;
+			else
+				break;
+		}
+		ASSERT((probe >= 0) && (probe < max));
+		ASSERT((span <= 4) ||
+			(be32_to_cpu(btree[probe].hashval) == hashval));
 
-			/*
-			 * Pick the right block to descend on.
-			 */
-			if (probe == max) {
-				blk->index = max-1;
-				blkno = be32_to_cpu(node->btree[max-1].before);
-			} else {
-				blk->index = probe;
-				blkno = be32_to_cpu(btree->before);
-			}
-		} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
-			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
-			break;
-		} else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
-			blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
-			break;
+		/*
+		 * Since we may have duplicate hashval's, find the first
+		 * matching hashval in the node.
+		 */
+		while (probe > 0 &&
+		       be32_to_cpu(btree[probe].hashval) >= hashval) {
+			probe--;
+		}
+		while (probe < max &&
+		       be32_to_cpu(btree[probe].hashval) < hashval) {
+			probe++;
+		}
+
+		/*
+		 * Pick the right block to descend on.
+		 */
+		if (probe == max) {
+			blk->index = max - 1;
+			blkno = be32_to_cpu(btree[max - 1].before);
+		} else {
+			blk->index = probe;
+			blkno = be32_to_cpu(btree[probe].before);
 		}
 	}
 
@@ -1254,7 +1606,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 			retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
 							&blk->index, state);
 		} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
-			retval = xfs_attr_leaf_lookup_int(blk->bp, args);
+			retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
 			blk->index = args->index;
 			args->blkno = blk->blkno;
 		} else {
@@ -1263,7 +1615,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 		}
 		if (((retval == ENOENT) || (retval == ENOATTR)) &&
 		    (blk->hashval == args->hashval)) {
-			error = xfs_da_path_shift(state, &state->path, 1, 1,
+			error = xfs_da3_path_shift(state, &state->path, 1, 1,
 							 &retval);
 			if (error)
 				return(error);
@@ -1285,16 +1637,52 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
  *========================================================================*/
 
 /*
+ * Compare two intermediate nodes for "order".
+ */
+STATIC int
+xfs_da3_node_order(
+	struct xfs_buf	*node1_bp,
+	struct xfs_buf	*node2_bp)
+{
+	struct xfs_da_intnode	*node1;
+	struct xfs_da_intnode	*node2;
+	struct xfs_da_node_entry *btree1;
+	struct xfs_da_node_entry *btree2;
+	struct xfs_da3_icnode_hdr node1hdr;
+	struct xfs_da3_icnode_hdr node2hdr;
+
+	node1 = node1_bp->b_addr;
+	node2 = node2_bp->b_addr;
+	xfs_da3_node_hdr_from_disk(&node1hdr, node1);
+	xfs_da3_node_hdr_from_disk(&node2hdr, node2);
+	btree1 = xfs_da3_node_tree_p(node1);
+	btree2 = xfs_da3_node_tree_p(node2);
+
+	if (node1hdr.count > 0 && node2hdr.count > 0 &&
+	    ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
+	     (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
+	      be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
+		return 1;
+	}
+	return 0;
+}
+
+/*
  * Link a new block into a doubly linked list of blocks (of whatever type).
  */
 int							/* error */
-xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
-			       xfs_da_state_blk_t *new_blk)
+xfs_da3_blk_link(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*old_blk,
+	struct xfs_da_state_blk	*new_blk)
 {
-	xfs_da_blkinfo_t *old_info, *new_info, *tmp_info;
-	xfs_da_args_t *args;
-	int before=0, error;
-	struct xfs_buf *bp;
+	struct xfs_da_blkinfo	*old_info;
+	struct xfs_da_blkinfo	*new_info;
+	struct xfs_da_blkinfo	*tmp_info;
+	struct xfs_da_args	*args;
+	struct xfs_buf		*bp;
+	int			before = 0;
+	int			error;
 
 	/*
 	 * Set up environment.
@@ -1306,9 +1694,6 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 	ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
 	       old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
 	       old_blk->magic == XFS_ATTR_LEAF_MAGIC);
-	ASSERT(old_blk->magic == be16_to_cpu(old_info->magic));
-	ASSERT(new_blk->magic == be16_to_cpu(new_info->magic));
-	ASSERT(old_blk->magic == new_blk->magic);
 
 	switch (old_blk->magic) {
 	case XFS_ATTR_LEAF_MAGIC:
@@ -1318,7 +1703,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 		before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
 		break;
 	case XFS_DA_NODE_MAGIC:
-		before = xfs_da_node_order(old_blk->bp, new_blk->bp);
+		before = xfs_da3_node_order(old_blk->bp, new_blk->bp);
 		break;
 	}
 
@@ -1333,14 +1718,14 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 		new_info->forw = cpu_to_be32(old_blk->blkno);
 		new_info->back = old_info->back;
 		if (old_info->back) {
-			error = xfs_da_node_read(args->trans, args->dp,
+			error = xfs_da3_node_read(args->trans, args->dp,
 						be32_to_cpu(old_info->back),
 						-1, &bp, args->whichfork);
 			if (error)
 				return(error);
 			ASSERT(bp != NULL);
 			tmp_info = bp->b_addr;
-			ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic));
+			ASSERT(tmp_info->magic == old_info->magic);
 			ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
 			tmp_info->forw = cpu_to_be32(new_blk->blkno);
 			xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
@@ -1354,7 +1739,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 		new_info->forw = old_info->forw;
 		new_info->back = cpu_to_be32(old_blk->blkno);
 		if (old_info->forw) {
-			error = xfs_da_node_read(args->trans, args->dp,
+			error = xfs_da3_node_read(args->trans, args->dp,
 						be32_to_cpu(old_info->forw),
 						-1, &bp, args->whichfork);
 			if (error)
@@ -1375,59 +1760,20 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 }
 
 /*
- * Compare two intermediate nodes for "order".
- */
-STATIC int
-xfs_da_node_order(
-	struct xfs_buf	*node1_bp,
-	struct xfs_buf	*node2_bp)
-{
-	xfs_da_intnode_t *node1, *node2;
-
-	node1 = node1_bp->b_addr;
-	node2 = node2_bp->b_addr;
-	ASSERT(node1->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) &&
-	       node2->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-	if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) &&
-	    ((be32_to_cpu(node2->btree[0].hashval) <
-	      be32_to_cpu(node1->btree[0].hashval)) ||
-	     (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) <
-	      be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) {
-		return(1);
-	}
-	return(0);
-}
-
-/*
- * Pick up the last hashvalue from an intermediate node.
- */
-STATIC uint
-xfs_da_node_lasthash(
-	struct xfs_buf	*bp,
-	int		*count)
-{
-	xfs_da_intnode_t *node;
-
-	node = bp->b_addr;
-	ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-	if (count)
-		*count = be16_to_cpu(node->hdr.count);
-	if (!node->hdr.count)
-		return(0);
-	return be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
-}
-
-/*
  * Unlink a block from a doubly linked list of blocks.
  */
 STATIC int						/* error */
-xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
-				 xfs_da_state_blk_t *save_blk)
+xfs_da3_blk_unlink(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*drop_blk,
+	struct xfs_da_state_blk	*save_blk)
 {
-	xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info;
-	xfs_da_args_t *args;
-	struct xfs_buf *bp;
-	int error;
+	struct xfs_da_blkinfo	*drop_info;
+	struct xfs_da_blkinfo	*save_info;
+	struct xfs_da_blkinfo	*tmp_info;
+	struct xfs_da_args	*args;
+	struct xfs_buf		*bp;
+	int			error;
 
 	/*
 	 * Set up environment.
@@ -1439,8 +1785,6 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 	ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
 	       save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
 	       save_blk->magic == XFS_ATTR_LEAF_MAGIC);
-	ASSERT(save_blk->magic == be16_to_cpu(save_info->magic));
-	ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic));
 	ASSERT(save_blk->magic == drop_blk->magic);
 	ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
 	       (be32_to_cpu(save_info->back) == drop_blk->blkno));
@@ -1454,7 +1798,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 		trace_xfs_da_unlink_back(args);
 		save_info->back = drop_info->back;
 		if (drop_info->back) {
-			error = xfs_da_node_read(args->trans, args->dp,
+			error = xfs_da3_node_read(args->trans, args->dp,
 						be32_to_cpu(drop_info->back),
 						-1, &bp, args->whichfork);
 			if (error)
@@ -1471,7 +1815,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 		trace_xfs_da_unlink_forward(args);
 		save_info->forw = drop_info->forw;
 		if (drop_info->forw) {
-			error = xfs_da_node_read(args->trans, args->dp,
+			error = xfs_da3_node_read(args->trans, args->dp,
 						be32_to_cpu(drop_info->forw),
 						-1, &bp, args->whichfork);
 			if (error)
@@ -1499,15 +1843,22 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
  * the new bottom and the root.
  */
 int							/* error */
-xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
-				 int forward, int release, int *result)
+xfs_da3_path_shift(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_path *path,
+	int			forward,
+	int			release,
+	int			*result)
 {
-	xfs_da_state_blk_t *blk;
-	xfs_da_blkinfo_t *info;
-	xfs_da_intnode_t *node;
-	xfs_da_args_t *args;
-	xfs_dablk_t blkno=0;
-	int level, error;
+	struct xfs_da_state_blk	*blk;
+	struct xfs_da_blkinfo	*info;
+	struct xfs_da_intnode	*node;
+	struct xfs_da_args	*args;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr nodehdr;
+	xfs_dablk_t		blkno = 0;
+	int			level;
+	int			error;
 
 	trace_xfs_da_path_shift(state->args);
 
@@ -1522,16 +1873,17 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
 	ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
 	level = (path->active-1) - 1;	/* skip bottom layer in path */
 	for (blk = &path->blk[level]; level >= 0; blk--, level--) {
-		ASSERT(blk->bp != NULL);
 		node = blk->bp->b_addr;
-		ASSERT(node->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
-		if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) {
+		xfs_da3_node_hdr_from_disk(&nodehdr, node);
+		btree = xfs_da3_node_tree_p(node);
+
+		if (forward && (blk->index < nodehdr.count - 1)) {
 			blk->index++;
-			blkno = be32_to_cpu(node->btree[blk->index].before);
+			blkno = be32_to_cpu(btree[blk->index].before);
 			break;
 		} else if (!forward && (blk->index > 0)) {
 			blk->index--;
-			blkno = be32_to_cpu(node->btree[blk->index].before);
+			blkno = be32_to_cpu(btree[blk->index].before);
 			break;
 		}
 	}
@@ -1557,45 +1909,60 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
 		 * Read the next child block.
 		 */
 		blk->blkno = blkno;
-		error = xfs_da_node_read(args->trans, args->dp, blkno, -1,
+		error = xfs_da3_node_read(args->trans, args->dp, blkno, -1,
 					&blk->bp, args->whichfork);
 		if (error)
 			return(error);
-		ASSERT(blk->bp != NULL);
 		info = blk->bp->b_addr;
 		ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
 		       info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
-		       info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
-		blk->magic = be16_to_cpu(info->magic);
-		if (blk->magic == XFS_DA_NODE_MAGIC) {
+		       info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+
+		/*
+		 * Note: we flatten the magic number to a single type so we
+		 * don't have to compare against crc/non-crc types elsewhere.
+		 */
+		switch (be16_to_cpu(info->magic)) {
+		case XFS_DA_NODE_MAGIC:
+		case XFS_DA3_NODE_MAGIC:
+			blk->magic = XFS_DA_NODE_MAGIC;
 			node = (xfs_da_intnode_t *)info;
-			blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
+			xfs_da3_node_hdr_from_disk(&nodehdr, node);
+			btree = xfs_da3_node_tree_p(node);
+			blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
 			if (forward)
 				blk->index = 0;
 			else
-				blk->index = be16_to_cpu(node->hdr.count)-1;
-			blkno = be32_to_cpu(node->btree[blk->index].before);
-		} else {
+				blk->index = nodehdr.count - 1;
+			blkno = be32_to_cpu(btree[blk->index].before);
+			break;
+		case XFS_ATTR_LEAF_MAGIC:
+		case XFS_ATTR3_LEAF_MAGIC:
+			blk->magic = XFS_ATTR_LEAF_MAGIC;
 			ASSERT(level == path->active-1);
 			blk->index = 0;
-			switch(blk->magic) {
-			case XFS_ATTR_LEAF_MAGIC:
-				blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
-								      NULL);
-				break;
-			case XFS_DIR2_LEAFN_MAGIC:
-				blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
-								       NULL);
-				break;
-			default:
-				ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
-				       blk->magic == XFS_DIR2_LEAFN_MAGIC);
-				break;
-			}
+			blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
+							      NULL);
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+		case XFS_DIR3_LEAFN_MAGIC:
+			blk->magic = XFS_DIR2_LEAFN_MAGIC;
+			ASSERT(level == path->active-1);
+			blk->index = 0;
+			blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
+							       NULL);
+			break;
+		default:
+			ASSERT(0);
+			break;
 		}
 	}
 	*result = 0;
-	return(0);
+	return 0;
 }
 
 
@@ -1782,22 +2149,36 @@ xfs_da_grow_inode(
  * a bmap btree split to do that.
  */
 STATIC int
-xfs_da_swap_lastblock(
-	xfs_da_args_t	*args,
-	xfs_dablk_t	*dead_blknop,
-	struct xfs_buf	**dead_bufp)
+xfs_da3_swap_lastblock(
+	struct xfs_da_args	*args,
+	xfs_dablk_t		*dead_blknop,
+	struct xfs_buf		**dead_bufp)
 {
-	xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno;
-	struct xfs_buf *dead_buf, *last_buf, *sib_buf, *par_buf;
-	xfs_fileoff_t lastoff;
-	xfs_inode_t *ip;
-	xfs_trans_t *tp;
-	xfs_mount_t *mp;
-	int error, w, entno, level, dead_level;
-	xfs_da_blkinfo_t *dead_info, *sib_info;
-	xfs_da_intnode_t *par_node, *dead_node;
-	xfs_dir2_leaf_t *dead_leaf2;
-	xfs_dahash_t dead_hash;
+	struct xfs_da_blkinfo	*dead_info;
+	struct xfs_da_blkinfo	*sib_info;
+	struct xfs_da_intnode	*par_node;
+	struct xfs_da_intnode	*dead_node;
+	struct xfs_dir2_leaf	*dead_leaf2;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr par_hdr;
+	struct xfs_inode	*ip;
+	struct xfs_trans	*tp;
+	struct xfs_mount	*mp;
+	struct xfs_buf		*dead_buf;
+	struct xfs_buf		*last_buf;
+	struct xfs_buf		*sib_buf;
+	struct xfs_buf		*par_buf;
+	xfs_dahash_t		dead_hash;
+	xfs_fileoff_t		lastoff;
+	xfs_dablk_t		dead_blkno;
+	xfs_dablk_t		last_blkno;
+	xfs_dablk_t		sib_blkno;
+	xfs_dablk_t		par_blkno;
+	int			error;
+	int			w;
+	int			entno;
+	int			level;
+	int			dead_level;
 
 	trace_xfs_da_swap_lastblock(args);
 
@@ -1821,7 +2202,7 @@ xfs_da_swap_lastblock(
 	 * Read the last block in the btree space.
 	 */
 	last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
-	error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w);
+	error = xfs_da3_node_read(tp, ip, last_blkno, -1, &last_buf, w);
 	if (error)
 		return error;
 	/*
@@ -1833,22 +2214,31 @@ xfs_da_swap_lastblock(
 	/*
 	 * Get values from the moved block.
 	 */
-	if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) {
+	if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	    dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+		struct xfs_dir3_icleaf_hdr leafhdr;
+		struct xfs_dir2_leaf_entry *ents;
+
 		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+		xfs_dir3_leaf_hdr_from_disk(&leafhdr, dead_leaf2);
+		ents = xfs_dir3_leaf_ents_p(dead_leaf2);
 		dead_level = 0;
-		dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval);
+		dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
 	} else {
-		ASSERT(dead_info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+		struct xfs_da3_icnode_hdr deadhdr;
+
 		dead_node = (xfs_da_intnode_t *)dead_info;
-		dead_level = be16_to_cpu(dead_node->hdr.level);
-		dead_hash = be32_to_cpu(dead_node->btree[be16_to_cpu(dead_node->hdr.count) - 1].hashval);
+		xfs_da3_node_hdr_from_disk(&deadhdr, dead_node);
+		btree = xfs_da3_node_tree_p(dead_node);
+		dead_level = deadhdr.level;
+		dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
 	}
 	sib_buf = par_buf = NULL;
 	/*
 	 * If the moved block has a left sibling, fix up the pointers.
 	 */
 	if ((sib_blkno = be32_to_cpu(dead_info->back))) {
-		error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+		error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
 		if (error)
 			goto done;
 		sib_info = sib_buf->b_addr;
@@ -1870,7 +2260,7 @@ xfs_da_swap_lastblock(
 	 * If the moved block has a right sibling, fix up the pointers.
 	 */
 	if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
-		error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+		error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
 		if (error)
 			goto done;
 		sib_info = sib_buf->b_addr;
@@ -1894,31 +2284,31 @@ xfs_da_swap_lastblock(
 	 * Walk down the tree looking for the parent of the moved block.
 	 */
 	for (;;) {
-		error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+		error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w);
 		if (error)
 			goto done;
 		par_node = par_buf->b_addr;
-		if (unlikely(par_node->hdr.info.magic !=
-		    cpu_to_be16(XFS_DA_NODE_MAGIC) ||
-		    (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) {
+		xfs_da3_node_hdr_from_disk(&par_hdr, par_node);
+		if (level >= 0 && level != par_hdr.level + 1) {
 			XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
 					 XFS_ERRLEVEL_LOW, mp);
 			error = XFS_ERROR(EFSCORRUPTED);
 			goto done;
 		}
-		level = be16_to_cpu(par_node->hdr.level);
+		level = par_hdr.level;
+		btree = xfs_da3_node_tree_p(par_node);
 		for (entno = 0;
-		     entno < be16_to_cpu(par_node->hdr.count) &&
-		     be32_to_cpu(par_node->btree[entno].hashval) < dead_hash;
+		     entno < par_hdr.count &&
+		     be32_to_cpu(btree[entno].hashval) < dead_hash;
 		     entno++)
 			continue;
-		if (unlikely(entno == be16_to_cpu(par_node->hdr.count))) {
+		if (entno == par_hdr.count) {
 			XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
 					 XFS_ERRLEVEL_LOW, mp);
 			error = XFS_ERROR(EFSCORRUPTED);
 			goto done;
 		}
-		par_blkno = be32_to_cpu(par_node->btree[entno].before);
+		par_blkno = be32_to_cpu(btree[entno].before);
 		if (level == dead_level + 1)
 			break;
 		xfs_trans_brelse(tp, par_buf);
@@ -1930,13 +2320,13 @@ xfs_da_swap_lastblock(
 	 */
 	for (;;) {
 		for (;
-		     entno < be16_to_cpu(par_node->hdr.count) &&
-		     be32_to_cpu(par_node->btree[entno].before) != last_blkno;
+		     entno < par_hdr.count &&
+		     be32_to_cpu(btree[entno].before) != last_blkno;
 		     entno++)
 			continue;
-		if (entno < be16_to_cpu(par_node->hdr.count))
+		if (entno < par_hdr.count)
 			break;
-		par_blkno = be32_to_cpu(par_node->hdr.info.forw);
+		par_blkno = par_hdr.forw;
 		xfs_trans_brelse(tp, par_buf);
 		par_buf = NULL;
 		if (unlikely(par_blkno == 0)) {
@@ -1945,27 +2335,27 @@ xfs_da_swap_lastblock(
 			error = XFS_ERROR(EFSCORRUPTED);
 			goto done;
 		}
-		error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+		error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w);
 		if (error)
 			goto done;
 		par_node = par_buf->b_addr;
-		if (unlikely(
-		    be16_to_cpu(par_node->hdr.level) != level ||
-		    par_node->hdr.info.magic != cpu_to_be16(XFS_DA_NODE_MAGIC))) {
+		xfs_da3_node_hdr_from_disk(&par_hdr, par_node);
+		if (par_hdr.level != level) {
 			XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
 					 XFS_ERRLEVEL_LOW, mp);
 			error = XFS_ERROR(EFSCORRUPTED);
 			goto done;
 		}
+		btree = xfs_da3_node_tree_p(par_node);
 		entno = 0;
 	}
 	/*
 	 * Update the parent entry pointing to the moved block.
 	 */
-	par_node->btree[entno].before = cpu_to_be32(dead_blkno);
+	btree[entno].before = cpu_to_be32(dead_blkno);
 	xfs_trans_log_buf(tp, par_buf,
-		XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before,
-				sizeof(par_node->btree[entno].before)));
+		XFS_DA_LOGRANGE(par_node, &btree[entno].before,
+				sizeof(btree[entno].before)));
 	*dead_blknop = last_blkno;
 	*dead_bufp = last_buf;
 	return 0;
@@ -2007,14 +2397,15 @@ xfs_da_shrink_inode(
 		 * Remove extents.  If we get ENOSPC for a dir we have to move
 		 * the last block to the place we want to kill.
 		 */
-		if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
-				xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
-				0, args->firstblock, args->flist,
-				&done)) == ENOSPC) {
+		error = xfs_bunmapi(tp, dp, dead_blkno, count,
+				    xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+				    0, args->firstblock, args->flist, &done);
+		if (error == ENOSPC) {
 			if (w != XFS_DATA_FORK)
 				break;
-			if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
-					&dead_buf)))
+			error = xfs_da3_swap_lastblock(args, &dead_blkno,
+						      &dead_buf);
+			if (error)
 				break;
 		} else {
 			break;
@@ -2074,7 +2465,8 @@ xfs_buf_map_from_irec(
 	ASSERT(nirecs >= 1);
 
 	if (nirecs > 1) {
-		map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP);
+		map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
+				  KM_SLEEP | KM_NOFS);
 		if (!map)
 			return ENOMEM;
 		*mapp = map;
@@ -2130,7 +2522,8 @@ xfs_dabuf_map(
 		 * Optimize the one-block case.
 		 */
 		if (nfsb != 1)
-			irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP);
+			irecs = kmem_zalloc(sizeof(irec) * nfsb,
+					    KM_SLEEP | KM_NOFS);
 
 		nirecs = nfsb;
 		error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
@@ -2279,12 +2672,21 @@ xfs_da_read_buf(
 		magic1 = be32_to_cpu(hdr->magic);
 		if (unlikely(
 		    XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
+				   (magic != XFS_DA3_NODE_MAGIC) &&
 				   (magic != XFS_ATTR_LEAF_MAGIC) &&
+				   (magic != XFS_ATTR3_LEAF_MAGIC) &&
 				   (magic != XFS_DIR2_LEAF1_MAGIC) &&
+				   (magic != XFS_DIR3_LEAF1_MAGIC) &&
 				   (magic != XFS_DIR2_LEAFN_MAGIC) &&
+				   (magic != XFS_DIR3_LEAFN_MAGIC) &&
 				   (magic1 != XFS_DIR2_BLOCK_MAGIC) &&
+				   (magic1 != XFS_DIR3_BLOCK_MAGIC) &&
 				   (magic1 != XFS_DIR2_DATA_MAGIC) &&
-				   (free->hdr.magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)),
+				   (magic1 != XFS_DIR3_DATA_MAGIC) &&
+				   (free->hdr.magic !=
+					cpu_to_be32(XFS_DIR2_FREE_MAGIC)) &&
+				   (free->hdr.magic !=
+					cpu_to_be32(XFS_DIR3_FREE_MAGIC)),
 				mp, XFS_ERRTAG_DA_READ_BUF,
 				XFS_RANDOM_DA_READ_BUF))) {
 			trace_xfs_da_btree_corrupt(bp, _RET_IP_);
@@ -2342,41 +2744,3 @@ out_free:
 		return -1;
 	return mappedbno;
 }
-
-kmem_zone_t *xfs_da_state_zone;	/* anchor for state struct zone */
-
-/*
- * Allocate a dir-state structure.
- * We don't put them on the stack since they're large.
- */
-xfs_da_state_t *
-xfs_da_state_alloc(void)
-{
-	return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
-}
-
-/*
- * Kill the altpath contents of a da-state structure.
- */
-STATIC void
-xfs_da_state_kill_altpath(xfs_da_state_t *state)
-{
-	int	i;
-
-	for (i = 0; i < state->altpath.active; i++)
-		state->altpath.blk[i].bp = NULL;
-	state->altpath.active = 0;
-}
-
-/*
- * Free a da-state structure.
- */
-void
-xfs_da_state_free(xfs_da_state_t *state)
-{
-	xfs_da_state_kill_altpath(state);
-#ifdef DEBUG
-	memset((char *)state, 0, sizeof(*state));
-#endif /* DEBUG */
-	kmem_zone_free(xfs_da_state_zone, state);
-}
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index ee5170c46ae..6fb3371c63c 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -20,7 +21,6 @@
 
 struct xfs_bmap_free;
 struct xfs_inode;
-struct xfs_mount;
 struct xfs_trans;
 struct zone;
 
@@ -47,6 +47,33 @@ typedef struct xfs_da_blkinfo {
 } xfs_da_blkinfo_t;
 
 /*
+ * CRC enabled directory structure types
+ *
+ * The headers change size for the additional verification information, but
+ * otherwise the tree layouts and contents are unchanged. Hence the da btree
+ * code can use the struct xfs_da_blkinfo for manipulating the tree links and
+ * magic numbers without modification for both v2 and v3 nodes.
+ */
+#define XFS_DA3_NODE_MAGIC	0x3ebe	/* magic number: non-leaf blocks */
+#define XFS_ATTR3_LEAF_MAGIC	0x3bee	/* magic number: attribute leaf blks */
+#define	XFS_DIR3_LEAF1_MAGIC	0x3df1	/* magic number: v2 dirlf single blks */
+#define	XFS_DIR3_LEAFN_MAGIC	0x3dff	/* magic number: v2 dirlf multi blks */
+
+struct xfs_da3_blkinfo {
+	/*
+	 * the node link manipulation code relies on the fact that the first
+	 * element of this structure is the struct xfs_da_blkinfo so it can
+	 * ignore the differences in the rest of the structures.
+	 */
+	struct xfs_da_blkinfo	hdr;
+	__be32			crc;	/* CRC of block */
+	__be64			blkno;	/* first block of the buffer */
+	__be64			lsn;	/* sequence number of last write */
+	uuid_t			uuid;	/* filesystem we belong to */
+	__be64			owner;	/* inode that owns the block */
+};
+
+/*
  * This is the structure of the root and intermediate nodes in the Btree.
  * The leaf nodes are defined above.
  *
@@ -57,19 +84,76 @@ typedef struct xfs_da_blkinfo {
  */
 #define	XFS_DA_NODE_MAXDEPTH	5	/* max depth of Btree */
 
+typedef struct xfs_da_node_hdr {
+	struct xfs_da_blkinfo	info;	/* block type, links, etc. */
+	__be16			__count; /* count of active entries */
+	__be16			__level; /* level above leaves (leaf == 0) */
+} xfs_da_node_hdr_t;
+
+struct xfs_da3_node_hdr {
+	struct xfs_da3_blkinfo	info;	/* block type, links, etc. */
+	__be16			__count; /* count of active entries */
+	__be16			__level; /* level above leaves (leaf == 0) */
+	__be32			__pad32;
+};
+
+#define XFS_DA3_NODE_CRC_OFF	(offsetof(struct xfs_da3_node_hdr, info.crc))
+
+typedef struct xfs_da_node_entry {
+	__be32	hashval;	/* hash value for this descendant */
+	__be32	before;		/* Btree block before this key */
+} xfs_da_node_entry_t;
+
 typedef struct xfs_da_intnode {
-	struct xfs_da_node_hdr {	/* constant-structure header block */
-		xfs_da_blkinfo_t info;	/* block type, links, etc. */
-		__be16	count;		/* count of active entries */
-		__be16	level;		/* level above leaves (leaf == 0) */
-	} hdr;
-	struct xfs_da_node_entry {
-		__be32	hashval;	/* hash value for this descendant */
-		__be32	before;		/* Btree block before this key */
-	} btree[1];			/* variable sized array of keys */
+	struct xfs_da_node_hdr	hdr;
+	struct xfs_da_node_entry __btree[];
 } xfs_da_intnode_t;
-typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
-typedef struct xfs_da_node_entry xfs_da_node_entry_t;
+
+struct xfs_da3_intnode {
+	struct xfs_da3_node_hdr	hdr;
+	struct xfs_da_node_entry __btree[];
+};
+
+/*
+ * In-core version of the node header to abstract the differences in the v2 and
+ * v3 disk format of the headers. Callers need to convert to/from disk format as
+ * appropriate.
+ */
+struct xfs_da3_icnode_hdr {
+	__uint32_t	forw;
+	__uint32_t	back;
+	__uint16_t	magic;
+	__uint16_t	count;
+	__uint16_t	level;
+};
+
+extern void xfs_da3_node_hdr_from_disk(struct xfs_da3_icnode_hdr *to,
+				       struct xfs_da_intnode *from);
+extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to,
+				     struct xfs_da3_icnode_hdr *from);
+
+static inline int
+xfs_da3_node_hdr_size(struct xfs_da_intnode *dap)
+{
+	if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC))
+		return sizeof(struct xfs_da3_node_hdr);
+	return sizeof(struct xfs_da_node_hdr);
+}
+
+static inline struct xfs_da_node_entry *
+xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
+{
+	if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
+		struct xfs_da3_intnode *dap3 = (struct xfs_da3_intnode *)dap;
+		return dap3->__btree;
+	}
+	return dap->__btree;
+}
+
+extern void xfs_da3_intnode_from_disk(struct xfs_da3_icnode_hdr *to,
+				      struct xfs_da_intnode *from);
+extern void xfs_da3_intnode_to_disk(struct xfs_da_intnode *to,
+				    struct xfs_da3_icnode_hdr *from);
 
 #define	XFS_LBSIZE(mp)	(mp)->m_sb.sb_blocksize
 
@@ -191,32 +275,34 @@ struct xfs_nameops {
 /*
  * Routines used for growing the Btree.
  */
-int	xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
-					 struct xfs_buf **bpp, int whichfork);
-int	xfs_da_split(xfs_da_state_t *state);
+int	xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
+			    int level, struct xfs_buf **bpp, int whichfork);
+int	xfs_da3_split(xfs_da_state_t *state);
 
 /*
  * Routines used for shrinking the Btree.
  */
-int	xfs_da_join(xfs_da_state_t *state);
-void	xfs_da_fixhashpath(xfs_da_state_t *state,
-					  xfs_da_state_path_t *path_to_to_fix);
+int	xfs_da3_join(xfs_da_state_t *state);
+void	xfs_da3_fixhashpath(struct xfs_da_state *state,
+			    struct xfs_da_state_path *path_to_to_fix);
 
 /*
  * Routines used for finding things in the Btree.
  */
-int	xfs_da_node_lookup_int(xfs_da_state_t *state, int *result);
-int	xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+int	xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
+int	xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
 					 int forward, int release, int *result);
 /*
  * Utility routines.
  */
-int	xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+int	xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 				       xfs_da_state_blk_t *new_blk);
-int	xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
+int	xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
 			 xfs_dablk_t bno, xfs_daddr_t mappedbno,
 			 struct xfs_buf **bpp, int which_fork);
 
+extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
+
 /*
  * Utility routines.
  */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index a8bd26b82ec..e36445ceaf8 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,6 +24,9 @@
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -78,14 +81,14 @@ xfs_swapext(
 		goto out_put_tmp_file;
 	}
 
-	if (IS_SWAPFILE(f.file->f_path.dentry->d_inode) ||
-	    IS_SWAPFILE(tmp.file->f_path.dentry->d_inode)) {
+	if (IS_SWAPFILE(file_inode(f.file)) ||
+	    IS_SWAPFILE(file_inode(tmp.file))) {
 		error = XFS_ERROR(EINVAL);
 		goto out_put_tmp_file;
 	}
 
-	ip = XFS_I(f.file->f_path.dentry->d_inode);
-	tip = XFS_I(tmp.file->f_path.dentry->d_inode);
+	ip = XFS_I(file_inode(f.file));
+	tip = XFS_I(file_inode(tmp.file));
 
 	if (ip->i_mount != tip->i_mount) {
 		error = XFS_ERROR(EINVAL);
@@ -182,7 +185,7 @@ xfs_swap_extents_check_format(
 	 */
 	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
 		if (XFS_IFORK_BOFF(ip) &&
-		    tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+		    XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
 			return EINVAL;
 		if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
 		    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
@@ -192,9 +195,8 @@ xfs_swap_extents_check_format(
 	/* Reciprocal target->temp btree format checks */
 	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
 		if (XFS_IFORK_BOFF(tip) &&
-		    ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+		    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
 			return EINVAL;
-
 		if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
 		    XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
 			return EINVAL;
@@ -219,6 +221,14 @@ xfs_swap_extents(
 	int		taforkblks = 0;
 	__uint64_t	tmp;
 
+	/*
+	 * We have no way of updating owner information in the BMBT blocks for
+	 * each inode on CRC enabled filesystems, so to avoid corrupting the
+	 * this metadata we simply don't allow extent swaps to occur.
+	 */
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		return XFS_ERROR(EINVAL);
+
 	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
 	if (!tempifp) {
 		error = XFS_ERROR(ENOMEM);
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 1d9643b3dce..f7a0e95d197 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -19,7 +19,7 @@
 #define	__XFS_DINODE_H__
 
 #define	XFS_DINODE_MAGIC		0x494e	/* 'IN' */
-#define XFS_DINODE_GOOD_VERSION(v)	(((v) == 1 || (v) == 2))
+#define XFS_DINODE_GOOD_VERSION(v)	((v) >= 1 && (v) <= 3)
 
 typedef struct xfs_timestamp {
 	__be32		t_sec;		/* timestamp seconds */
@@ -70,11 +70,36 @@ typedef struct xfs_dinode {
 
 	/* di_next_unlinked is the only non-core field in the old dinode */
 	__be32		di_next_unlinked;/* agi unlinked list ptr */
-} __attribute__((packed)) xfs_dinode_t;
+
+	/* start of the extended dinode, writable fields */
+	__le32		di_crc;		/* CRC of the inode */
+	__be64		di_changecount;	/* number of attribute changes */
+	__be64		di_lsn;		/* flush sequence */
+	__be64		di_flags2;	/* more random flags */
+	__u8		di_pad2[16];	/* more padding for future expansion */
+
+	/* fields only written to during inode creation */
+	xfs_timestamp_t	di_crtime;	/* time created */
+	__be64		di_ino;		/* inode number */
+	uuid_t		di_uuid;	/* UUID of the filesystem */
+
+	/* structure must be padded to 64 bit alignment */
+} xfs_dinode_t;
 
 #define DI_MAX_FLUSH 0xffff
 
 /*
+ * Size of the core inode on disk.  Version 1 and 2 inodes have
+ * the same size, but version 3 has grown a few additional fields.
+ */
+static inline uint xfs_dinode_size(int version)
+{
+	if (version == 3)
+		return sizeof(struct xfs_dinode);
+	return offsetof(struct xfs_dinode, di_crc);
+}
+
+/*
  * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
  * Since the pathconf interface is signed, we use 2^31 - 1 instead.
  * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
@@ -104,11 +129,11 @@ typedef enum xfs_dinode_fmt {
 /*
  * Inode size for given fs.
  */
-#define XFS_LITINO(mp) \
-	((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode)))
+#define XFS_LITINO(mp, version) \
+	((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
 
-#define	XFS_BROOT_SIZE_ADJ	\
-	(XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
+#define XFS_BROOT_SIZE_ADJ(ip) \
+	(XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t))
 
 /*
  * Inode data & attribute fork sizes, per inode.
@@ -119,10 +144,10 @@ typedef enum xfs_dinode_fmt {
 #define XFS_DFORK_DSIZE(dip,mp) \
 	(XFS_DFORK_Q(dip) ? \
 		XFS_DFORK_BOFF(dip) : \
-		XFS_LITINO(mp))
+		XFS_LITINO(mp, (dip)->di_version))
 #define XFS_DFORK_ASIZE(dip,mp) \
 	(XFS_DFORK_Q(dip) ? \
-		XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : \
+		XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
 		0)
 #define XFS_DFORK_SIZE(dip,mp,w) \
 	((w) == XFS_DATA_FORK ? \
@@ -133,7 +158,7 @@ typedef enum xfs_dinode_fmt {
  * Return pointers to the data or attribute forks.
  */
 #define XFS_DFORK_DPTR(dip) \
-	((char *)(dip) + sizeof(struct xfs_dinode))
+	((char *)dip + xfs_dinode_size(dip->di_version))
 #define XFS_DFORK_APTR(dip)	\
 	(XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
 #define XFS_DFORK_PTR(dip,w)	\
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index b26a50f9921..8f023dee404 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -368,10 +368,8 @@ xfs_dir_removename(
 int
 xfs_readdir(
 	xfs_inode_t	*dp,
-	void		*dirent,
-	size_t		bufsize,
-	xfs_off_t	*offset,
-	filldir_t	filldir)
+	struct dir_context *ctx,
+	size_t		bufsize)
 {
 	int		rval;		/* return value */
 	int		v;		/* type-checking value */
@@ -385,14 +383,13 @@ xfs_readdir(
 	XFS_STATS_INC(xs_dir_getdents);
 
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
-		rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir);
+		rval = xfs_dir2_sf_getdents(dp, ctx);
 	else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
 		;
 	else if (v)
-		rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir);
+		rval = xfs_dir2_block_getdents(dp, ctx);
 	else
-		rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset,
-					      filldir);
+		rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
 	return rval;
 }
 
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 12afe07a91d..09aea0247d9 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -28,11 +29,13 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
+#include "xfs_buf_item.h"
 #include "xfs_dir2.h"
 #include "xfs_dir2_format.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
+#include "xfs_cksum.h"
 
 /*
  * Local function prototypes.
@@ -56,52 +59,110 @@ xfs_dir_startup(void)
 	xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
 }
 
-static void
-xfs_dir2_block_verify(
+static bool
+xfs_dir3_block_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
-	int			block_ok = 0;
-
-	block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
-	block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
-
-	if (!block_ok) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
+			return false;
+		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+			return false;
 	}
+	if (__xfs_dir3_data_check(NULL, bp))
+		return false;
+	return true;
 }
 
 static void
-xfs_dir2_block_read_verify(
+xfs_dir3_block_read_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_dir2_block_verify(bp);
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if ((xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+					  XFS_DIR3_DATA_CRC_OFF)) ||
+	    !xfs_dir3_block_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
 }
 
 static void
-xfs_dir2_block_write_verify(
+xfs_dir3_block_write_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_dir2_block_verify(bp);
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (!xfs_dir3_block_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF);
 }
 
-const struct xfs_buf_ops xfs_dir2_block_buf_ops = {
-	.verify_read = xfs_dir2_block_read_verify,
-	.verify_write = xfs_dir2_block_write_verify,
+const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+	.verify_read = xfs_dir3_block_read_verify,
+	.verify_write = xfs_dir3_block_write_verify,
 };
 
 static int
-xfs_dir2_block_read(
+xfs_dir3_block_read(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*dp,
 	struct xfs_buf		**bpp)
 {
 	struct xfs_mount	*mp = dp->i_mount;
+	int			err;
 
-	return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
-				XFS_DATA_FORK, &xfs_dir2_block_buf_ops);
+	err = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
+				XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+	return err;
+}
+
+static void
+xfs_dir3_block_init(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	struct xfs_inode	*dp)
+{
+	struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+	bp->b_ops = &xfs_dir3_block_buf_ops;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		memset(hdr3, 0, sizeof(*hdr3));
+		hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+		hdr3->blkno = cpu_to_be64(bp->b_bn);
+		hdr3->owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+		return;
+
+	}
+	hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
 }
 
 static void
@@ -121,7 +182,7 @@ xfs_dir2_block_need_space(
 	struct xfs_dir2_data_unused	*enddup = NULL;
 
 	*compact = 0;
-	bf = hdr->bestfree;
+	bf = xfs_dir3_data_bestfree_p(hdr);
 
 	/*
 	 * If there are stale entries we'll use one for the leaf.
@@ -303,7 +364,7 @@ xfs_dir2_block_addname(
 	mp = dp->i_mount;
 
 	/* Read the (one and only) directory block into bp. */
-	error = xfs_dir2_block_read(tp, dp, &bp);
+	error = xfs_dir3_block_read(tp, dp, &bp);
 	if (error)
 		return error;
 
@@ -498,7 +559,7 @@ xfs_dir2_block_addname(
 		xfs_dir2_data_log_header(tp, bp);
 	xfs_dir2_block_log_tail(tp, bp);
 	xfs_dir2_data_log_entry(tp, bp, dep);
-	xfs_dir2_data_check(dp, bp);
+	xfs_dir3_data_check(dp, bp);
 	return 0;
 }
 
@@ -508,9 +569,7 @@ xfs_dir2_block_addname(
 int						/* error */
 xfs_dir2_block_getdents(
 	xfs_inode_t		*dp,		/* incore inode */
-	void			*dirent,
-	xfs_off_t		*offset,
-	filldir_t		filldir)
+	struct dir_context	*ctx)
 {
 	xfs_dir2_data_hdr_t	*hdr;		/* block header */
 	struct xfs_buf		*bp;		/* buffer for block */
@@ -528,10 +587,10 @@ xfs_dir2_block_getdents(
 	/*
 	 * If the block number in the offset is out of range, we're done.
 	 */
-	if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
+	if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
 		return 0;
 
-	error = xfs_dir2_block_read(NULL, dp, &bp);
+	error = xfs_dir3_block_read(NULL, dp, &bp);
 	if (error)
 		return error;
 
@@ -539,14 +598,14 @@ xfs_dir2_block_getdents(
 	 * Extract the byte offset we start at from the seek pointer.
 	 * We'll skip entries before this.
 	 */
-	wantoff = xfs_dir2_dataptr_to_off(mp, *offset);
+	wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
 	hdr = bp->b_addr;
-	xfs_dir2_data_check(dp, bp);
+	xfs_dir3_data_check(dp, bp);
 	/*
 	 * Set up values for the loop.
 	 */
 	btp = xfs_dir2_block_tail_p(mp, hdr);
-	ptr = (char *)(hdr + 1);
+	ptr = (char *)xfs_dir3_data_entry_p(hdr);
 	endptr = (char *)xfs_dir2_block_leaf_p(btp);
 
 	/*
@@ -578,13 +637,12 @@ xfs_dir2_block_getdents(
 		cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
 					    (char *)dep - (char *)hdr);
 
+		ctx->pos = cook & 0x7fffffff;
 		/*
 		 * If it didn't fit, set the final offset to here & return.
 		 */
-		if (filldir(dirent, (char *)dep->name, dep->namelen,
-			    cook & 0x7fffffff, be64_to_cpu(dep->inumber),
-			    DT_UNKNOWN)) {
-			*offset = cook & 0x7fffffff;
+		if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
+			    be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
 			xfs_trans_brelse(NULL, bp);
 			return 0;
 		}
@@ -594,7 +652,7 @@ xfs_dir2_block_getdents(
 	 * Reached the end of the block.
 	 * Set the offset to a non-existent block 1 and return.
 	 */
-	*offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+	ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
 			0x7fffffff;
 	xfs_trans_brelse(NULL, bp);
 	return 0;
@@ -665,7 +723,7 @@ xfs_dir2_block_lookup(
 	dp = args->dp;
 	mp = dp->i_mount;
 	hdr = bp->b_addr;
-	xfs_dir2_data_check(dp, bp);
+	xfs_dir3_data_check(dp, bp);
 	btp = xfs_dir2_block_tail_p(mp, hdr);
 	blp = xfs_dir2_block_leaf_p(btp);
 	/*
@@ -711,12 +769,12 @@ xfs_dir2_block_lookup_int(
 	tp = args->trans;
 	mp = dp->i_mount;
 
-	error = xfs_dir2_block_read(tp, dp, &bp);
+	error = xfs_dir3_block_read(tp, dp, &bp);
 	if (error)
 		return error;
 
 	hdr = bp->b_addr;
-	xfs_dir2_data_check(dp, bp);
+	xfs_dir3_data_check(dp, bp);
 	btp = xfs_dir2_block_tail_p(mp, hdr);
 	blp = xfs_dir2_block_leaf_p(btp);
 	/*
@@ -853,7 +911,7 @@ xfs_dir2_block_removename(
 		xfs_dir2_data_freescan(mp, hdr, &needlog);
 	if (needlog)
 		xfs_dir2_data_log_header(tp, bp);
-	xfs_dir2_data_check(dp, bp);
+	xfs_dir3_data_check(dp, bp);
 	/*
 	 * See if the size as a shortform is good enough.
 	 */
@@ -910,7 +968,7 @@ xfs_dir2_block_replace(
 	 */
 	dep->inumber = cpu_to_be64(args->inumber);
 	xfs_dir2_data_log_entry(args->trans, bp, dep);
-	xfs_dir2_data_check(dp, bp);
+	xfs_dir3_data_check(dp, bp);
 	return 0;
 }
 
@@ -958,6 +1016,8 @@ xfs_dir2_leaf_to_block(
 	__be16			*tagp;		/* end of entry (tag) */
 	int			to;		/* block/leaf to index */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
 
 	trace_xfs_dir2_leaf_to_block(args);
 
@@ -965,8 +1025,12 @@ xfs_dir2_leaf_to_block(
 	tp = args->trans;
 	mp = dp->i_mount;
 	leaf = lbp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = xfs_dir3_leaf_ents_p(leaf);
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+
+	ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
+	       leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
 	/*
 	 * If there are data blocks other than the first one, take this
 	 * opportunity to remove trailing empty data blocks that may have
@@ -974,9 +1038,12 @@ xfs_dir2_leaf_to_block(
 	 * These will show up in the leaf bests table.
 	 */
 	while (dp->i_d.di_size > mp->m_dirblksize) {
+		int hdrsz;
+
+		hdrsz = xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
 		bestsp = xfs_dir2_leaf_bests_p(ltp);
 		if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
-		    mp->m_dirblksize - (uint)sizeof(*hdr)) {
+					    mp->m_dirblksize - hdrsz) {
 			if ((error =
 			    xfs_dir2_leaf_trim_data(args, lbp,
 				    (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
@@ -988,17 +1055,19 @@ xfs_dir2_leaf_to_block(
 	 * Read the data block if we don't already have it, give up if it fails.
 	 */
 	if (!dbp) {
-		error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
+		error = xfs_dir3_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
 		if (error)
 			return error;
 	}
 	hdr = dbp->b_addr;
-	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+
 	/*
 	 * Size of the "leaf" area in the block.
 	 */
 	size = (uint)sizeof(xfs_dir2_block_tail_t) +
-	       (uint)sizeof(*lep) * (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale));
+	       (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);
 	/*
 	 * Look at the last data entry.
 	 */
@@ -1014,8 +1083,8 @@ xfs_dir2_leaf_to_block(
 	/*
 	 * Start converting it to block form.
 	 */
-	dbp->b_ops = &xfs_dir2_block_buf_ops;
-	hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+	xfs_dir3_block_init(mp, tp, dbp, dp);
+
 	needlog = 1;
 	needscan = 0;
 	/*
@@ -1027,18 +1096,17 @@ xfs_dir2_leaf_to_block(
 	 * Initialize the block tail.
 	 */
 	btp = xfs_dir2_block_tail_p(mp, hdr);
-	btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale));
+	btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
 	btp->stale = 0;
 	xfs_dir2_block_log_tail(tp, dbp);
 	/*
 	 * Initialize the block leaf area.  We compact out stale entries.
 	 */
 	lep = xfs_dir2_block_leaf_p(btp);
-	for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) {
-		if (leaf->ents[from].address ==
-		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+	for (from = to = 0; from < leafhdr.count; from++) {
+		if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
 			continue;
-		lep[to++] = leaf->ents[from];
+		lep[to++] = ents[from];
 	}
 	ASSERT(to == be32_to_cpu(btp->count));
 	xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
@@ -1137,16 +1205,16 @@ xfs_dir2_sf_to_block(
 		return error;
 	}
 	/*
-	 * Initialize the data block.
+	 * Initialize the data block, then convert it to block format.
 	 */
-	error = xfs_dir2_data_init(args, blkno, &bp);
+	error = xfs_dir3_data_init(args, blkno, &bp);
 	if (error) {
 		kmem_free(sfp);
 		return error;
 	}
-	bp->b_ops = &xfs_dir2_block_buf_ops;
+	xfs_dir3_block_init(mp, tp, bp, dp);
 	hdr = bp->b_addr;
-	hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+
 	/*
 	 * Compute size of block "tail" area.
 	 */
@@ -1156,7 +1224,7 @@ xfs_dir2_sf_to_block(
 	 * The whole thing is initialized to free by the init routine.
 	 * Say we're using the leaf and tail area.
 	 */
-	dup = (xfs_dir2_data_unused_t *)(hdr + 1);
+	dup = xfs_dir3_data_unused_p(hdr);
 	needlog = needscan = 0;
 	xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog,
 		&needscan);
@@ -1178,8 +1246,7 @@ xfs_dir2_sf_to_block(
 	/*
 	 * Create entry for .
 	 */
-	dep = (xfs_dir2_data_entry_t *)
-	      ((char *)hdr + XFS_DIR2_DATA_DOT_OFFSET);
+	dep = xfs_dir3_data_dot_entry_p(hdr);
 	dep->inumber = cpu_to_be64(dp->i_ino);
 	dep->namelen = 1;
 	dep->name[0] = '.';
@@ -1192,8 +1259,7 @@ xfs_dir2_sf_to_block(
 	/*
 	 * Create entry for ..
 	 */
-	dep = (xfs_dir2_data_entry_t *)
-		((char *)hdr + XFS_DIR2_DATA_DOTDOT_OFFSET);
+	dep = xfs_dir3_data_dotdot_entry_p(hdr);
 	dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
 	dep->namelen = 2;
 	dep->name[0] = dep->name[1] = '.';
@@ -1203,7 +1269,7 @@ xfs_dir2_sf_to_block(
 	blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
 	blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
 				(char *)dep - (char *)hdr));
-	offset = XFS_DIR2_DATA_FIRST_OFFSET;
+	offset = xfs_dir3_data_first_offset(hdr);
 	/*
 	 * Loop over existing entries, stuff them in.
 	 */
@@ -1273,6 +1339,6 @@ xfs_dir2_sf_to_block(
 	ASSERT(needscan == 0);
 	xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
 	xfs_dir2_block_log_tail(tp, bp);
-	xfs_dir2_data_check(dp, bp);
+	xfs_dir3_data_check(dp, bp);
 	return 0;
 }
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index ffcf1774152..c2930238005 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -30,6 +31,8 @@
 #include "xfs_dir2_format.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
 
 STATIC xfs_dir2_data_free_t *
 xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
@@ -40,7 +43,7 @@ xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
  * Return 0 is the buffer is good, otherwise an error.
  */
 int
-__xfs_dir2_data_check(
+__xfs_dir3_data_check(
 	struct xfs_inode	*dp,		/* incore inode pointer */
 	struct xfs_buf		*bp)		/* data block's buffer */
 {
@@ -65,15 +68,17 @@ __xfs_dir2_data_check(
 
 	mp = bp->b_target->bt_mount;
 	hdr = bp->b_addr;
-	bf = hdr->bestfree;
-	p = (char *)(hdr + 1);
+	bf = xfs_dir3_data_bestfree_p(hdr);
+	p = (char *)xfs_dir3_data_entry_p(hdr);
 
 	switch (hdr->magic) {
+	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
 	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
 		btp = xfs_dir2_block_tail_p(mp, hdr);
 		lep = xfs_dir2_block_leaf_p(btp);
 		endp = (char *)lep;
 		break;
+	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
 	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
 		endp = (char *)hdr + mp->m_dirblksize;
 		break;
@@ -148,7 +153,8 @@ __xfs_dir2_data_check(
 					       (char *)dep - (char *)hdr);
 		count++;
 		lastfree = 0;
-		if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
+		if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+		    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
 			addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
 				(xfs_dir2_data_aoff_t)
 				((char *)dep - (char *)hdr));
@@ -168,7 +174,8 @@ __xfs_dir2_data_check(
 	 * Need to have seen all the entries and all the bestfree slots.
 	 */
 	XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
-	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
 		for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
 			if (lep[i].address ==
 			    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
@@ -185,21 +192,27 @@ __xfs_dir2_data_check(
 	return 0;
 }
 
-static void
-xfs_dir2_data_verify(
+static bool
+xfs_dir3_data_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
-	int			block_ok = 0;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
 
-	block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC);
-	block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
-
-	if (!block_ok) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+			return false;
+		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
+			return false;
 	}
+	if (__xfs_dir3_data_check(NULL, bp))
+		return false;
+	return true;
 }
 
 /*
@@ -208,7 +221,7 @@ xfs_dir2_data_verify(
  * format buffer or a data format buffer on readahead.
  */
 static void
-xfs_dir2_data_reada_verify(
+xfs_dir3_data_reada_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
@@ -216,11 +229,13 @@ xfs_dir2_data_reada_verify(
 
 	switch (hdr->magic) {
 	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
-		bp->b_ops = &xfs_dir2_block_buf_ops;
+	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+		bp->b_ops = &xfs_dir3_block_buf_ops;
 		bp->b_ops->verify_read(bp);
 		return;
 	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
-		xfs_dir2_data_verify(bp);
+	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+		xfs_dir3_data_verify(bp);
 		return;
 	default:
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
@@ -230,51 +245,80 @@ xfs_dir2_data_reada_verify(
 }
 
 static void
-xfs_dir2_data_read_verify(
+xfs_dir3_data_read_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_dir2_data_verify(bp);
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if ((xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+					  XFS_DIR3_DATA_CRC_OFF)) ||
+	    !xfs_dir3_data_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
 }
 
 static void
-xfs_dir2_data_write_verify(
+xfs_dir3_data_write_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_dir2_data_verify(bp);
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (!xfs_dir3_data_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF);
 }
 
-const struct xfs_buf_ops xfs_dir2_data_buf_ops = {
-	.verify_read = xfs_dir2_data_read_verify,
-	.verify_write = xfs_dir2_data_write_verify,
+const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+	.verify_read = xfs_dir3_data_read_verify,
+	.verify_write = xfs_dir3_data_write_verify,
 };
 
-static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = {
-	.verify_read = xfs_dir2_data_reada_verify,
-	.verify_write = xfs_dir2_data_write_verify,
+static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+	.verify_read = xfs_dir3_data_reada_verify,
+	.verify_write = xfs_dir3_data_write_verify,
 };
 
 
 int
-xfs_dir2_data_read(
+xfs_dir3_data_read(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*dp,
 	xfs_dablk_t		bno,
 	xfs_daddr_t		mapped_bno,
 	struct xfs_buf		**bpp)
 {
-	return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
-				XFS_DATA_FORK, &xfs_dir2_data_buf_ops);
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
+				XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
+	return err;
 }
 
 int
-xfs_dir2_data_readahead(
+xfs_dir3_data_readahead(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*dp,
 	xfs_dablk_t		bno,
 	xfs_daddr_t		mapped_bno)
 {
 	return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
-				XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops);
+				XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
 }
 
 /*
@@ -288,12 +332,15 @@ xfs_dir2_data_freefind(
 {
 	xfs_dir2_data_free_t	*dfp;		/* bestfree entry */
 	xfs_dir2_data_aoff_t	off;		/* offset value needed */
+	struct xfs_dir2_data_free *bf;
 #if defined(DEBUG) && defined(__KERNEL__)
 	int			matched;	/* matched the value */
 	int			seenzero;	/* saw a 0 bestfree entry */
 #endif
 
 	off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+	bf = xfs_dir3_data_bestfree_p(hdr);
+
 #if defined(DEBUG) && defined(__KERNEL__)
 	/*
 	 * Validate some consistency in the bestfree table.
@@ -301,9 +348,11 @@ xfs_dir2_data_freefind(
 	 * one we're looking for it has to be exact.
 	 */
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
-	for (dfp = &hdr->bestfree[0], seenzero = matched = 0;
-	     dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT];
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+	for (dfp = &bf[0], seenzero = matched = 0;
+	     dfp < &bf[XFS_DIR2_DATA_FD_COUNT];
 	     dfp++) {
 		if (!dfp->offset) {
 			ASSERT(!dfp->length);
@@ -319,7 +368,7 @@ xfs_dir2_data_freefind(
 		else
 			ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off);
 		ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length));
-		if (dfp > &hdr->bestfree[0])
+		if (dfp > &bf[0])
 			ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length));
 	}
 #endif
@@ -328,14 +377,12 @@ xfs_dir2_data_freefind(
 	 * it can't be there since they're sorted.
 	 */
 	if (be16_to_cpu(dup->length) <
-	    be16_to_cpu(hdr->bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length))
+	    be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
 		return NULL;
 	/*
 	 * Look at the three bestfree entries for our guy.
 	 */
-	for (dfp = &hdr->bestfree[0];
-	     dfp < &hdr->bestfree[XFS_DIR2_DATA_FD_COUNT];
-	     dfp++) {
+	for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
 		if (!dfp->offset)
 			return NULL;
 		if (be16_to_cpu(dfp->offset) == off)
@@ -359,11 +406,12 @@ xfs_dir2_data_freeinsert(
 	xfs_dir2_data_free_t	*dfp;		/* bestfree table pointer */
 	xfs_dir2_data_free_t	new;		/* new bestfree entry */
 
-#ifdef __KERNEL__
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
-#endif
-	dfp = hdr->bestfree;
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	dfp = xfs_dir3_data_bestfree_p(hdr);
 	new.length = dup->length;
 	new.offset = cpu_to_be16((char *)dup - (char *)hdr);
 
@@ -400,32 +448,36 @@ xfs_dir2_data_freeremove(
 	xfs_dir2_data_free_t	*dfp,		/* bestfree entry pointer */
 	int			*loghead)	/* out: log data header */
 {
-#ifdef __KERNEL__
+	struct xfs_dir2_data_free *bf;
+
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
-#endif
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
 	/*
 	 * It's the first entry, slide the next 2 up.
 	 */
-	if (dfp == &hdr->bestfree[0]) {
-		hdr->bestfree[0] = hdr->bestfree[1];
-		hdr->bestfree[1] = hdr->bestfree[2];
+	bf = xfs_dir3_data_bestfree_p(hdr);
+	if (dfp == &bf[0]) {
+		bf[0] = bf[1];
+		bf[1] = bf[2];
 	}
 	/*
 	 * It's the second entry, slide the 3rd entry up.
 	 */
-	else if (dfp == &hdr->bestfree[1])
-		hdr->bestfree[1] = hdr->bestfree[2];
+	else if (dfp == &bf[1])
+		bf[1] = bf[2];
 	/*
 	 * Must be the last entry.
 	 */
 	else
-		ASSERT(dfp == &hdr->bestfree[2]);
+		ASSERT(dfp == &bf[2]);
 	/*
 	 * Clear the 3rd entry, must be zero now.
 	 */
-	hdr->bestfree[2].length = 0;
-	hdr->bestfree[2].offset = 0;
+	bf[2].length = 0;
+	bf[2].offset = 0;
 	*loghead = 1;
 }
 
@@ -441,23 +493,27 @@ xfs_dir2_data_freescan(
 	xfs_dir2_block_tail_t	*btp;		/* block tail */
 	xfs_dir2_data_entry_t	*dep;		/* active data entry */
 	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
+	struct xfs_dir2_data_free *bf;
 	char			*endp;		/* end of block's data */
 	char			*p;		/* current entry pointer */
 
-#ifdef __KERNEL__
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
-#endif
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
 	/*
 	 * Start by clearing the table.
 	 */
-	memset(hdr->bestfree, 0, sizeof(hdr->bestfree));
+	bf = xfs_dir3_data_bestfree_p(hdr);
+	memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
 	*loghead = 1;
 	/*
 	 * Set up pointers.
 	 */
-	p = (char *)(hdr + 1);
-	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
+	p = (char *)xfs_dir3_data_entry_p(hdr);
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
 		btp = xfs_dir2_block_tail_p(mp, hdr);
 		endp = (char *)xfs_dir2_block_leaf_p(btp);
 	} else
@@ -493,7 +549,7 @@ xfs_dir2_data_freescan(
  * Give back the buffer for the created block.
  */
 int						/* error */
-xfs_dir2_data_init(
+xfs_dir3_data_init(
 	xfs_da_args_t		*args,		/* directory operation args */
 	xfs_dir2_db_t		blkno,		/* logical dir block number */
 	struct xfs_buf		**bpp)		/* output block buffer */
@@ -502,6 +558,7 @@ xfs_dir2_data_init(
 	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
 	xfs_inode_t		*dp;		/* incore directory inode */
 	xfs_dir2_data_unused_t	*dup;		/* unused entry pointer */
+	struct xfs_dir2_data_free *bf;
 	int			error;		/* error return value */
 	int			i;		/* bestfree index */
 	xfs_mount_t		*mp;		/* filesystem mount point */
@@ -518,27 +575,40 @@ xfs_dir2_data_init(
 		XFS_DATA_FORK);
 	if (error)
 		return error;
-	bp->b_ops = &xfs_dir2_data_buf_ops;
+	bp->b_ops = &xfs_dir3_data_buf_ops;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
 
 	/*
 	 * Initialize the header.
 	 */
 	hdr = bp->b_addr;
-	hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
-	hdr->bestfree[0].offset = cpu_to_be16(sizeof(*hdr));
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+		memset(hdr3, 0, sizeof(*hdr3));
+		hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+		hdr3->blkno = cpu_to_be64(bp->b_bn);
+		hdr3->owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+	} else
+		hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+
+	bf = xfs_dir3_data_bestfree_p(hdr);
+	bf[0].offset = cpu_to_be16(xfs_dir3_data_entry_offset(hdr));
 	for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
-		hdr->bestfree[i].length = 0;
-		hdr->bestfree[i].offset = 0;
+		bf[i].length = 0;
+		bf[i].offset = 0;
 	}
 
 	/*
 	 * Set up an unused entry for the block's body.
 	 */
-	dup = (xfs_dir2_data_unused_t *)(hdr + 1);
+	dup = xfs_dir3_data_unused_p(hdr);
 	dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
 
-	t = mp->m_dirblksize - (uint)sizeof(*hdr);
-	hdr->bestfree[0].length = cpu_to_be16(t);
+	t = mp->m_dirblksize - (uint)xfs_dir3_data_entry_offset(hdr);
+	bf[0].length = cpu_to_be16(t);
 	dup->length = cpu_to_be16(t);
 	*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
 	/*
@@ -562,7 +632,9 @@ xfs_dir2_data_log_entry(
 	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
 
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
 
 	xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr),
 		(uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) -
@@ -580,9 +652,11 @@ xfs_dir2_data_log_header(
 	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
 
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
 
-	xfs_trans_log_buf(tp, bp, 0, sizeof(*hdr) - 1);
+	xfs_trans_log_buf(tp, bp, 0, xfs_dir3_data_entry_offset(hdr) - 1);
 }
 
 /*
@@ -597,7 +671,9 @@ xfs_dir2_data_log_unused(
 	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
 
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
 
 	/*
 	 * Log the first part of the unused entry.
@@ -635,6 +711,7 @@ xfs_dir2_data_make_free(
 	xfs_dir2_data_unused_t	*newdup;	/* new unused entry */
 	xfs_dir2_data_unused_t	*postdup;	/* unused entry after us */
 	xfs_dir2_data_unused_t	*prevdup;	/* unused entry before us */
+	struct xfs_dir2_data_free *bf;
 
 	mp = tp->t_mountp;
 	hdr = bp->b_addr;
@@ -642,12 +719,14 @@ xfs_dir2_data_make_free(
 	/*
 	 * Figure out where the end of the data area is.
 	 */
-	if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC))
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	    hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC))
 		endptr = (char *)hdr + mp->m_dirblksize;
 	else {
 		xfs_dir2_block_tail_t	*btp;	/* block tail */
 
-		ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
+		ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+			hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
 		btp = xfs_dir2_block_tail_p(mp, hdr);
 		endptr = (char *)xfs_dir2_block_leaf_p(btp);
 	}
@@ -655,7 +734,7 @@ xfs_dir2_data_make_free(
 	 * If this isn't the start of the block, then back up to
 	 * the previous entry and see if it's free.
 	 */
-	if (offset > sizeof(*hdr)) {
+	if (offset > xfs_dir3_data_entry_offset(hdr)) {
 		__be16			*tagp;	/* tag just before us */
 
 		tagp = (__be16 *)((char *)hdr + offset) - 1;
@@ -681,6 +760,7 @@ xfs_dir2_data_make_free(
 	 * Previous and following entries are both free,
 	 * merge everything into a single free entry.
 	 */
+	bf = xfs_dir3_data_bestfree_p(hdr);
 	if (prevdup && postdup) {
 		xfs_dir2_data_free_t	*dfp2;	/* another bestfree pointer */
 
@@ -695,7 +775,7 @@ xfs_dir2_data_make_free(
 		 * since the third bestfree is there, there might be more
 		 * entries.
 		 */
-		needscan = (hdr->bestfree[2].length != 0);
+		needscan = (bf[2].length != 0);
 		/*
 		 * Fix up the new big freespace.
 		 */
@@ -711,10 +791,10 @@ xfs_dir2_data_make_free(
 			 * Remove entry 1 first then entry 0.
 			 */
 			ASSERT(dfp && dfp2);
-			if (dfp == &hdr->bestfree[1]) {
-				dfp = &hdr->bestfree[0];
+			if (dfp == &bf[1]) {
+				dfp = &bf[0];
 				ASSERT(dfp2 == dfp);
-				dfp2 = &hdr->bestfree[1];
+				dfp2 = &bf[1];
 			}
 			xfs_dir2_data_freeremove(hdr, dfp2, needlogp);
 			xfs_dir2_data_freeremove(hdr, dfp, needlogp);
@@ -722,7 +802,7 @@ xfs_dir2_data_make_free(
 			 * Now insert the new entry.
 			 */
 			dfp = xfs_dir2_data_freeinsert(hdr, prevdup, needlogp);
-			ASSERT(dfp == &hdr->bestfree[0]);
+			ASSERT(dfp == &bf[0]);
 			ASSERT(dfp->length == prevdup->length);
 			ASSERT(!dfp[1].length);
 			ASSERT(!dfp[2].length);
@@ -751,7 +831,7 @@ xfs_dir2_data_make_free(
 		 */
 		else {
 			needscan = be16_to_cpu(prevdup->length) >
-				   be16_to_cpu(hdr->bestfree[2].length);
+				   be16_to_cpu(bf[2].length);
 		}
 	}
 	/*
@@ -779,7 +859,7 @@ xfs_dir2_data_make_free(
 		 */
 		else {
 			needscan = be16_to_cpu(newdup->length) >
-				   be16_to_cpu(hdr->bestfree[2].length);
+				   be16_to_cpu(bf[2].length);
 		}
 	}
 	/*
@@ -818,10 +898,13 @@ xfs_dir2_data_use_free(
 	xfs_dir2_data_unused_t	*newdup;	/* new unused entry */
 	xfs_dir2_data_unused_t	*newdup2;	/* another new unused entry */
 	int			oldlen;		/* old unused entry's length */
+	struct xfs_dir2_data_free *bf;
 
 	hdr = bp->b_addr;
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
-	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC));
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
 	ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
 	ASSERT(offset >= (char *)dup - (char *)hdr);
 	ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr);
@@ -831,7 +914,8 @@ xfs_dir2_data_use_free(
 	 */
 	dfp = xfs_dir2_data_freefind(hdr, dup);
 	oldlen = be16_to_cpu(dup->length);
-	ASSERT(dfp || oldlen <= be16_to_cpu(hdr->bestfree[2].length));
+	bf = xfs_dir3_data_bestfree_p(hdr);
+	ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
 	/*
 	 * Check for alignment with front and back of the entry.
 	 */
@@ -845,7 +929,7 @@ xfs_dir2_data_use_free(
 	 */
 	if (matchfront && matchback) {
 		if (dfp) {
-			needscan = (hdr->bestfree[2].offset != 0);
+			needscan = (bf[2].offset != 0);
 			if (!needscan)
 				xfs_dir2_data_freeremove(hdr, dfp, needlogp);
 		}
@@ -875,7 +959,7 @@ xfs_dir2_data_use_free(
 			 * that means we don't know if there was a better
 			 * choice for the last slot, or not.  Rescan.
 			 */
-			needscan = dfp == &hdr->bestfree[2];
+			needscan = dfp == &bf[2];
 		}
 	}
 	/*
@@ -902,7 +986,7 @@ xfs_dir2_data_use_free(
 			 * that means we don't know if there was a better
 			 * choice for the last slot, or not.  Rescan.
 			 */
-			needscan = dfp == &hdr->bestfree[2];
+			needscan = dfp == &bf[2];
 		}
 	}
 	/*
@@ -930,7 +1014,7 @@ xfs_dir2_data_use_free(
 		 * the 2 new will work.
 		 */
 		if (dfp) {
-			needscan = (hdr->bestfree[2].length != 0);
+			needscan = (bf[2].length != 0);
 			if (!needscan) {
 				xfs_dir2_data_freeremove(hdr, dfp, needlogp);
 				xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h
index 07270981f48..7826782b8d7 100644
--- a/fs/xfs/xfs_dir2_format.h
+++ b/fs/xfs/xfs_dir2_format.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -36,6 +37,38 @@
 #define	XFS_DIR2_FREE_MAGIC	0x58443246	/* XD2F: free index blocks */
 
 /*
+ * Directory Version 3 With CRCs.
+ *
+ * The tree formats are the same as for version 2 directories.  The difference
+ * is in the block header and dirent formats. In many cases the v3 structures
+ * use v2 definitions as they are no different and this makes code sharing much
+ * easier.
+ *
+ * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the
+ * format is v2 then they switch to the existing v2 code, or the format is v3
+ * they implement the v3 functionality. This means the existing dir2 is a mix of
+ * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called
+ * where there is a difference in the formats, otherwise the code is unchanged.
+ *
+ * Where it is possible, the code decides what to do based on the magic numbers
+ * in the blocks rather than feature bits in the superblock. This means the code
+ * is as independent of the external XFS code as possible as doesn't require
+ * passing struct xfs_mount pointers into places where it isn't really
+ * necessary.
+ *
+ * Version 3 includes:
+ *
+ *	- a larger block header for CRC and identification purposes and so the
+ *	offsets of all the structures inside the blocks are different.
+ *
+ *	- new magic numbers to be able to detect the v2/v3 types on the fly.
+ */
+
+#define	XFS_DIR3_BLOCK_MAGIC	0x58444233	/* XDB3: single block dirs */
+#define	XFS_DIR3_DATA_MAGIC	0x58444433	/* XDD3: multiblock dirs */
+#define	XFS_DIR3_FREE_MAGIC	0x58444633	/* XDF3: free index blocks */
+
+/*
  * Byte offset in data block and shortform entry.
  */
 typedef	__uint16_t	xfs_dir2_data_off_t;
@@ -195,16 +228,6 @@ xfs_dir2_sf_nextentry(struct xfs_dir2_sf_hdr *hdr,
 	xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET)
 
 /*
- * Offsets of . and .. in data space (always block 0)
- */
-#define	XFS_DIR2_DATA_DOT_OFFSET	\
-	((xfs_dir2_data_aoff_t)sizeof(struct xfs_dir2_data_hdr))
-#define	XFS_DIR2_DATA_DOTDOT_OFFSET	\
-	(XFS_DIR2_DATA_DOT_OFFSET + xfs_dir2_data_entsize(1))
-#define	XFS_DIR2_DATA_FIRST_OFFSET		\
-	(XFS_DIR2_DATA_DOTDOT_OFFSET + xfs_dir2_data_entsize(2))
-
-/*
  * Describe a free area in the data block.
  *
  * The freespace will be formatted as a xfs_dir2_data_unused_t.
@@ -226,6 +249,40 @@ typedef struct xfs_dir2_data_hdr {
 } xfs_dir2_data_hdr_t;
 
 /*
+ * define a structure for all the verification fields we are adding to the
+ * directory block structures. This will be used in several structures.
+ * The magic number must be the first entry to align with all the dir2
+ * structures so we determine how to decode them just by the magic number.
+ */
+struct xfs_dir3_blk_hdr {
+	__be32			magic;	/* magic number */
+	__be32			crc;	/* CRC of block */
+	__be64			blkno;	/* first block of the buffer */
+	__be64			lsn;	/* sequence number of last write */
+	uuid_t			uuid;	/* filesystem we belong to */
+	__be64			owner;	/* inode that owns the block */
+};
+
+struct xfs_dir3_data_hdr {
+	struct xfs_dir3_blk_hdr	hdr;
+	xfs_dir2_data_free_t	best_free[XFS_DIR2_DATA_FD_COUNT];
+	__be32			pad;	/* 64 bit alignment */
+};
+
+#define XFS_DIR3_DATA_CRC_OFF  offsetof(struct xfs_dir3_data_hdr, hdr.crc)
+
+static inline struct xfs_dir2_data_free *
+xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+	if (hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+		struct xfs_dir3_data_hdr *hdr3 = (struct xfs_dir3_data_hdr *)hdr;
+		return hdr3->best_free;
+	}
+	return hdr->bestfree;
+}
+
+/*
  * Active entry in a data block.
  *
  * Aligned to 8 bytes.  After the variable length name field there is a
@@ -280,6 +337,94 @@ xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
 			be16_to_cpu(dup->length) - sizeof(__be16));
 }
 
+static inline size_t
+xfs_dir3_data_hdr_size(bool dir3)
+{
+	if (dir3)
+		return sizeof(struct xfs_dir3_data_hdr);
+	return sizeof(struct xfs_dir2_data_hdr);
+}
+
+static inline size_t
+xfs_dir3_data_entry_offset(struct xfs_dir2_data_hdr *hdr)
+{
+	bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+		    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+	return xfs_dir3_data_hdr_size(dir3);
+}
+
+static inline struct xfs_dir2_data_entry *
+xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir3_data_entry_offset(hdr));
+}
+
+static inline struct xfs_dir2_data_unused *
+xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_unused *)
+		((char *)hdr + xfs_dir3_data_entry_offset(hdr));
+}
+
+/*
+ * Offsets of . and .. in data space (always block 0)
+ *
+ * The macros are used for shortform directories as they have no headers to read
+ * the magic number out of. Shortform directories need to know the size of the
+ * data block header because the sfe embeds the block offset of the entry into
+ * it so that it doesn't change when format conversion occurs. Bad Things Happen
+ * if we don't follow this rule.
+ */
+#define	XFS_DIR3_DATA_DOT_OFFSET(mp)	\
+	xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb))
+#define	XFS_DIR3_DATA_DOTDOT_OFFSET(mp)	\
+	(XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir2_data_entsize(1))
+#define	XFS_DIR3_DATA_FIRST_OFFSET(mp)		\
+	(XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir2_data_entsize(2))
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr)
+{
+	return xfs_dir3_data_entry_offset(hdr);
+}
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr)
+{
+	return xfs_dir3_data_dot_offset(hdr) + xfs_dir2_data_entsize(1);
+}
+
+static inline xfs_dir2_data_aoff_t
+xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr)
+{
+	return xfs_dir3_data_dotdot_offset(hdr) + xfs_dir2_data_entsize(2);
+}
+
+/*
+ * location of . and .. in data space (always block 0)
+ */
+static inline struct xfs_dir2_data_entry *
+xfs_dir3_data_dot_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir3_data_dot_offset(hdr));
+}
+
+static inline struct xfs_dir2_data_entry *
+xfs_dir3_data_dotdot_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir3_data_dotdot_offset(hdr));
+}
+
+static inline struct xfs_dir2_data_entry *
+xfs_dir3_data_first_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+	return (struct xfs_dir2_data_entry *)
+		((char *)hdr + xfs_dir3_data_first_offset(hdr));
+}
+
 /*
  * Leaf block structures.
  *
@@ -329,6 +474,21 @@ typedef struct xfs_dir2_leaf_hdr {
 	__be16			stale;		/* count of stale entries */
 } xfs_dir2_leaf_hdr_t;
 
+struct xfs_dir3_leaf_hdr {
+	struct xfs_da3_blkinfo	info;		/* header for da routines */
+	__be16			count;		/* count of entries */
+	__be16			stale;		/* count of stale entries */
+	__be32			pad;		/* 64 bit alignment */
+};
+
+struct xfs_dir3_icleaf_hdr {
+	__uint32_t		forw;
+	__uint32_t		back;
+	__uint16_t		magic;
+	__uint16_t		count;
+	__uint16_t		stale;
+};
+
 /*
  * Leaf block entry.
  */
@@ -348,23 +508,50 @@ typedef struct xfs_dir2_leaf_tail {
  * Leaf block.
  */
 typedef struct xfs_dir2_leaf {
-	xfs_dir2_leaf_hdr_t	hdr;		/* leaf header */
-	xfs_dir2_leaf_entry_t	ents[];		/* entries */
+	xfs_dir2_leaf_hdr_t	hdr;			/* leaf header */
+	xfs_dir2_leaf_entry_t	__ents[];		/* entries */
 } xfs_dir2_leaf_t;
 
-/*
- * DB blocks here are logical directory block numbers, not filesystem blocks.
- */
+struct xfs_dir3_leaf {
+	struct xfs_dir3_leaf_hdr	hdr;		/* leaf header */
+	struct xfs_dir2_leaf_entry	__ents[];	/* entries */
+};
 
-static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp)
+#define XFS_DIR3_LEAF_CRC_OFF  offsetof(struct xfs_dir3_leaf_hdr, info.crc)
+
+static inline int
+xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp)
 {
-	return (mp->m_dirblksize - (uint)sizeof(struct xfs_dir2_leaf_hdr)) /
+	if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+	    lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC))
+		return sizeof(struct xfs_dir3_leaf_hdr);
+	return sizeof(struct xfs_dir2_leaf_hdr);
+}
+
+static inline int
+xfs_dir3_max_leaf_ents(struct xfs_mount *mp, struct xfs_dir2_leaf *lp)
+{
+	return (mp->m_dirblksize - xfs_dir3_leaf_hdr_size(lp)) /
 		(uint)sizeof(struct xfs_dir2_leaf_entry);
 }
 
 /*
  * Get address of the bestcount field in the single-leaf block.
  */
+static inline struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+	if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+	    lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+		struct xfs_dir3_leaf *lp3 = (struct xfs_dir3_leaf *)lp;
+		return lp3->__ents;
+	}
+	return lp->__ents;
+}
+
+/*
+ * Get address of the bestcount field in the single-leaf block.
+ */
 static inline struct xfs_dir2_leaf_tail *
 xfs_dir2_leaf_tail_p(struct xfs_mount *mp, struct xfs_dir2_leaf *lp)
 {
@@ -383,6 +570,10 @@ xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
 }
 
 /*
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
+
+/*
  * Convert dataptr to byte in file space
  */
 static inline xfs_dir2_off_t
@@ -520,19 +711,66 @@ typedef struct xfs_dir2_free {
 						/* unused entries are -1 */
 } xfs_dir2_free_t;
 
-static inline int xfs_dir2_free_max_bests(struct xfs_mount *mp)
+struct xfs_dir3_free_hdr {
+	struct xfs_dir3_blk_hdr	hdr;
+	__be32			firstdb;	/* db of first entry */
+	__be32			nvalid;		/* count of valid entries */
+	__be32			nused;		/* count of used entries */
+	__be32			pad;		/* 64 bit alignment */
+};
+
+struct xfs_dir3_free {
+	struct xfs_dir3_free_hdr hdr;
+	__be16			bests[];	/* best free counts */
+						/* unused entries are -1 */
+};
+
+#define XFS_DIR3_FREE_CRC_OFF  offsetof(struct xfs_dir3_free, hdr.hdr.crc)
+
+/*
+ * In core version of the free block header, abstracted away from on-disk format
+ * differences. Use this in the code, and convert to/from the disk version using
+ * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
+ */
+struct xfs_dir3_icfree_hdr {
+	__uint32_t	magic;
+	__uint32_t	firstdb;
+	__uint32_t	nvalid;
+	__uint32_t	nused;
+
+};
+
+void xfs_dir3_free_hdr_from_disk(struct xfs_dir3_icfree_hdr *to,
+				 struct xfs_dir2_free *from);
+
+static inline int
+xfs_dir3_free_hdr_size(struct xfs_mount *mp)
 {
-	return (mp->m_dirblksize - sizeof(struct xfs_dir2_free_hdr)) /
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		return sizeof(struct xfs_dir3_free_hdr);
+	return sizeof(struct xfs_dir2_free_hdr);
+}
+
+static inline int
+xfs_dir3_free_max_bests(struct xfs_mount *mp)
+{
+	return (mp->m_dirblksize - xfs_dir3_free_hdr_size(mp)) /
 		sizeof(xfs_dir2_data_off_t);
 }
 
+static inline __be16 *
+xfs_dir3_free_bests_p(struct xfs_mount *mp, struct xfs_dir2_free *free)
+{
+	return (__be16 *)((char *)free + xfs_dir3_free_hdr_size(mp));
+}
+
 /*
  * Convert data space db to the corresponding free db.
  */
 static inline xfs_dir2_db_t
 xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
 {
-	return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir2_free_max_bests(mp);
+	return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir3_free_max_bests(mp);
 }
 
 /*
@@ -541,7 +779,7 @@ xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
 static inline int
 xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
 {
-	return db % xfs_dir2_free_max_bests(mp);
+	return db % xfs_dir3_free_max_bests(mp);
 }
 
 /*
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 60cd2fa4e04..2aed25cae04 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -33,97 +34,371 @@
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
 
 /*
  * Local function declarations.
  */
-#ifdef DEBUG
-static void xfs_dir2_leaf_check(struct xfs_inode *dp, struct xfs_buf *bp);
-#else
-#define	xfs_dir2_leaf_check(dp, bp)
-#endif
 static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
 				    int *indexp, struct xfs_buf **dbpp);
-static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
+static void xfs_dir3_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
 				    int first, int last);
-static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
+static void xfs_dir3_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
 
-static void
-xfs_dir2_leaf_verify(
+/*
+ * Check the internal consistency of a leaf1 block.
+ * Pop an assert if something is wrong.
+ */
+#ifdef DEBUG
+#define	xfs_dir3_leaf_check(mp, bp) \
+do { \
+	if (!xfs_dir3_leaf1_check((mp), (bp))) \
+		ASSERT(0); \
+} while (0);
+
+STATIC bool
+xfs_dir3_leaf1_check(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+
+	if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
+		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+			return false;
+	} else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
+		return false;
+
+	return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
+}
+#else
+#define	xfs_dir3_leaf_check(mp, bp)
+#endif
+
+void
+xfs_dir3_leaf_hdr_from_disk(
+	struct xfs_dir3_icleaf_hdr	*to,
+	struct xfs_dir2_leaf		*from)
+{
+	if (from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+	    from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) {
+		to->forw = be32_to_cpu(from->hdr.info.forw);
+		to->back = be32_to_cpu(from->hdr.info.back);
+		to->magic = be16_to_cpu(from->hdr.info.magic);
+		to->count = be16_to_cpu(from->hdr.count);
+		to->stale = be16_to_cpu(from->hdr.stale);
+	} else {
+		struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
+
+		to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+		to->back = be32_to_cpu(hdr3->info.hdr.back);
+		to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+		to->count = be16_to_cpu(hdr3->count);
+		to->stale = be16_to_cpu(hdr3->stale);
+	}
+
+	ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
+	       to->magic == XFS_DIR3_LEAF1_MAGIC ||
+	       to->magic == XFS_DIR2_LEAFN_MAGIC ||
+	       to->magic == XFS_DIR3_LEAFN_MAGIC);
+}
+
+void
+xfs_dir3_leaf_hdr_to_disk(
+	struct xfs_dir2_leaf		*to,
+	struct xfs_dir3_icleaf_hdr	*from)
+{
+	ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
+	       from->magic == XFS_DIR3_LEAF1_MAGIC ||
+	       from->magic == XFS_DIR2_LEAFN_MAGIC ||
+	       from->magic == XFS_DIR3_LEAFN_MAGIC);
+
+	if (from->magic == XFS_DIR2_LEAF1_MAGIC ||
+	    from->magic == XFS_DIR2_LEAFN_MAGIC) {
+		to->hdr.info.forw = cpu_to_be32(from->forw);
+		to->hdr.info.back = cpu_to_be32(from->back);
+		to->hdr.info.magic = cpu_to_be16(from->magic);
+		to->hdr.count = cpu_to_be16(from->count);
+		to->hdr.stale = cpu_to_be16(from->stale);
+	} else {
+		struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
+
+		hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+		hdr3->info.hdr.back = cpu_to_be32(from->back);
+		hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+		hdr3->count = cpu_to_be16(from->count);
+		hdr3->stale = cpu_to_be16(from->stale);
+	}
+}
+
+bool
+xfs_dir3_leaf_check_int(
+	struct xfs_mount	*mp,
+	struct xfs_dir3_icleaf_hdr *hdr,
+	struct xfs_dir2_leaf	*leaf)
+{
+	struct xfs_dir2_leaf_entry *ents;
+	xfs_dir2_leaf_tail_t	*ltp;
+	int			stale;
+	int			i;
+
+	ents = xfs_dir3_leaf_ents_p(leaf);
+	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+
+	/*
+	 * XXX (dgc): This value is not restrictive enough.
+	 * Should factor in the size of the bests table as well.
+	 * We can deduce a value for that from di_size.
+	 */
+	if (hdr->count > xfs_dir3_max_leaf_ents(mp, leaf))
+		return false;
+
+	/* Leaves and bests don't overlap in leaf format. */
+	if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+	     hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
+	    (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
+		return false;
+
+	/* Check hash value order, count stale entries.  */
+	for (i = stale = 0; i < hdr->count; i++) {
+		if (i + 1 < hdr->count) {
+			if (be32_to_cpu(ents[i].hashval) >
+					be32_to_cpu(ents[i + 1].hashval))
+				return false;
+		}
+		if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			stale++;
+	}
+	if (hdr->stale != stale)
+		return false;
+	return true;
+}
+
+static bool
+xfs_dir3_leaf_verify(
 	struct xfs_buf		*bp,
-	__be16			magic)
+	__uint16_t		magic)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_dir2_leaf_hdr *hdr = bp->b_addr;
-	int			block_ok = 0;
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+
+		if ((magic == XFS_DIR2_LEAF1_MAGIC &&
+		     leafhdr.magic != XFS_DIR3_LEAF1_MAGIC) ||
+		    (magic == XFS_DIR2_LEAFN_MAGIC &&
+		     leafhdr.magic != XFS_DIR3_LEAFN_MAGIC))
+			return false;
 
-	block_ok = hdr->info.magic == magic;
-	if (!block_ok) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+		if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (leafhdr.magic != magic)
+			return false;
+	}
+	return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
+}
+
+static void
+__read_verify(
+	struct xfs_buf  *bp,
+	__uint16_t	magic)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if ((xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+					  XFS_DIR3_LEAF_CRC_OFF)) ||
+	    !xfs_dir3_leaf_verify(bp, magic)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+}
+
+static void
+__write_verify(
+	struct xfs_buf  *bp,
+	__uint16_t	magic)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
+
+	if (!xfs_dir3_leaf_verify(bp, magic)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
 	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_LEAF_CRC_OFF);
 }
 
 static void
-xfs_dir2_leaf1_read_verify(
+xfs_dir3_leaf1_read_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+	__read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
 }
 
 static void
-xfs_dir2_leaf1_write_verify(
+xfs_dir3_leaf1_write_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+	__write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
 }
 
-void
-xfs_dir2_leafn_read_verify(
+static void
+xfs_dir3_leafn_read_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+	__read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
 }
 
-void
-xfs_dir2_leafn_write_verify(
+static void
+xfs_dir3_leafn_write_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+	__write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
 }
 
-static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = {
-	.verify_read = xfs_dir2_leaf1_read_verify,
-	.verify_write = xfs_dir2_leaf1_write_verify,
+const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+	.verify_read = xfs_dir3_leaf1_read_verify,
+	.verify_write = xfs_dir3_leaf1_write_verify,
 };
 
-const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = {
-	.verify_read = xfs_dir2_leafn_read_verify,
-	.verify_write = xfs_dir2_leafn_write_verify,
+const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+	.verify_read = xfs_dir3_leafn_read_verify,
+	.verify_write = xfs_dir3_leafn_write_verify,
 };
 
 static int
-xfs_dir2_leaf_read(
+xfs_dir3_leaf_read(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*dp,
 	xfs_dablk_t		fbno,
 	xfs_daddr_t		mappedbno,
 	struct xfs_buf		**bpp)
 {
-	return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-				XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops);
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+				XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
+	return err;
 }
 
 int
-xfs_dir2_leafn_read(
+xfs_dir3_leafn_read(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*dp,
 	xfs_dablk_t		fbno,
 	xfs_daddr_t		mappedbno,
 	struct xfs_buf		**bpp)
 {
-	return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-				XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops);
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+				XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
+	return err;
+}
+
+/*
+ * Initialize a new leaf block, leaf1 or leafn magic accepted.
+ */
+static void
+xfs_dir3_leaf_init(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	xfs_ino_t		owner,
+	__uint16_t		type)
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+
+	ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+
+		memset(leaf3, 0, sizeof(*leaf3));
+
+		leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
+					 ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
+					 : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+		leaf3->info.blkno = cpu_to_be64(bp->b_bn);
+		leaf3->info.owner = cpu_to_be64(owner);
+		uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
+	} else {
+		memset(leaf, 0, sizeof(*leaf));
+		leaf->hdr.info.magic = cpu_to_be16(type);
+	}
+
+	/*
+	 * If it's a leaf-format directory initialize the tail.
+	 * Caller is responsible for initialising the bests table.
+	 */
+	if (type == XFS_DIR2_LEAF1_MAGIC) {
+		struct xfs_dir2_leaf_tail *ltp;
+
+		ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+		ltp->bestcount = 0;
+		bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
+	} else {
+		bp->b_ops = &xfs_dir3_leafn_buf_ops;
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+	}
+}
+
+int
+xfs_dir3_leaf_get_buf(
+	xfs_da_args_t		*args,
+	xfs_dir2_db_t		bno,
+	struct xfs_buf		**bpp,
+	__uint16_t		magic)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*bp;
+	int			error;
+
+	ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) &&
+	       bno < XFS_DIR2_FREE_FIRSTDB(mp));
+
+	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
+			       XFS_DATA_FORK);
+	if (error)
+		return error;
+
+	xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
+	xfs_dir3_leaf_log_header(tp, bp);
+	if (magic == XFS_DIR2_LEAF1_MAGIC)
+		xfs_dir3_leaf_log_tail(tp, bp);
+	*bpp = bp;
+	return 0;
 }
 
 /*
@@ -149,6 +424,9 @@ xfs_dir2_block_to_leaf(
 	int			needlog;	/* need to log block header */
 	int			needscan;	/* need to rescan bestfree */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_data_free *bf;
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
 
 	trace_xfs_dir2_block_to_leaf(args);
 
@@ -168,26 +446,33 @@ xfs_dir2_block_to_leaf(
 	/*
 	 * Initialize the leaf block, get a buffer for it.
 	 */
-	if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) {
+	error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC);
+	if (error)
 		return error;
-	}
-	ASSERT(lbp != NULL);
+
 	leaf = lbp->b_addr;
 	hdr = dbp->b_addr;
-	xfs_dir2_data_check(dp, dbp);
+	xfs_dir3_data_check(dp, dbp);
 	btp = xfs_dir2_block_tail_p(mp, hdr);
 	blp = xfs_dir2_block_leaf_p(btp);
+	bf = xfs_dir3_data_bestfree_p(hdr);
+	ents = xfs_dir3_leaf_ents_p(leaf);
+
 	/*
 	 * Set the counts in the leaf header.
 	 */
-	leaf->hdr.count = cpu_to_be16(be32_to_cpu(btp->count));
-	leaf->hdr.stale = cpu_to_be16(be32_to_cpu(btp->stale));
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	leafhdr.count = be32_to_cpu(btp->count);
+	leafhdr.stale = be32_to_cpu(btp->stale);
+	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(tp, lbp);
+
 	/*
 	 * Could compact these but I think we always do the conversion
 	 * after squeezing out stale entries.
 	 */
-	memcpy(leaf->ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
-	xfs_dir2_leaf_log_ents(tp, lbp, 0, be16_to_cpu(leaf->hdr.count) - 1);
+	memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
+	xfs_dir3_leaf_log_ents(tp, lbp, 0, leafhdr.count - 1);
 	needscan = 0;
 	needlog = 1;
 	/*
@@ -202,8 +487,13 @@ xfs_dir2_block_to_leaf(
 	/*
 	 * Fix up the block header, make it a data block.
 	 */
-	dbp->b_ops = &xfs_dir2_data_buf_ops;
-	hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+	dbp->b_ops = &xfs_dir3_data_buf_ops;
+	xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF);
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+		hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+	else
+		hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+
 	if (needscan)
 		xfs_dir2_data_freescan(mp, hdr, &needlog);
 	/*
@@ -212,21 +502,22 @@ xfs_dir2_block_to_leaf(
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
 	ltp->bestcount = cpu_to_be32(1);
 	bestsp = xfs_dir2_leaf_bests_p(ltp);
-	bestsp[0] =  hdr->bestfree[0].length;
+	bestsp[0] =  bf[0].length;
 	/*
 	 * Log the data header and leaf bests table.
 	 */
 	if (needlog)
 		xfs_dir2_data_log_header(tp, dbp);
-	xfs_dir2_leaf_check(dp, lbp);
-	xfs_dir2_data_check(dp, dbp);
-	xfs_dir2_leaf_log_bests(tp, lbp, 0, 0);
+	xfs_dir3_leaf_check(mp, lbp);
+	xfs_dir3_data_check(dp, dbp);
+	xfs_dir3_leaf_log_bests(tp, lbp, 0, 0);
 	return 0;
 }
 
 STATIC void
-xfs_dir2_leaf_find_stale(
-	struct xfs_dir2_leaf	*leaf,
+xfs_dir3_leaf_find_stale(
+	struct xfs_dir3_icleaf_hdr *leafhdr,
+	struct xfs_dir2_leaf_entry *ents,
 	int			index,
 	int			*lowstale,
 	int			*highstale)
@@ -235,7 +526,7 @@ xfs_dir2_leaf_find_stale(
 	 * Find the first stale entry before our index, if any.
 	 */
 	for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) {
-		if (leaf->ents[*lowstale].address ==
+		if (ents[*lowstale].address ==
 		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
 			break;
 	}
@@ -245,10 +536,8 @@ xfs_dir2_leaf_find_stale(
 	 * Stop if the result would require moving more entries than using
 	 * lowstale.
 	 */
-	for (*highstale = index;
-	     *highstale < be16_to_cpu(leaf->hdr.count);
-	     ++*highstale) {
-		if (leaf->ents[*highstale].address ==
+	for (*highstale = index; *highstale < leafhdr->count; ++*highstale) {
+		if (ents[*highstale].address ==
 		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
 			break;
 		if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
@@ -257,8 +546,9 @@ xfs_dir2_leaf_find_stale(
 }
 
 struct xfs_dir2_leaf_entry *
-xfs_dir2_leaf_find_entry(
-	xfs_dir2_leaf_t		*leaf,		/* leaf structure */
+xfs_dir3_leaf_find_entry(
+	struct xfs_dir3_icleaf_hdr *leafhdr,
+	struct xfs_dir2_leaf_entry *ents,
 	int			index,		/* leaf table position */
 	int			compact,	/* need to compact leaves */
 	int			lowstale,	/* index of prev stale leaf */
@@ -266,7 +556,7 @@ xfs_dir2_leaf_find_entry(
 	int			*lfloglow,	/* low leaf logging index */
 	int			*lfloghigh)	/* high leaf logging index */
 {
-	if (!leaf->hdr.stale) {
+	if (!leafhdr->stale) {
 		xfs_dir2_leaf_entry_t	*lep;	/* leaf entry table pointer */
 
 		/*
@@ -274,18 +564,16 @@ xfs_dir2_leaf_find_entry(
 		 *
 		 * If there are no stale entries, just insert a hole at index.
 		 */
-		lep = &leaf->ents[index];
-		if (index < be16_to_cpu(leaf->hdr.count))
+		lep = &ents[index];
+		if (index < leafhdr->count)
 			memmove(lep + 1, lep,
-				(be16_to_cpu(leaf->hdr.count) - index) *
-				 sizeof(*lep));
+				(leafhdr->count - index) * sizeof(*lep));
 
 		/*
 		 * Record low and high logging indices for the leaf.
 		 */
 		*lfloglow = index;
-		*lfloghigh = be16_to_cpu(leaf->hdr.count);
-		be16_add_cpu(&leaf->hdr.count, 1);
+		*lfloghigh = leafhdr->count++;
 		return lep;
 	}
 
@@ -299,16 +587,17 @@ xfs_dir2_leaf_find_entry(
 	 * entries before and after our insertion point.
 	 */
 	if (compact == 0)
-		xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale);
+		xfs_dir3_leaf_find_stale(leafhdr, ents, index,
+					 &lowstale, &highstale);
 
 	/*
 	 * If the low one is better, use it.
 	 */
 	if (lowstale >= 0 &&
-	    (highstale == be16_to_cpu(leaf->hdr.count) ||
+	    (highstale == leafhdr->count ||
 	     index - lowstale - 1 < highstale - index)) {
 		ASSERT(index - lowstale - 1 >= 0);
-		ASSERT(leaf->ents[lowstale].address ==
+		ASSERT(ents[lowstale].address ==
 		       cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
 
 		/*
@@ -316,37 +605,34 @@ xfs_dir2_leaf_find_entry(
 		 * for the new entry.
 		 */
 		if (index - lowstale - 1 > 0) {
-			memmove(&leaf->ents[lowstale],
-				&leaf->ents[lowstale + 1],
+			memmove(&ents[lowstale], &ents[lowstale + 1],
 				(index - lowstale - 1) *
-				sizeof(xfs_dir2_leaf_entry_t));
+					sizeof(xfs_dir2_leaf_entry_t));
 		}
 		*lfloglow = MIN(lowstale, *lfloglow);
 		*lfloghigh = MAX(index - 1, *lfloghigh);
-		be16_add_cpu(&leaf->hdr.stale, -1);
-		return &leaf->ents[index - 1];
+		leafhdr->stale--;
+		return &ents[index - 1];
 	}
 
 	/*
 	 * The high one is better, so use that one.
 	 */
 	ASSERT(highstale - index >= 0);
-	ASSERT(leaf->ents[highstale].address ==
-	       cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+	ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
 
 	/*
 	 * Copy entries down to cover the stale entry and make room for the
 	 * new entry.
 	 */
 	if (highstale - index > 0) {
-		memmove(&leaf->ents[index + 1],
-			&leaf->ents[index],
+		memmove(&ents[index + 1], &ents[index],
 			(highstale - index) * sizeof(xfs_dir2_leaf_entry_t));
 	}
 	*lfloglow = MIN(index, *lfloglow);
 	*lfloghigh = MAX(highstale, *lfloghigh);
-	be16_add_cpu(&leaf->hdr.stale, -1);
-	return &leaf->ents[index];
+	leafhdr->stale--;
+	return &ents[index];
 }
 
 /*
@@ -383,6 +669,9 @@ xfs_dir2_leaf_addname(
 	__be16			*tagp;		/* end of data entry */
 	xfs_trans_t		*tp;		/* transaction pointer */
 	xfs_dir2_db_t		use_block;	/* data block number */
+	struct xfs_dir2_data_free *bf;		/* bestfree table */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
 
 	trace_xfs_dir2_leaf_addname(args);
 
@@ -390,7 +679,7 @@ xfs_dir2_leaf_addname(
 	tp = args->trans;
 	mp = dp->i_mount;
 
-	error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
+	error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
 	if (error)
 		return error;
 
@@ -403,16 +692,19 @@ xfs_dir2_leaf_addname(
 	index = xfs_dir2_leaf_search_hash(args, lbp);
 	leaf = lbp->b_addr;
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+	ents = xfs_dir3_leaf_ents_p(leaf);
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
 	bestsp = xfs_dir2_leaf_bests_p(ltp);
 	length = xfs_dir2_data_entsize(args->namelen);
+
 	/*
 	 * See if there are any entries with the same hash value
 	 * and space in their block for the new entry.
 	 * This is good because it puts multiple same-hash value entries
 	 * in a data block, improving the lookup of those entries.
 	 */
-	for (use_block = -1, lep = &leaf->ents[index];
-	     index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval;
+	for (use_block = -1, lep = &ents[index];
+	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
 	     index++, lep++) {
 		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
 			continue;
@@ -445,7 +737,7 @@ xfs_dir2_leaf_addname(
 	 * How many bytes do we need in the leaf block?
 	 */
 	needbytes = 0;
-	if (!leaf->hdr.stale)
+	if (!leafhdr.stale)
 		needbytes += sizeof(xfs_dir2_leaf_entry_t);
 	if (use_block == -1)
 		needbytes += sizeof(xfs_dir2_data_off_t);
@@ -460,16 +752,15 @@ xfs_dir2_leaf_addname(
 	 * If we don't have enough free bytes but we can make enough
 	 * by compacting out stale entries, we'll do that.
 	 */
-	if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <
-				needbytes && be16_to_cpu(leaf->hdr.stale) > 1) {
+	if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes &&
+	    leafhdr.stale > 1)
 		compact = 1;
-	}
+
 	/*
 	 * Otherwise if we don't have enough free bytes we need to
 	 * convert to node form.
 	 */
-	else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(
-						leaf->hdr.count)] < needbytes) {
+	else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) {
 		/*
 		 * Just checking or no space reservation, give up.
 		 */
@@ -517,15 +808,15 @@ xfs_dir2_leaf_addname(
 	 * point later.
 	 */
 	if (compact) {
-		xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale,
-			&lfloglow, &lfloghigh);
+		xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+			&highstale, &lfloglow, &lfloghigh);
 	}
 	/*
 	 * There are stale entries, so we'll need log-low and log-high
 	 * impossibly bad values later.
 	 */
-	else if (be16_to_cpu(leaf->hdr.stale)) {
-		lfloglow = be16_to_cpu(leaf->hdr.count);
+	else if (leafhdr.stale) {
+		lfloglow = leafhdr.count;
 		lfloghigh = -1;
 	}
 	/*
@@ -544,7 +835,7 @@ xfs_dir2_leaf_addname(
 		/*
 		 * Initialize the block.
 		 */
-		if ((error = xfs_dir2_data_init(args, use_block, &dbp))) {
+		if ((error = xfs_dir3_data_init(args, use_block, &dbp))) {
 			xfs_trans_brelse(tp, lbp);
 			return error;
 		}
@@ -557,23 +848,24 @@ xfs_dir2_leaf_addname(
 			memmove(&bestsp[0], &bestsp[1],
 				be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
 			be32_add_cpu(&ltp->bestcount, 1);
-			xfs_dir2_leaf_log_tail(tp, lbp);
-			xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+			xfs_dir3_leaf_log_tail(tp, lbp);
+			xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
 		}
 		/*
 		 * If we're filling in a previously empty block just log it.
 		 */
 		else
-			xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+			xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block);
 		hdr = dbp->b_addr;
-		bestsp[use_block] = hdr->bestfree[0].length;
+		bf = xfs_dir3_data_bestfree_p(hdr);
+		bestsp[use_block] = bf[0].length;
 		grown = 1;
 	} else {
 		/*
 		 * Already had space in some data block.
 		 * Just read that one in.
 		 */
-		error = xfs_dir2_data_read(tp, dp,
+		error = xfs_dir3_data_read(tp, dp,
 					   xfs_dir2_db_to_da(mp, use_block),
 					   -1, &dbp);
 		if (error) {
@@ -581,13 +873,14 @@ xfs_dir2_leaf_addname(
 			return error;
 		}
 		hdr = dbp->b_addr;
+		bf = xfs_dir3_data_bestfree_p(hdr);
 		grown = 0;
 	}
 	/*
 	 * Point to the biggest freespace in our data block.
 	 */
 	dup = (xfs_dir2_data_unused_t *)
-	      ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset));
+	      ((char *)hdr + be16_to_cpu(bf[0].offset));
 	ASSERT(be16_to_cpu(dup->length) >= length);
 	needscan = needlog = 0;
 	/*
@@ -620,13 +913,13 @@ xfs_dir2_leaf_addname(
 	 * If the bests table needs to be changed, do it.
 	 * Log the change unless we've already done that.
 	 */
-	if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(hdr->bestfree[0].length)) {
-		bestsp[use_block] = hdr->bestfree[0].length;
+	if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
+		bestsp[use_block] = bf[0].length;
 		if (!grown)
-			xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+			xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block);
 	}
 
-	lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale,
+	lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
 				       highstale, &lfloglow, &lfloghigh);
 
 	/*
@@ -638,82 +931,40 @@ xfs_dir2_leaf_addname(
 	/*
 	 * Log the leaf fields and give up the buffers.
 	 */
-	xfs_dir2_leaf_log_header(tp, lbp);
-	xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh);
-	xfs_dir2_leaf_check(dp, lbp);
-	xfs_dir2_data_check(dp, dbp);
+	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(tp, lbp);
+	xfs_dir3_leaf_log_ents(tp, lbp, lfloglow, lfloghigh);
+	xfs_dir3_leaf_check(mp, lbp);
+	xfs_dir3_data_check(dp, dbp);
 	return 0;
 }
 
-#ifdef DEBUG
-/*
- * Check the internal consistency of a leaf1 block.
- * Pop an assert if something is wrong.
- */
-STATIC void
-xfs_dir2_leaf_check(
-	struct xfs_inode	*dp,		/* incore directory inode */
-	struct xfs_buf		*bp)		/* leaf's buffer */
-{
-	int			i;		/* leaf index */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail pointer */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	int			stale;		/* count of stale leaves */
-
-	leaf = bp->b_addr;
-	mp = dp->i_mount;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
-	/*
-	 * This value is not restrictive enough.
-	 * Should factor in the size of the bests table as well.
-	 * We can deduce a value for that from di_size.
-	 */
-	ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp));
-	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
-	/*
-	 * Leaves and bests don't overlap.
-	 */
-	ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <=
-	       (char *)xfs_dir2_leaf_bests_p(ltp));
-	/*
-	 * Check hash value order, count stale entries.
-	 */
-	for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) {
-		if (i + 1 < be16_to_cpu(leaf->hdr.count))
-			ASSERT(be32_to_cpu(leaf->ents[i].hashval) <=
-			       be32_to_cpu(leaf->ents[i + 1].hashval));
-		if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-			stale++;
-	}
-	ASSERT(be16_to_cpu(leaf->hdr.stale) == stale);
-}
-#endif	/* DEBUG */
-
 /*
  * Compact out any stale entries in the leaf.
  * Log the header and changed leaf entries, if any.
  */
 void
-xfs_dir2_leaf_compact(
+xfs_dir3_leaf_compact(
 	xfs_da_args_t	*args,		/* operation arguments */
+	struct xfs_dir3_icleaf_hdr *leafhdr,
 	struct xfs_buf	*bp)		/* leaf buffer */
 {
 	int		from;		/* source leaf index */
 	xfs_dir2_leaf_t	*leaf;		/* leaf structure */
 	int		loglow;		/* first leaf entry to log */
 	int		to;		/* target leaf index */
+	struct xfs_dir2_leaf_entry *ents;
 
 	leaf = bp->b_addr;
-	if (!leaf->hdr.stale) {
+	if (!leafhdr->stale)
 		return;
-	}
+
 	/*
 	 * Compress out the stale entries in place.
 	 */
-	for (from = to = 0, loglow = -1; from < be16_to_cpu(leaf->hdr.count); from++) {
-		if (leaf->ents[from].address ==
-		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+	ents = xfs_dir3_leaf_ents_p(leaf);
+	for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
+		if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
 			continue;
 		/*
 		 * Only actually copy the entries that are different.
@@ -721,19 +972,21 @@ xfs_dir2_leaf_compact(
 		if (from > to) {
 			if (loglow == -1)
 				loglow = to;
-			leaf->ents[to] = leaf->ents[from];
+			ents[to] = ents[from];
 		}
 		to++;
 	}
 	/*
 	 * Update and log the header, log the leaf entries.
 	 */
-	ASSERT(be16_to_cpu(leaf->hdr.stale) == from - to);
-	be16_add_cpu(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale)));
-	leaf->hdr.stale = 0;
-	xfs_dir2_leaf_log_header(args->trans, bp);
+	ASSERT(leafhdr->stale == from - to);
+	leafhdr->count -= leafhdr->stale;
+	leafhdr->stale = 0;
+
+	xfs_dir3_leaf_hdr_to_disk(leaf, leafhdr);
+	xfs_dir3_leaf_log_header(args->trans, bp);
 	if (loglow != -1)
-		xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1);
+		xfs_dir3_leaf_log_ents(args->trans, bp, loglow, to - 1);
 }
 
 /*
@@ -745,8 +998,9 @@ xfs_dir2_leaf_compact(
  * and leaf logging indices.
  */
 void
-xfs_dir2_leaf_compact_x1(
-	struct xfs_buf	*bp,		/* leaf buffer */
+xfs_dir3_leaf_compact_x1(
+	struct xfs_dir3_icleaf_hdr *leafhdr,
+	struct xfs_dir2_leaf_entry *ents,
 	int		*indexp,	/* insertion index */
 	int		*lowstalep,	/* out: stale entry before us */
 	int		*highstalep,	/* out: stale entry after us */
@@ -757,22 +1011,20 @@ xfs_dir2_leaf_compact_x1(
 	int		highstale;	/* stale entry at/after index */
 	int		index;		/* insertion index */
 	int		keepstale;	/* source index of kept stale */
-	xfs_dir2_leaf_t	*leaf;		/* leaf structure */
 	int		lowstale;	/* stale entry before index */
 	int		newindex=0;	/* new insertion index */
 	int		to;		/* destination copy index */
 
-	leaf = bp->b_addr;
-	ASSERT(be16_to_cpu(leaf->hdr.stale) > 1);
+	ASSERT(leafhdr->stale > 1);
 	index = *indexp;
 
-	xfs_dir2_leaf_find_stale(leaf, index, &lowstale, &highstale);
+	xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale);
 
 	/*
 	 * Pick the better of lowstale and highstale.
 	 */
 	if (lowstale >= 0 &&
-	    (highstale == be16_to_cpu(leaf->hdr.count) ||
+	    (highstale == leafhdr->count ||
 	     index - lowstale <= highstale - index))
 		keepstale = lowstale;
 	else
@@ -781,15 +1033,14 @@ xfs_dir2_leaf_compact_x1(
 	 * Copy the entries in place, removing all the stale entries
 	 * except keepstale.
 	 */
-	for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) {
+	for (from = to = 0; from < leafhdr->count; from++) {
 		/*
 		 * Notice the new value of index.
 		 */
 		if (index == from)
 			newindex = to;
 		if (from != keepstale &&
-		    leaf->ents[from].address ==
-		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+		    ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
 			if (from == to)
 				*lowlogp = to;
 			continue;
@@ -803,7 +1054,7 @@ xfs_dir2_leaf_compact_x1(
 		 * Copy only the entries that have moved.
 		 */
 		if (from > to)
-			leaf->ents[to] = leaf->ents[from];
+			ents[to] = ents[from];
 		to++;
 	}
 	ASSERT(from > to);
@@ -817,8 +1068,8 @@ xfs_dir2_leaf_compact_x1(
 	/*
 	 * Adjust the leaf header values.
 	 */
-	be16_add_cpu(&leaf->hdr.count, -(from - to));
-	leaf->hdr.stale = cpu_to_be16(1);
+	leafhdr->count -= from - to;
+	leafhdr->stale = 1;
 	/*
 	 * Remember the low/high stale value only in the "right"
 	 * direction.
@@ -826,8 +1077,8 @@ xfs_dir2_leaf_compact_x1(
 	if (lowstale >= newindex)
 		lowstale = -1;
 	else
-		highstale = be16_to_cpu(leaf->hdr.count);
-	*highlogp = be16_to_cpu(leaf->hdr.count) - 1;
+		highstale = leafhdr->count;
+	*highlogp = leafhdr->count - 1;
 	*lowstalep = lowstale;
 	*highstalep = highstale;
 }
@@ -857,6 +1108,7 @@ xfs_dir2_leaf_readbuf(
 	struct xfs_mount	*mp = dp->i_mount;
 	struct xfs_buf		*bp = *bpp;
 	struct xfs_bmbt_irec	*map = mip->map;
+	struct blk_plug		plug;
 	int			error = 0;
 	int			length;
 	int			i;
@@ -965,7 +1217,7 @@ xfs_dir2_leaf_readbuf(
 	 * Read the directory block starting at the first mapping.
 	 */
 	mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
-	error = xfs_dir2_data_read(NULL, dp, map->br_startoff,
+	error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
 			map->br_blockcount >= mp->m_dirblkfsbs ?
 			    XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
 
@@ -985,6 +1237,7 @@ xfs_dir2_leaf_readbuf(
 	/*
 	 * Do we need more readahead?
 	 */
+	blk_start_plug(&plug);
 	for (mip->ra_index = mip->ra_offset = i = 0;
 	     mip->ra_want > mip->ra_current && i < mip->map_blocks;
 	     i += mp->m_dirblkfsbs) {
@@ -994,7 +1247,7 @@ xfs_dir2_leaf_readbuf(
 		 */
 		if (i > mip->ra_current &&
 		    map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
-			xfs_dir2_data_readahead(NULL, dp,
+			xfs_dir3_data_readahead(NULL, dp,
 				map[mip->ra_index].br_startoff + mip->ra_offset,
 				XFS_FSB_TO_DADDR(mp,
 					map[mip->ra_index].br_startblock +
@@ -1007,7 +1260,7 @@ xfs_dir2_leaf_readbuf(
 		 * use our mapping, but this is a very rare case.
 		 */
 		else if (i > mip->ra_current) {
-			xfs_dir2_data_readahead(NULL, dp,
+			xfs_dir3_data_readahead(NULL, dp,
 					map[mip->ra_index].br_startoff +
 							mip->ra_offset, -1);
 			mip->ra_current = i;
@@ -1036,6 +1289,7 @@ xfs_dir2_leaf_readbuf(
 			}
 		}
 	}
+	blk_finish_plug(&plug);
 
 out:
 	*bpp = bp;
@@ -1049,10 +1303,8 @@ out:
 int						/* error */
 xfs_dir2_leaf_getdents(
 	xfs_inode_t		*dp,		/* incore directory inode */
-	void			*dirent,
-	size_t			bufsize,
-	xfs_off_t		*offset,
-	filldir_t		filldir)
+	struct dir_context	*ctx,
+	size_t			bufsize)
 {
 	struct xfs_buf		*bp = NULL;	/* data block buffer */
 	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
@@ -1071,7 +1323,7 @@ xfs_dir2_leaf_getdents(
 	 * If the offset is at or past the largest allowed value,
 	 * give up right away.
 	 */
-	if (*offset >= XFS_DIR2_MAX_DATAPTR)
+	if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
 		return 0;
 
 	mp = dp->i_mount;
@@ -1085,14 +1337,14 @@ xfs_dir2_leaf_getdents(
 				     mp->m_sb.sb_blocksize);
 	map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
 				(length * sizeof(struct xfs_bmbt_irec)),
-			       KM_SLEEP);
+			       KM_SLEEP | KM_NOFS);
 	map_info->map_size = length;
 
 	/*
 	 * Inside the loop we keep the main offset value as a byte offset
 	 * in the directory file.
 	 */
-	curoff = xfs_dir2_dataptr_to_byte(mp, *offset);
+	curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
 
 	/*
 	 * Force this conversion through db so we truncate the offset
@@ -1133,17 +1385,17 @@ xfs_dir2_leaf_getdents(
 				ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
 				       map_info->curdb);
 			hdr = bp->b_addr;
-			xfs_dir2_data_check(dp, bp);
+			xfs_dir3_data_check(dp, bp);
 			/*
 			 * Find our position in the block.
 			 */
-			ptr = (char *)(hdr + 1);
+			ptr = (char *)xfs_dir3_data_entry_p(hdr);
 			byteoff = xfs_dir2_byte_to_off(mp, curoff);
 			/*
 			 * Skip past the header.
 			 */
 			if (byteoff == 0)
-				curoff += (uint)sizeof(*hdr);
+				curoff += xfs_dir3_data_entry_offset(hdr);
 			/*
 			 * Skip past entries until we reach our offset.
 			 */
@@ -1193,8 +1445,8 @@ xfs_dir2_leaf_getdents(
 		dep = (xfs_dir2_data_entry_t *)ptr;
 		length = xfs_dir2_data_entsize(dep->namelen);
 
-		if (filldir(dirent, (char *)dep->name, dep->namelen,
-			    xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
+		ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+		if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
 			    be64_to_cpu(dep->inumber), DT_UNKNOWN))
 			break;
 
@@ -1211,78 +1463,21 @@ xfs_dir2_leaf_getdents(
 	 * All done.  Set output offset value to current offset.
 	 */
 	if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
-		*offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
+		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
 	else
-		*offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+		ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
 	kmem_free(map_info);
 	if (bp)
 		xfs_trans_brelse(NULL, bp);
 	return error;
 }
 
-/*
- * Initialize a new leaf block, leaf1 or leafn magic accepted.
- */
-int
-xfs_dir2_leaf_init(
-	xfs_da_args_t		*args,		/* operation arguments */
-	xfs_dir2_db_t		bno,		/* directory block number */
-	struct xfs_buf		**bpp,		/* out: leaf buffer */
-	int			magic)		/* magic number for block */
-{
-	struct xfs_buf		*bp;		/* leaf buffer */
-	xfs_inode_t		*dp;		/* incore directory inode */
-	int			error;		/* error return code */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
-	xfs_mount_t		*mp;		/* filesystem mount point */
-	xfs_trans_t		*tp;		/* transaction pointer */
-
-	dp = args->dp;
-	ASSERT(dp != NULL);
-	tp = args->trans;
-	mp = dp->i_mount;
-	ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) &&
-	       bno < XFS_DIR2_FREE_FIRSTDB(mp));
-	/*
-	 * Get the buffer for the block.
-	 */
-	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
-			       XFS_DATA_FORK);
-	if (error)
-		return error;
-
-	/*
-	 * Initialize the header.
-	 */
-	leaf = bp->b_addr;
-	leaf->hdr.info.magic = cpu_to_be16(magic);
-	leaf->hdr.info.forw = 0;
-	leaf->hdr.info.back = 0;
-	leaf->hdr.count = 0;
-	leaf->hdr.stale = 0;
-	xfs_dir2_leaf_log_header(tp, bp);
-	/*
-	 * If it's a leaf-format directory initialize the tail.
-	 * In this case our caller has the real bests table to copy into
-	 * the block.
-	 */
-	if (magic == XFS_DIR2_LEAF1_MAGIC) {
-		bp->b_ops = &xfs_dir2_leaf1_buf_ops;
-		ltp = xfs_dir2_leaf_tail_p(mp, leaf);
-		ltp->bestcount = 0;
-		xfs_dir2_leaf_log_tail(tp, bp);
-	} else
-		bp->b_ops = &xfs_dir2_leafn_buf_ops;
-	*bpp = bp;
-	return 0;
-}
 
 /*
  * Log the bests entries indicated from a leaf1 block.
  */
 static void
-xfs_dir2_leaf_log_bests(
+xfs_dir3_leaf_log_bests(
 	xfs_trans_t		*tp,		/* transaction pointer */
 	struct xfs_buf		*bp,		/* leaf buffer */
 	int			first,		/* first entry to log */
@@ -1290,11 +1485,12 @@ xfs_dir2_leaf_log_bests(
 {
 	__be16			*firstb;	/* pointer to first entry */
 	__be16			*lastb;		/* pointer to last entry */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
 	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
 
-	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
+
 	ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf);
 	firstb = xfs_dir2_leaf_bests_p(ltp) + first;
 	lastb = xfs_dir2_leaf_bests_p(ltp) + last;
@@ -1306,7 +1502,7 @@ xfs_dir2_leaf_log_bests(
  * Log the leaf entries indicated from a leaf1 or leafn block.
  */
 void
-xfs_dir2_leaf_log_ents(
+xfs_dir3_leaf_log_ents(
 	xfs_trans_t		*tp,		/* transaction pointer */
 	struct xfs_buf		*bp,		/* leaf buffer */
 	int			first,		/* first entry to log */
@@ -1314,13 +1510,17 @@ xfs_dir2_leaf_log_ents(
 {
 	xfs_dir2_leaf_entry_t	*firstlep;	/* pointer to first entry */
 	xfs_dir2_leaf_entry_t	*lastlep;	/* pointer to last entry */
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	struct xfs_dir2_leaf_entry *ents;
 
-	leaf = bp->b_addr;
 	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
-	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
-	firstlep = &leaf->ents[first];
-	lastlep = &leaf->ents[last];
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
+	ents = xfs_dir3_leaf_ents_p(leaf);
+	firstlep = &ents[first];
+	lastlep = &ents[last];
 	xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf),
 		(uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
 }
@@ -1329,34 +1529,38 @@ xfs_dir2_leaf_log_ents(
  * Log the header of the leaf1 or leafn block.
  */
 void
-xfs_dir2_leaf_log_header(
+xfs_dir3_leaf_log_header(
 	struct xfs_trans	*tp,
 	struct xfs_buf		*bp)
 {
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
 
-	leaf = bp->b_addr;
 	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
-	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
+
 	xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
-		(uint)(sizeof(leaf->hdr) - 1));
+			  xfs_dir3_leaf_hdr_size(leaf) - 1);
 }
 
 /*
  * Log the tail of the leaf1 block.
  */
 STATIC void
-xfs_dir2_leaf_log_tail(
+xfs_dir3_leaf_log_tail(
 	struct xfs_trans	*tp,
 	struct xfs_buf		*bp)
 {
-	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
 	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
-	xfs_mount_t		*mp;		/* filesystem mount point */
+	struct xfs_mount	*mp = tp->t_mountp;
+
+	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	       leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
 
-	mp = tp->t_mountp;
-	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
 	xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf),
 		(uint)(mp->m_dirblksize - 1));
@@ -1380,6 +1584,7 @@ xfs_dir2_leaf_lookup(
 	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
 	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_leaf_entry *ents;
 
 	trace_xfs_dir2_leaf_lookup(args);
 
@@ -1391,12 +1596,14 @@ xfs_dir2_leaf_lookup(
 	}
 	tp = args->trans;
 	dp = args->dp;
-	xfs_dir2_leaf_check(dp, lbp);
+	xfs_dir3_leaf_check(dp->i_mount, lbp);
 	leaf = lbp->b_addr;
+	ents = xfs_dir3_leaf_ents_p(leaf);
 	/*
 	 * Get to the leaf entry and contained data entry address.
 	 */
-	lep = &leaf->ents[index];
+	lep = &ents[index];
+
 	/*
 	 * Point to the data entry.
 	 */
@@ -1440,18 +1647,23 @@ xfs_dir2_leaf_lookup_int(
 	xfs_trans_t		*tp;		/* transaction pointer */
 	xfs_dir2_db_t		cidb = -1;	/* case match data block no. */
 	enum xfs_dacmp		cmp;		/* name compare result */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
 
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
 
-	error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
+	error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
 	if (error)
 		return error;
 
 	*lbpp = lbp;
 	leaf = lbp->b_addr;
-	xfs_dir2_leaf_check(dp, lbp);
+	xfs_dir3_leaf_check(mp, lbp);
+	ents = xfs_dir3_leaf_ents_p(leaf);
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+
 	/*
 	 * Look for the first leaf entry with our hash value.
 	 */
@@ -1460,9 +1672,9 @@ xfs_dir2_leaf_lookup_int(
 	 * Loop over all the entries with the right hash value
 	 * looking to match the name.
 	 */
-	for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
-				be32_to_cpu(lep->hashval) == args->hashval;
-				lep++, index++) {
+	for (lep = &ents[index];
+	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+	     lep++, index++) {
 		/*
 		 * Skip over stale leaf entries.
 		 */
@@ -1479,7 +1691,7 @@ xfs_dir2_leaf_lookup_int(
 		if (newdb != curdb) {
 			if (dbp)
 				xfs_trans_brelse(tp, dbp);
-			error = xfs_dir2_data_read(tp, dp,
+			error = xfs_dir3_data_read(tp, dp,
 						   xfs_dir2_db_to_da(mp, newdb),
 						   -1, &dbp);
 			if (error) {
@@ -1520,7 +1732,7 @@ xfs_dir2_leaf_lookup_int(
 		ASSERT(cidb != -1);
 		if (cidb != curdb) {
 			xfs_trans_brelse(tp, dbp);
-			error = xfs_dir2_data_read(tp, dp,
+			error = xfs_dir3_data_read(tp, dp,
 						   xfs_dir2_db_to_da(mp, cidb),
 						   -1, &dbp);
 			if (error) {
@@ -1566,6 +1778,9 @@ xfs_dir2_leaf_removename(
 	int			needscan;	/* need to rescan data frees */
 	xfs_dir2_data_off_t	oldbest;	/* old value of best free */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_data_free *bf;		/* bestfree table */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
 
 	trace_xfs_dir2_leaf_removename(args);
 
@@ -1580,16 +1795,19 @@ xfs_dir2_leaf_removename(
 	mp = dp->i_mount;
 	leaf = lbp->b_addr;
 	hdr = dbp->b_addr;
-	xfs_dir2_data_check(dp, dbp);
+	xfs_dir3_data_check(dp, dbp);
+	bf = xfs_dir3_data_bestfree_p(hdr);
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = xfs_dir3_leaf_ents_p(leaf);
 	/*
 	 * Point to the leaf entry, use that to point to the data entry.
 	 */
-	lep = &leaf->ents[index];
+	lep = &ents[index];
 	db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
 	dep = (xfs_dir2_data_entry_t *)
 	      ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
 	needscan = needlog = 0;
-	oldbest = be16_to_cpu(hdr->bestfree[0].length);
+	oldbest = be16_to_cpu(bf[0].length);
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
 	bestsp = xfs_dir2_leaf_bests_p(ltp);
 	ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
@@ -1602,10 +1820,13 @@ xfs_dir2_leaf_removename(
 	/*
 	 * We just mark the leaf entry stale by putting a null in it.
 	 */
-	be16_add_cpu(&leaf->hdr.stale, 1);
-	xfs_dir2_leaf_log_header(tp, lbp);
+	leafhdr.stale++;
+	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(tp, lbp);
+
 	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
-	xfs_dir2_leaf_log_ents(tp, lbp, index, index);
+	xfs_dir3_leaf_log_ents(tp, lbp, index, index);
+
 	/*
 	 * Scan the freespace in the data block again if necessary,
 	 * log the data block header if necessary.
@@ -1618,16 +1839,16 @@ xfs_dir2_leaf_removename(
 	 * If the longest freespace in the data block has changed,
 	 * put the new value in the bests table and log that.
 	 */
-	if (be16_to_cpu(hdr->bestfree[0].length) != oldbest) {
-		bestsp[db] = hdr->bestfree[0].length;
-		xfs_dir2_leaf_log_bests(tp, lbp, db, db);
+	if (be16_to_cpu(bf[0].length) != oldbest) {
+		bestsp[db] = bf[0].length;
+		xfs_dir3_leaf_log_bests(tp, lbp, db, db);
 	}
-	xfs_dir2_data_check(dp, dbp);
+	xfs_dir3_data_check(dp, dbp);
 	/*
 	 * If the data block is now empty then get rid of the data block.
 	 */
-	if (be16_to_cpu(hdr->bestfree[0].length) ==
-	    mp->m_dirblksize - (uint)sizeof(*hdr)) {
+	if (be16_to_cpu(bf[0].length) ==
+			mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr)) {
 		ASSERT(db != mp->m_dirdatablk);
 		if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
 			/*
@@ -1638,7 +1859,7 @@ xfs_dir2_leaf_removename(
 			 */
 			if (error == ENOSPC && args->total == 0)
 				error = 0;
-			xfs_dir2_leaf_check(dp, lbp);
+			xfs_dir3_leaf_check(mp, lbp);
 			return error;
 		}
 		dbp = NULL;
@@ -1661,8 +1882,8 @@ xfs_dir2_leaf_removename(
 			memmove(&bestsp[db - i], bestsp,
 				(be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
 			be32_add_cpu(&ltp->bestcount, -(db - i));
-			xfs_dir2_leaf_log_tail(tp, lbp);
-			xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+			xfs_dir3_leaf_log_tail(tp, lbp);
+			xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
 		} else
 			bestsp[db] = cpu_to_be16(NULLDATAOFF);
 	}
@@ -1672,7 +1893,7 @@ xfs_dir2_leaf_removename(
 	else if (db != mp->m_dirdatablk)
 		dbp = NULL;
 
-	xfs_dir2_leaf_check(dp, lbp);
+	xfs_dir3_leaf_check(mp, lbp);
 	/*
 	 * See if we can convert to block form.
 	 */
@@ -1695,6 +1916,7 @@ xfs_dir2_leaf_replace(
 	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
 	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_leaf_entry *ents;
 
 	trace_xfs_dir2_leaf_replace(args);
 
@@ -1706,10 +1928,11 @@ xfs_dir2_leaf_replace(
 	}
 	dp = args->dp;
 	leaf = lbp->b_addr;
+	ents = xfs_dir3_leaf_ents_p(leaf);
 	/*
 	 * Point to the leaf entry, get data address from it.
 	 */
-	lep = &leaf->ents[index];
+	lep = &ents[index];
 	/*
 	 * Point to the data entry.
 	 */
@@ -1723,7 +1946,7 @@ xfs_dir2_leaf_replace(
 	dep->inumber = cpu_to_be64(args->inumber);
 	tp = args->trans;
 	xfs_dir2_data_log_entry(tp, dbp, dep);
-	xfs_dir2_leaf_check(dp, lbp);
+	xfs_dir3_leaf_check(dp->i_mount, lbp);
 	xfs_trans_brelse(tp, lbp);
 	return 0;
 }
@@ -1745,17 +1968,22 @@ xfs_dir2_leaf_search_hash(
 	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
 	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
 	int			mid=0;		/* current leaf index */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
 
 	leaf = lbp->b_addr;
+	ents = xfs_dir3_leaf_ents_p(leaf);
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+
 #ifndef __KERNEL__
-	if (!leaf->hdr.count)
+	if (!leafhdr.count)
 		return 0;
 #endif
 	/*
 	 * Note, the table cannot be empty, so we have to go through the loop.
 	 * Binary search the leaf entries looking for our hash value.
 	 */
-	for (lep = leaf->ents, low = 0, high = be16_to_cpu(leaf->hdr.count) - 1,
+	for (lep = ents, low = 0, high = leafhdr.count - 1,
 		hashwant = args->hashval;
 	     low <= high; ) {
 		mid = (low + high) >> 1;
@@ -1807,7 +2035,7 @@ xfs_dir2_leaf_trim_data(
 	/*
 	 * Read the offending data block.  We need its buffer.
 	 */
-	error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
+	error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
 	if (error)
 		return error;
 
@@ -1817,10 +2045,12 @@ xfs_dir2_leaf_trim_data(
 #ifdef DEBUG
 {
 	struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
+	struct xfs_dir2_data_free *bf = xfs_dir3_data_bestfree_p(hdr);
 
-	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
-	ASSERT(be16_to_cpu(hdr->bestfree[0].length) ==
-	       mp->m_dirblksize - (uint)sizeof(*hdr));
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+	ASSERT(be16_to_cpu(bf[0].length) ==
+	       mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr));
 	ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
 }
 #endif
@@ -1839,23 +2069,29 @@ xfs_dir2_leaf_trim_data(
 	bestsp = xfs_dir2_leaf_bests_p(ltp);
 	be32_add_cpu(&ltp->bestcount, -1);
 	memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
-	xfs_dir2_leaf_log_tail(tp, lbp);
-	xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+	xfs_dir3_leaf_log_tail(tp, lbp);
+	xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
 	return 0;
 }
 
 static inline size_t
-xfs_dir2_leaf_size(
-	struct xfs_dir2_leaf_hdr	*hdr,
+xfs_dir3_leaf_size(
+	struct xfs_dir3_icleaf_hdr	*hdr,
 	int				counts)
 {
-	int			entries;
+	int	entries;
+	int	hdrsize;
+
+	entries = hdr->count - hdr->stale;
+	if (hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
+	    hdr->magic == XFS_DIR2_LEAFN_MAGIC)
+		hdrsize = sizeof(struct xfs_dir2_leaf_hdr);
+	else
+		hdrsize = sizeof(struct xfs_dir3_leaf_hdr);
 
-	entries = be16_to_cpu(hdr->count) - be16_to_cpu(hdr->stale);
-	return sizeof(xfs_dir2_leaf_hdr_t) +
-	    entries * sizeof(xfs_dir2_leaf_entry_t) +
-	    counts * sizeof(xfs_dir2_data_off_t) +
-	    sizeof(xfs_dir2_leaf_tail_t);
+	return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t)
+	               + counts * sizeof(xfs_dir2_data_off_t)
+		       + sizeof(xfs_dir2_leaf_tail_t);
 }
 
 /*
@@ -1879,6 +2115,8 @@ xfs_dir2_node_to_leaf(
 	xfs_mount_t		*mp;		/* filesystem mount point */
 	int			rval;		/* successful free trim? */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_dir3_icfree_hdr freehdr;
 
 	/*
 	 * There's more than a leaf level in the btree, so there must
@@ -1928,7 +2166,11 @@ xfs_dir2_node_to_leaf(
 		return 0;
 	lbp = state->path.blk[0].bp;
 	leaf = lbp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+
+	ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+	       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
+
 	/*
 	 * Read the freespace block.
 	 */
@@ -1936,44 +2178,49 @@ xfs_dir2_node_to_leaf(
 	if (error)
 		return error;
 	free = fbp->b_addr;
-	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
-	ASSERT(!free->hdr.firstdb);
+	xfs_dir3_free_hdr_from_disk(&freehdr, free);
+
+	ASSERT(!freehdr.firstdb);
 
 	/*
 	 * Now see if the leafn and free data will fit in a leaf1.
 	 * If not, release the buffer and give up.
 	 */
-	if (xfs_dir2_leaf_size(&leaf->hdr, be32_to_cpu(free->hdr.nvalid)) >
-			mp->m_dirblksize) {
+	if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > mp->m_dirblksize) {
 		xfs_trans_brelse(tp, fbp);
 		return 0;
 	}
 
 	/*
 	 * If the leaf has any stale entries in it, compress them out.
-	 * The compact routine will log the header.
 	 */
-	if (be16_to_cpu(leaf->hdr.stale))
-		xfs_dir2_leaf_compact(args, lbp);
-	else
-		xfs_dir2_leaf_log_header(tp, lbp);
+	if (leafhdr.stale)
+		xfs_dir3_leaf_compact(args, &leafhdr, lbp);
 
-	lbp->b_ops = &xfs_dir2_leaf1_buf_ops;
-	leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
+	lbp->b_ops = &xfs_dir3_leaf1_buf_ops;
+	xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF);
+	leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC)
+					? XFS_DIR2_LEAF1_MAGIC
+					: XFS_DIR3_LEAF1_MAGIC;
 
 	/*
 	 * Set up the leaf tail from the freespace block.
 	 */
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
-	ltp->bestcount = free->hdr.nvalid;
+	ltp->bestcount = cpu_to_be32(freehdr.nvalid);
+
 	/*
 	 * Set up the leaf bests table.
 	 */
-	memcpy(xfs_dir2_leaf_bests_p(ltp), free->bests,
-		be32_to_cpu(ltp->bestcount) * sizeof(xfs_dir2_data_off_t));
-	xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
-	xfs_dir2_leaf_log_tail(tp, lbp);
-	xfs_dir2_leaf_check(dp, lbp);
+	memcpy(xfs_dir2_leaf_bests_p(ltp), xfs_dir3_free_bests_p(mp, free),
+		freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
+
+	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(tp, lbp);
+	xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+	xfs_dir3_leaf_log_tail(tp, lbp);
+	xfs_dir3_leaf_check(mp, lbp);
+
 	/*
 	 * Get rid of the freespace block.
 	 */
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 5980f9b7fa9..2226a00acd1 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -32,20 +33,14 @@
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
+#include "xfs_buf_item.h"
+#include "xfs_cksum.h"
 
 /*
  * Function declarations.
  */
 static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args,
 			      int index);
-#ifdef DEBUG
-static void xfs_dir2_leafn_check(struct xfs_inode *dp, struct xfs_buf *bp);
-#else
-#define	xfs_dir2_leafn_check(dp, bp)
-#endif
-static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, struct xfs_buf *bp_s,
-				    int start_s, struct xfs_buf *bp_d,
-				    int start_d, int count);
 static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
 				     xfs_da_state_blk_t *blk1,
 				     xfs_da_state_blk_t *blk2);
@@ -55,52 +50,126 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
 static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
 				     xfs_da_state_blk_t *fblk);
 
-static void
-xfs_dir2_free_verify(
+/*
+ * Check internal consistency of a leafn block.
+ */
+#ifdef DEBUG
+#define	xfs_dir3_leaf_check(mp, bp) \
+do { \
+	if (!xfs_dir3_leafn_check((mp), (bp))) \
+		ASSERT(0); \
+} while (0);
+
+static bool
+xfs_dir3_leafn_check(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+
+	if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
+		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
+			return false;
+	} else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
+		return false;
+
+	return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
+}
+#else
+#define	xfs_dir3_leaf_check(mp, bp)
+#endif
+
+static bool
+xfs_dir3_free_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 	struct xfs_dir2_free_hdr *hdr = bp->b_addr;
-	int			block_ok = 0;
 
-	block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC);
-	if (!block_ok) {
-		XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic",
-				     XFS_ERRLEVEL_LOW, mp, hdr);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+		if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
+			return false;
+		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
+			return false;
 	}
+
+	/* XXX: should bounds check the xfs_dir3_icfree_hdr here */
+
+	return true;
 }
 
 static void
-xfs_dir2_free_read_verify(
+xfs_dir3_free_read_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_dir2_free_verify(bp);
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if ((xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+					  XFS_DIR3_FREE_CRC_OFF)) ||
+	    !xfs_dir3_free_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
 }
 
 static void
-xfs_dir2_free_write_verify(
+xfs_dir3_free_write_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_dir2_free_verify(bp);
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (!xfs_dir3_free_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_FREE_CRC_OFF);
 }
 
-static const struct xfs_buf_ops xfs_dir2_free_buf_ops = {
-	.verify_read = xfs_dir2_free_read_verify,
-	.verify_write = xfs_dir2_free_write_verify,
+const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+	.verify_read = xfs_dir3_free_read_verify,
+	.verify_write = xfs_dir3_free_write_verify,
 };
 
 
 static int
-__xfs_dir2_free_read(
+__xfs_dir3_free_read(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*dp,
 	xfs_dablk_t		fbno,
 	xfs_daddr_t		mappedbno,
 	struct xfs_buf		**bpp)
 {
-	return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-				XFS_DATA_FORK, &xfs_dir2_free_buf_ops);
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+				XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
+
+	/* try read returns without an error or *bpp if it lands in a hole */
+	if (!err && tp && *bpp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
+	return err;
 }
 
 int
@@ -110,7 +179,7 @@ xfs_dir2_free_read(
 	xfs_dablk_t		fbno,
 	struct xfs_buf		**bpp)
 {
-	return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp);
+	return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp);
 }
 
 static int
@@ -120,7 +189,96 @@ xfs_dir2_free_try_read(
 	xfs_dablk_t		fbno,
 	struct xfs_buf		**bpp)
 {
-	return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp);
+	return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
+}
+
+
+void
+xfs_dir3_free_hdr_from_disk(
+	struct xfs_dir3_icfree_hdr	*to,
+	struct xfs_dir2_free		*from)
+{
+	if (from->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)) {
+		to->magic = be32_to_cpu(from->hdr.magic);
+		to->firstdb = be32_to_cpu(from->hdr.firstdb);
+		to->nvalid = be32_to_cpu(from->hdr.nvalid);
+		to->nused = be32_to_cpu(from->hdr.nused);
+	} else {
+		struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
+
+		to->magic = be32_to_cpu(hdr3->hdr.magic);
+		to->firstdb = be32_to_cpu(hdr3->firstdb);
+		to->nvalid = be32_to_cpu(hdr3->nvalid);
+		to->nused = be32_to_cpu(hdr3->nused);
+	}
+
+	ASSERT(to->magic == XFS_DIR2_FREE_MAGIC ||
+	       to->magic == XFS_DIR3_FREE_MAGIC);
+}
+
+static void
+xfs_dir3_free_hdr_to_disk(
+	struct xfs_dir2_free		*to,
+	struct xfs_dir3_icfree_hdr	*from)
+{
+	ASSERT(from->magic == XFS_DIR2_FREE_MAGIC ||
+	       from->magic == XFS_DIR3_FREE_MAGIC);
+
+	if (from->magic == XFS_DIR2_FREE_MAGIC) {
+		to->hdr.magic = cpu_to_be32(from->magic);
+		to->hdr.firstdb = cpu_to_be32(from->firstdb);
+		to->hdr.nvalid = cpu_to_be32(from->nvalid);
+		to->hdr.nused = cpu_to_be32(from->nused);
+	} else {
+		struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
+
+		hdr3->hdr.magic = cpu_to_be32(from->magic);
+		hdr3->firstdb = cpu_to_be32(from->firstdb);
+		hdr3->nvalid = cpu_to_be32(from->nvalid);
+		hdr3->nused = cpu_to_be32(from->nused);
+	}
+}
+
+static int
+xfs_dir3_free_get_buf(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dir2_db_t		fbno,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*bp;
+	int			error;
+	struct xfs_dir3_icfree_hdr hdr;
+
+	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno),
+				   -1, &bp, XFS_DATA_FORK);
+	if (error)
+		return error;
+
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF);
+	bp->b_ops = &xfs_dir3_free_buf_ops;
+
+	/*
+	 * Initialize the new block to be empty, and remember
+	 * its first slot as our empty slot.
+	 */
+	memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr));
+	memset(&hdr, 0, sizeof(hdr));
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+		hdr.magic = XFS_DIR3_FREE_MAGIC;
+
+		hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
+		hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
+	} else
+		hdr.magic = XFS_DIR2_FREE_MAGIC;
+	xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr);
+	*bpp = bp;
+	return 0;
 }
 
 /*
@@ -134,13 +292,16 @@ xfs_dir2_free_log_bests(
 	int			last)		/* last entry to log */
 {
 	xfs_dir2_free_t		*free;		/* freespace structure */
+	__be16			*bests;
 
 	free = bp->b_addr;
-	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
+	bests = xfs_dir3_free_bests_p(tp->t_mountp, free);
+	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+	       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
 	xfs_trans_log_buf(tp, bp,
-		(uint)((char *)&free->bests[first] - (char *)free),
-		(uint)((char *)&free->bests[last] - (char *)free +
-		       sizeof(free->bests[0]) - 1));
+		(uint)((char *)&bests[first] - (char *)free),
+		(uint)((char *)&bests[last] - (char *)free +
+		       sizeof(bests[0]) - 1));
 }
 
 /*
@@ -154,9 +315,9 @@ xfs_dir2_free_log_header(
 	xfs_dir2_free_t		*free;		/* freespace structure */
 
 	free = bp->b_addr;
-	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
-	xfs_trans_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free),
-		(uint)(sizeof(xfs_dir2_free_hdr_t) - 1));
+	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+	       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
+	xfs_trans_log_buf(tp, bp, 0, xfs_dir3_free_hdr_size(tp->t_mountp) - 1);
 }
 
 /*
@@ -183,6 +344,7 @@ xfs_dir2_leaf_to_node(
 	xfs_dir2_data_off_t	off;		/* freespace entry value */
 	__be16			*to;		/* pointer to freespace entry */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir3_icfree_hdr freehdr;
 
 	trace_xfs_dir2_leaf_to_node(args);
 
@@ -199,44 +361,53 @@ xfs_dir2_leaf_to_node(
 	/*
 	 * Get the buffer for the new freespace block.
 	 */
-	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
-				XFS_DATA_FORK);
+	error = xfs_dir3_free_get_buf(tp, dp, fdb, &fbp);
 	if (error)
 		return error;
-	fbp->b_ops = &xfs_dir2_free_buf_ops;
 
 	free = fbp->b_addr;
+	xfs_dir3_free_hdr_from_disk(&freehdr, free);
 	leaf = lbp->b_addr;
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
-	/*
-	 * Initialize the freespace block header.
-	 */
-	free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC);
-	free->hdr.firstdb = 0;
-	ASSERT(be32_to_cpu(ltp->bestcount) <= (uint)dp->i_d.di_size / mp->m_dirblksize);
-	free->hdr.nvalid = ltp->bestcount;
+	ASSERT(be32_to_cpu(ltp->bestcount) <=
+				(uint)dp->i_d.di_size / mp->m_dirblksize);
+
 	/*
 	 * Copy freespace entries from the leaf block to the new block.
 	 * Count active entries.
 	 */
-	for (i = n = 0, from = xfs_dir2_leaf_bests_p(ltp), to = free->bests;
-	     i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
+	from = xfs_dir2_leaf_bests_p(ltp);
+	to = xfs_dir3_free_bests_p(mp, free);
+	for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
 		if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
 			n++;
 		*to = cpu_to_be16(off);
 	}
-	free->hdr.nused = cpu_to_be32(n);
-
-	lbp->b_ops = &xfs_dir2_leafn_buf_ops;
-	leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
 
 	/*
-	 * Log everything.
+	 * Now initialize the freespace block header.
 	 */
-	xfs_dir2_leaf_log_header(tp, lbp);
+	freehdr.nused = n;
+	freehdr.nvalid = be32_to_cpu(ltp->bestcount);
+
+	xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr);
+	xfs_dir2_free_log_bests(tp, fbp, 0, freehdr.nvalid - 1);
 	xfs_dir2_free_log_header(tp, fbp);
-	xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1);
-	xfs_dir2_leafn_check(dp, lbp);
+
+	/*
+	 * Converting the leaf to a leafnode is just a matter of changing the
+	 * magic number and the ops. Do the change directly to the buffer as
+	 * it's less work (and less code) than decoding the header to host
+	 * format and back again.
+	 */
+	if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC))
+		leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+	else
+		leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+	lbp->b_ops = &xfs_dir3_leafn_buf_ops;
+	xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
+	xfs_dir3_leaf_log_header(tp, lbp);
+	xfs_dir3_leaf_check(mp, lbp);
 	return 0;
 }
 
@@ -260,6 +431,8 @@ xfs_dir2_leafn_add(
 	int			lowstale;	/* previous stale entry */
 	xfs_mount_t		*mp;		/* filesystem mount point */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_dir2_leaf_entry *ents;
 
 	trace_xfs_dir2_leafn_add(args, index);
 
@@ -267,6 +440,8 @@ xfs_dir2_leafn_add(
 	mp = dp->i_mount;
 	tp = args->trans;
 	leaf = bp->b_addr;
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = xfs_dir3_leaf_ents_p(leaf);
 
 	/*
 	 * Quick check just to make sure we are not going to index
@@ -282,15 +457,15 @@ xfs_dir2_leafn_add(
 	 * a compact.
 	 */
 
-	if (be16_to_cpu(leaf->hdr.count) == xfs_dir2_max_leaf_ents(mp)) {
-		if (!leaf->hdr.stale)
+	if (leafhdr.count == xfs_dir3_max_leaf_ents(mp, leaf)) {
+		if (!leafhdr.stale)
 			return XFS_ERROR(ENOSPC);
-		compact = be16_to_cpu(leaf->hdr.stale) > 1;
+		compact = leafhdr.stale > 1;
 	} else
 		compact = 0;
-	ASSERT(index == 0 || be32_to_cpu(leaf->ents[index - 1].hashval) <= args->hashval);
-	ASSERT(index == be16_to_cpu(leaf->hdr.count) ||
-	       be32_to_cpu(leaf->ents[index].hashval) >= args->hashval);
+	ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval);
+	ASSERT(index == leafhdr.count ||
+	       be32_to_cpu(ents[index].hashval) >= args->hashval);
 
 	if (args->op_flags & XFS_DA_OP_JUSTCHECK)
 		return 0;
@@ -299,61 +474,51 @@ xfs_dir2_leafn_add(
 	 * Compact out all but one stale leaf entry.  Leaves behind
 	 * the entry closest to index.
 	 */
-	if (compact) {
-		xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale,
-			&lfloglow, &lfloghigh);
-	}
-	/*
-	 * Set impossible logging indices for this case.
-	 */
-	else if (leaf->hdr.stale) {
-		lfloglow = be16_to_cpu(leaf->hdr.count);
+	if (compact)
+		xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale,
+					 &highstale, &lfloglow, &lfloghigh);
+	else if (leafhdr.stale) {
+		/*
+		 * Set impossible logging indices for this case.
+		 */
+		lfloglow = leafhdr.count;
 		lfloghigh = -1;
 	}
 
 	/*
 	 * Insert the new entry, log everything.
 	 */
-	lep = xfs_dir2_leaf_find_entry(leaf, index, compact, lowstale,
+	lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
 				       highstale, &lfloglow, &lfloghigh);
 
 	lep->hashval = cpu_to_be32(args->hashval);
 	lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp,
 				args->blkno, args->index));
-	xfs_dir2_leaf_log_header(tp, bp);
-	xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh);
-	xfs_dir2_leafn_check(dp, bp);
+
+	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(tp, bp);
+	xfs_dir3_leaf_log_ents(tp, bp, lfloglow, lfloghigh);
+	xfs_dir3_leaf_check(mp, bp);
 	return 0;
 }
 
 #ifdef DEBUG
-/*
- * Check internal consistency of a leafn block.
- */
-void
-xfs_dir2_leafn_check(
-	struct xfs_inode *dp,
-	struct xfs_buf	*bp)
+static void
+xfs_dir2_free_hdr_check(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp,
+	xfs_dir2_db_t	db)
 {
-	int		i;			/* leaf index */
-	xfs_dir2_leaf_t	*leaf;			/* leaf structure */
-	xfs_mount_t	*mp;			/* filesystem mount point */
-	int		stale;			/* count of stale leaves */
+	struct xfs_dir3_icfree_hdr hdr;
 
-	leaf = bp->b_addr;
-	mp = dp->i_mount;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
-	ASSERT(be16_to_cpu(leaf->hdr.count) <= xfs_dir2_max_leaf_ents(mp));
-	for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) {
-		if (i + 1 < be16_to_cpu(leaf->hdr.count)) {
-			ASSERT(be32_to_cpu(leaf->ents[i].hashval) <=
-			       be32_to_cpu(leaf->ents[i + 1].hashval));
-		}
-		if (leaf->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
-			stale++;
-	}
-	ASSERT(be16_to_cpu(leaf->hdr.stale) == stale);
+	xfs_dir3_free_hdr_from_disk(&hdr, bp->b_addr);
+
+	ASSERT((hdr.firstdb % xfs_dir3_free_max_bests(mp)) == 0);
+	ASSERT(hdr.firstdb <= db);
+	ASSERT(db < hdr.firstdb + hdr.nvalid);
 }
+#else
+#define xfs_dir2_free_hdr_check(mp, dp, db)
 #endif	/* DEBUG */
 
 /*
@@ -365,15 +530,22 @@ xfs_dir2_leafn_lasthash(
 	struct xfs_buf	*bp,			/* leaf buffer */
 	int		*count)			/* count of entries in leaf */
 {
-	xfs_dir2_leaf_t	*leaf;			/* leaf structure */
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
+
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+
+	ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
+	       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
 
-	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
 	if (count)
-		*count = be16_to_cpu(leaf->hdr.count);
-	if (!leaf->hdr.count)
+		*count = leafhdr.count;
+	if (!leafhdr.count)
 		return 0;
-	return be32_to_cpu(leaf->ents[be16_to_cpu(leaf->hdr.count) - 1].hashval);
+
+	ents = xfs_dir3_leaf_ents_p(leaf);
+	return be32_to_cpu(ents[leafhdr.count - 1].hashval);
 }
 
 /*
@@ -402,16 +574,19 @@ xfs_dir2_leafn_lookup_for_addname(
 	xfs_dir2_db_t		newdb;		/* new data block number */
 	xfs_dir2_db_t		newfdb;		/* new free block number */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
 
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
 	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
-#ifdef __KERNEL__
-	ASSERT(be16_to_cpu(leaf->hdr.count) > 0);
-#endif
-	xfs_dir2_leafn_check(dp, bp);
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = xfs_dir3_leaf_ents_p(leaf);
+
+	xfs_dir3_leaf_check(mp, bp);
+	ASSERT(leafhdr.count > 0);
+
 	/*
 	 * Look up the hash value in the leaf entries.
 	 */
@@ -424,15 +599,16 @@ xfs_dir2_leafn_lookup_for_addname(
 		curbp = state->extrablk.bp;
 		curfdb = state->extrablk.blkno;
 		free = curbp->b_addr;
-		ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
+		ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
+		       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
 	}
 	length = xfs_dir2_data_entsize(args->namelen);
 	/*
 	 * Loop over leaf entries with the right hash value.
 	 */
-	for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
-				be32_to_cpu(lep->hashval) == args->hashval;
-				lep++, index++) {
+	for (lep = &ents[index];
+	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+	     lep++, index++) {
 		/*
 		 * Skip stale leaf entries.
 		 */
@@ -451,6 +627,8 @@ xfs_dir2_leafn_lookup_for_addname(
 		 * in hand, take a look at it.
 		 */
 		if (newdb != curdb) {
+			__be16 *bests;
+
 			curdb = newdb;
 			/*
 			 * Convert the data block to the free block
@@ -473,13 +651,8 @@ xfs_dir2_leafn_lookup_for_addname(
 				if (error)
 					return error;
 				free = curbp->b_addr;
-				ASSERT(be32_to_cpu(free->hdr.magic) ==
-					XFS_DIR2_FREE_MAGIC);
-				ASSERT((be32_to_cpu(free->hdr.firstdb) %
-					xfs_dir2_free_max_bests(mp)) == 0);
-				ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb);
-				ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) +
-					be32_to_cpu(free->hdr.nvalid));
+
+				xfs_dir2_free_hdr_check(mp, curbp, curdb);
 			}
 			/*
 			 * Get the index for our entry.
@@ -488,8 +661,8 @@ xfs_dir2_leafn_lookup_for_addname(
 			/*
 			 * If it has room, return it.
 			 */
-			if (unlikely(free->bests[fi] ==
-			    cpu_to_be16(NULLDATAOFF))) {
+			bests = xfs_dir3_free_bests_p(mp, free);
+			if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
 				XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
 							XFS_ERRLEVEL_LOW, mp);
 				if (curfdb != newfdb)
@@ -497,7 +670,7 @@ xfs_dir2_leafn_lookup_for_addname(
 				return XFS_ERROR(EFSCORRUPTED);
 			}
 			curfdb = newfdb;
-			if (be16_to_cpu(free->bests[fi]) >= length)
+			if (be16_to_cpu(bests[fi]) >= length)
 				goto out;
 		}
 	}
@@ -511,6 +684,12 @@ out:
 		state->extrablk.bp = curbp;
 		state->extrablk.index = fi;
 		state->extrablk.blkno = curfdb;
+
+		/*
+		 * Important: this magic number is not in the buffer - it's for
+		 * buffer type information and therefore only the free/data type
+		 * matters here, not whether CRCs are enabled or not.
+		 */
 		state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
 	} else {
 		state->extravalid = 0;
@@ -545,16 +724,19 @@ xfs_dir2_leafn_lookup_for_entry(
 	xfs_dir2_db_t		newdb;		/* new data block number */
 	xfs_trans_t		*tp;		/* transaction pointer */
 	enum xfs_dacmp		cmp;		/* comparison result */
+	struct xfs_dir2_leaf_entry *ents;
+	struct xfs_dir3_icleaf_hdr leafhdr;
 
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
 	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
-#ifdef __KERNEL__
-	ASSERT(be16_to_cpu(leaf->hdr.count) > 0);
-#endif
-	xfs_dir2_leafn_check(dp, bp);
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = xfs_dir3_leaf_ents_p(leaf);
+
+	xfs_dir3_leaf_check(mp, bp);
+	ASSERT(leafhdr.count > 0);
+
 	/*
 	 * Look up the hash value in the leaf entries.
 	 */
@@ -569,9 +751,9 @@ xfs_dir2_leafn_lookup_for_entry(
 	/*
 	 * Loop over leaf entries with the right hash value.
 	 */
-	for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
-				be32_to_cpu(lep->hashval) == args->hashval;
-				lep++, index++) {
+	for (lep = &ents[index];
+	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+	     lep++, index++) {
 		/*
 		 * Skip stale leaf entries.
 		 */
@@ -604,13 +786,13 @@ xfs_dir2_leafn_lookup_for_entry(
 				ASSERT(state->extravalid);
 				curbp = state->extrablk.bp;
 			} else {
-				error = xfs_dir2_data_read(tp, dp,
+				error = xfs_dir3_data_read(tp, dp,
 						xfs_dir2_db_to_da(mp, newdb),
 						-1, &curbp);
 				if (error)
 					return error;
 			}
-			xfs_dir2_data_check(dp, curbp);
+			xfs_dir3_data_check(dp, curbp);
 			curdb = newdb;
 		}
 		/*
@@ -638,13 +820,13 @@ xfs_dir2_leafn_lookup_for_entry(
 			state->extrablk.index = (int)((char *)dep -
 							(char *)curbp->b_addr);
 			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
-			curbp->b_ops = &xfs_dir2_data_buf_ops;
+			curbp->b_ops = &xfs_dir3_data_buf_ops;
+			xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
 			if (cmp == XFS_CMP_EXACT)
 				return XFS_ERROR(EEXIST);
 		}
 	}
-	ASSERT(index == be16_to_cpu(leaf->hdr.count) ||
-					(args->op_flags & XFS_DA_OP_OKNOENT));
+	ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
 	if (curbp) {
 		if (args->cmpresult == XFS_CMP_DIFFERENT) {
 			/* Giving back last used data block. */
@@ -653,7 +835,8 @@ xfs_dir2_leafn_lookup_for_entry(
 			state->extrablk.index = -1;
 			state->extrablk.blkno = curdb;
 			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
-			curbp->b_ops = &xfs_dir2_data_buf_ops;
+			curbp->b_ops = &xfs_dir3_data_buf_ops;
+			xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
 		} else {
 			/* If the curbp is not the CI match block, drop it */
 			if (state->extrablk.bp != curbp)
@@ -689,52 +872,50 @@ xfs_dir2_leafn_lookup_int(
  * Log entries and headers.  Stale entries are preserved.
  */
 static void
-xfs_dir2_leafn_moveents(
-	xfs_da_args_t	*args,			/* operation arguments */
-	struct xfs_buf	*bp_s,			/* source leaf buffer */
-	int		start_s,		/* source leaf index */
-	struct xfs_buf	*bp_d,			/* destination leaf buffer */
-	int		start_d,		/* destination leaf index */
-	int		count)			/* count of leaves to copy */
+xfs_dir3_leafn_moveents(
+	xfs_da_args_t			*args,	/* operation arguments */
+	struct xfs_buf			*bp_s,	/* source */
+	struct xfs_dir3_icleaf_hdr	*shdr,
+	struct xfs_dir2_leaf_entry	*sents,
+	int				start_s,/* source leaf index */
+	struct xfs_buf			*bp_d,	/* destination */
+	struct xfs_dir3_icleaf_hdr	*dhdr,
+	struct xfs_dir2_leaf_entry	*dents,
+	int				start_d,/* destination leaf index */
+	int				count)	/* count of leaves to copy */
 {
-	xfs_dir2_leaf_t	*leaf_d;		/* destination leaf structure */
-	xfs_dir2_leaf_t	*leaf_s;		/* source leaf structure */
-	int		stale;			/* count stale leaves copied */
-	xfs_trans_t	*tp;			/* transaction pointer */
+	struct xfs_trans		*tp = args->trans;
+	int				stale;	/* count stale leaves copied */
 
 	trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
 
 	/*
 	 * Silently return if nothing to do.
 	 */
-	if (count == 0) {
+	if (count == 0)
 		return;
-	}
-	tp = args->trans;
-	leaf_s = bp_s->b_addr;
-	leaf_d = bp_d->b_addr;
+
 	/*
 	 * If the destination index is not the end of the current
 	 * destination leaf entries, open up a hole in the destination
 	 * to hold the new entries.
 	 */
-	if (start_d < be16_to_cpu(leaf_d->hdr.count)) {
-		memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d],
-			(be16_to_cpu(leaf_d->hdr.count) - start_d) *
-			sizeof(xfs_dir2_leaf_entry_t));
-		xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count,
-			count + be16_to_cpu(leaf_d->hdr.count) - 1);
+	if (start_d < dhdr->count) {
+		memmove(&dents[start_d + count], &dents[start_d],
+			(dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
+		xfs_dir3_leaf_log_ents(tp, bp_d, start_d + count,
+				       count + dhdr->count - 1);
 	}
 	/*
 	 * If the source has stale leaves, count the ones in the copy range
 	 * so we can update the header correctly.
 	 */
-	if (leaf_s->hdr.stale) {
+	if (shdr->stale) {
 		int	i;			/* temp leaf index */
 
 		for (i = start_s, stale = 0; i < start_s + count; i++) {
-			if (leaf_s->ents[i].address ==
-			    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			if (sents[i].address ==
+					cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
 				stale++;
 		}
 	} else
@@ -742,29 +923,27 @@ xfs_dir2_leafn_moveents(
 	/*
 	 * Copy the leaf entries from source to destination.
 	 */
-	memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s],
+	memcpy(&dents[start_d], &sents[start_s],
 		count * sizeof(xfs_dir2_leaf_entry_t));
-	xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1);
+	xfs_dir3_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1);
+
 	/*
 	 * If there are source entries after the ones we copied,
 	 * delete the ones we copied by sliding the next ones down.
 	 */
-	if (start_s + count < be16_to_cpu(leaf_s->hdr.count)) {
-		memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count],
+	if (start_s + count < shdr->count) {
+		memmove(&sents[start_s], &sents[start_s + count],
 			count * sizeof(xfs_dir2_leaf_entry_t));
-		xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1);
+		xfs_dir3_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1);
 	}
+
 	/*
 	 * Update the headers and log them.
 	 */
-	be16_add_cpu(&leaf_s->hdr.count, -(count));
-	be16_add_cpu(&leaf_s->hdr.stale, -(stale));
-	be16_add_cpu(&leaf_d->hdr.count, count);
-	be16_add_cpu(&leaf_d->hdr.stale, stale);
-	xfs_dir2_leaf_log_header(tp, bp_s);
-	xfs_dir2_leaf_log_header(tp, bp_d);
-	xfs_dir2_leafn_check(args->dp, bp_s);
-	xfs_dir2_leafn_check(args->dp, bp_d);
+	shdr->count -= count;
+	shdr->stale -= stale;
+	dhdr->count += count;
+	dhdr->stale += stale;
 }
 
 /*
@@ -773,21 +952,25 @@ xfs_dir2_leafn_moveents(
  */
 int						/* sort order */
 xfs_dir2_leafn_order(
-	struct xfs_buf	*leaf1_bp,		/* leaf1 buffer */
-	struct xfs_buf	*leaf2_bp)		/* leaf2 buffer */
+	struct xfs_buf		*leaf1_bp,		/* leaf1 buffer */
+	struct xfs_buf		*leaf2_bp)		/* leaf2 buffer */
 {
-	xfs_dir2_leaf_t	*leaf1;			/* leaf1 structure */
-	xfs_dir2_leaf_t	*leaf2;			/* leaf2 structure */
-
-	leaf1 = leaf1_bp->b_addr;
-	leaf2 = leaf2_bp->b_addr;
-	ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
-	ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
-	if (be16_to_cpu(leaf1->hdr.count) > 0 &&
-	    be16_to_cpu(leaf2->hdr.count) > 0 &&
-	    (be32_to_cpu(leaf2->ents[0].hashval) < be32_to_cpu(leaf1->ents[0].hashval) ||
-	     be32_to_cpu(leaf2->ents[be16_to_cpu(leaf2->hdr.count) - 1].hashval) <
-	     be32_to_cpu(leaf1->ents[be16_to_cpu(leaf1->hdr.count) - 1].hashval)))
+	struct xfs_dir2_leaf	*leaf1 = leaf1_bp->b_addr;
+	struct xfs_dir2_leaf	*leaf2 = leaf2_bp->b_addr;
+	struct xfs_dir2_leaf_entry *ents1;
+	struct xfs_dir2_leaf_entry *ents2;
+	struct xfs_dir3_icleaf_hdr hdr1;
+	struct xfs_dir3_icleaf_hdr hdr2;
+
+	xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1);
+	xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2);
+	ents1 = xfs_dir3_leaf_ents_p(leaf1);
+	ents2 = xfs_dir3_leaf_ents_p(leaf2);
+
+	if (hdr1.count > 0 && hdr2.count > 0 &&
+	    (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
+	     be32_to_cpu(ents2[hdr2.count - 1].hashval) <
+				be32_to_cpu(ents1[hdr1.count - 1].hashval)))
 		return 1;
 	return 0;
 }
@@ -811,11 +994,15 @@ xfs_dir2_leafn_rebalance(
 	xfs_dir2_leaf_t		*leaf1;		/* first leaf structure */
 	xfs_dir2_leaf_t		*leaf2;		/* second leaf structure */
 	int			mid;		/* midpoint leaf index */
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 	int			oldstale;	/* old count of stale leaves */
 #endif
 	int			oldsum;		/* old total leaf count */
 	int			swap;		/* swapped leaf blocks */
+	struct xfs_dir2_leaf_entry *ents1;
+	struct xfs_dir2_leaf_entry *ents2;
+	struct xfs_dir3_icleaf_hdr hdr1;
+	struct xfs_dir3_icleaf_hdr hdr2;
 
 	args = state->args;
 	/*
@@ -830,11 +1017,17 @@ xfs_dir2_leafn_rebalance(
 	}
 	leaf1 = blk1->bp->b_addr;
 	leaf2 = blk2->bp->b_addr;
-	oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count);
-#ifdef DEBUG
-	oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale);
+	xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1);
+	xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2);
+	ents1 = xfs_dir3_leaf_ents_p(leaf1);
+	ents2 = xfs_dir3_leaf_ents_p(leaf2);
+
+	oldsum = hdr1.count + hdr2.count;
+#if defined(DEBUG) || defined(XFS_WARN)
+	oldstale = hdr1.stale + hdr2.stale;
 #endif
 	mid = oldsum >> 1;
+
 	/*
 	 * If the old leaf count was odd then the new one will be even,
 	 * so we need to divide the new count evenly.
@@ -842,10 +1035,10 @@ xfs_dir2_leafn_rebalance(
 	if (oldsum & 1) {
 		xfs_dahash_t	midhash;	/* middle entry hash value */
 
-		if (mid >= be16_to_cpu(leaf1->hdr.count))
-			midhash = be32_to_cpu(leaf2->ents[mid - be16_to_cpu(leaf1->hdr.count)].hashval);
+		if (mid >= hdr1.count)
+			midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval);
 		else
-			midhash = be32_to_cpu(leaf1->ents[mid].hashval);
+			midhash = be32_to_cpu(ents1[mid].hashval);
 		isleft = args->hashval <= midhash;
 	}
 	/*
@@ -859,30 +1052,42 @@ xfs_dir2_leafn_rebalance(
 	 * Calculate moved entry count.  Positive means left-to-right,
 	 * negative means right-to-left.  Then move the entries.
 	 */
-	count = be16_to_cpu(leaf1->hdr.count) - mid + (isleft == 0);
+	count = hdr1.count - mid + (isleft == 0);
 	if (count > 0)
-		xfs_dir2_leafn_moveents(args, blk1->bp,
-			be16_to_cpu(leaf1->hdr.count) - count, blk2->bp, 0, count);
+		xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1,
+					hdr1.count - count, blk2->bp,
+					&hdr2, ents2, 0, count);
 	else if (count < 0)
-		xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp,
-			be16_to_cpu(leaf1->hdr.count), count);
-	ASSERT(be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count) == oldsum);
-	ASSERT(be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale) == oldstale);
+		xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0,
+					blk1->bp, &hdr1, ents1,
+					hdr1.count, count);
+
+	ASSERT(hdr1.count + hdr2.count == oldsum);
+	ASSERT(hdr1.stale + hdr2.stale == oldstale);
+
+	/* log the changes made when moving the entries */
+	xfs_dir3_leaf_hdr_to_disk(leaf1, &hdr1);
+	xfs_dir3_leaf_hdr_to_disk(leaf2, &hdr2);
+	xfs_dir3_leaf_log_header(args->trans, blk1->bp);
+	xfs_dir3_leaf_log_header(args->trans, blk2->bp);
+
+	xfs_dir3_leaf_check(args->dp->i_mount, blk1->bp);
+	xfs_dir3_leaf_check(args->dp->i_mount, blk2->bp);
+
 	/*
 	 * Mark whether we're inserting into the old or new leaf.
 	 */
-	if (be16_to_cpu(leaf1->hdr.count) < be16_to_cpu(leaf2->hdr.count))
+	if (hdr1.count < hdr2.count)
 		state->inleaf = swap;
-	else if (be16_to_cpu(leaf1->hdr.count) > be16_to_cpu(leaf2->hdr.count))
+	else if (hdr1.count > hdr2.count)
 		state->inleaf = !swap;
 	else
-		state->inleaf =
-			swap ^ (blk1->index <= be16_to_cpu(leaf1->hdr.count));
+		state->inleaf = swap ^ (blk1->index <= hdr1.count);
 	/*
 	 * Adjust the expected index for insertion.
 	 */
 	if (!state->inleaf)
-		blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count);
+		blk2->index = blk1->index - hdr1.count;
 
 	/*
 	 * Finally sanity check just to make sure we are not returning a
@@ -898,7 +1103,7 @@ xfs_dir2_leafn_rebalance(
 }
 
 static int
-xfs_dir2_data_block_free(
+xfs_dir3_data_block_free(
 	xfs_da_args_t		*args,
 	struct xfs_dir2_data_hdr *hdr,
 	struct xfs_dir2_free	*free,
@@ -909,57 +1114,66 @@ xfs_dir2_data_block_free(
 {
 	struct xfs_trans	*tp = args->trans;
 	int			logfree = 0;
+	__be16			*bests;
+	struct xfs_dir3_icfree_hdr freehdr;
 
-	if (!hdr) {
-		/* One less used entry in the free table.  */
-		be32_add_cpu(&free->hdr.nused, -1);
-		xfs_dir2_free_log_header(tp, fbp);
+	xfs_dir3_free_hdr_from_disk(&freehdr, free);
 
+	bests = xfs_dir3_free_bests_p(tp->t_mountp, free);
+	if (hdr) {
 		/*
-		 * If this was the last entry in the table, we can trim the
-		 * table size back.  There might be other entries at the end
-		 * referring to non-existent data blocks, get those too.
+		 * Data block is not empty, just set the free entry to the new
+		 * value.
 		 */
-		if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
-			int	i;		/* free entry index */
+		bests[findex] = cpu_to_be16(longest);
+		xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+		return 0;
+	}
 
-			for (i = findex - 1; i >= 0; i--) {
-				if (free->bests[i] != cpu_to_be16(NULLDATAOFF))
-					break;
-			}
-			free->hdr.nvalid = cpu_to_be32(i + 1);
-			logfree = 0;
-		} else {
-			/* Not the last entry, just punch it out.  */
-			free->bests[findex] = cpu_to_be16(NULLDATAOFF);
-			logfree = 1;
-		}
-		/*
-		 * If there are no useful entries left in the block,
-		 * get rid of the block if we can.
-		 */
-		if (!free->hdr.nused) {
-			int error;
+	/* One less used entry in the free table. */
+	freehdr.nused--;
 
-			error = xfs_dir2_shrink_inode(args, fdb, fbp);
-			if (error == 0) {
-				fbp = NULL;
-				logfree = 0;
-			} else if (error != ENOSPC || args->total != 0)
-				return error;
-			/*
-			 * It's possible to get ENOSPC if there is no
-			 * space reservation.  In this case some one
-			 * else will eventually get rid of this block.
-			 */
+	/*
+	 * If this was the last entry in the table, we can trim the table size
+	 * back.  There might be other entries at the end referring to
+	 * non-existent data blocks, get those too.
+	 */
+	if (findex == freehdr.nvalid - 1) {
+		int	i;		/* free entry index */
+
+		for (i = findex - 1; i >= 0; i--) {
+			if (bests[i] != cpu_to_be16(NULLDATAOFF))
+				break;
 		}
+		freehdr.nvalid = i + 1;
+		logfree = 0;
 	} else {
+		/* Not the last entry, just punch it out.  */
+		bests[findex] = cpu_to_be16(NULLDATAOFF);
+		logfree = 1;
+	}
+
+	xfs_dir3_free_hdr_to_disk(free, &freehdr);
+	xfs_dir2_free_log_header(tp, fbp);
+
+	/*
+	 * If there are no useful entries left in the block, get rid of the
+	 * block if we can.
+	 */
+	if (!freehdr.nused) {
+		int error;
+
+		error = xfs_dir2_shrink_inode(args, fdb, fbp);
+		if (error == 0) {
+			fbp = NULL;
+			logfree = 0;
+		} else if (error != ENOSPC || args->total != 0)
+			return error;
 		/*
-		 * Data block is not empty, just set the free entry to the new
-		 * value.
+		 * It's possible to get ENOSPC if there is no
+		 * space reservation.  In this case some one
+		 * else will eventually get rid of this block.
 		 */
-		free->bests[findex] = cpu_to_be16(longest);
-		logfree = 1;
 	}
 
 	/* Log the free entry that changed, unless we got rid of it.  */
@@ -994,6 +1208,9 @@ xfs_dir2_leafn_remove(
 	int			needlog;	/* need to log data header */
 	int			needscan;	/* need to rescan data frees */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_data_free *bf;		/* bestfree table */
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_dir2_leaf_entry *ents;
 
 	trace_xfs_dir2_leafn_remove(args, index);
 
@@ -1001,11 +1218,14 @@ xfs_dir2_leafn_remove(
 	tp = args->trans;
 	mp = dp->i_mount;
 	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = xfs_dir3_leaf_ents_p(leaf);
+
 	/*
 	 * Point to the entry we're removing.
 	 */
-	lep = &leaf->ents[index];
+	lep = &ents[index];
+
 	/*
 	 * Extract the data block and offset from the entry.
 	 */
@@ -1013,14 +1233,18 @@ xfs_dir2_leafn_remove(
 	ASSERT(dblk->blkno == db);
 	off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address));
 	ASSERT(dblk->index == off);
+
 	/*
 	 * Kill the leaf entry by marking it stale.
 	 * Log the leaf block changes.
 	 */
-	be16_add_cpu(&leaf->hdr.stale, 1);
-	xfs_dir2_leaf_log_header(tp, bp);
+	leafhdr.stale++;
+	xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(tp, bp);
+
 	lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
-	xfs_dir2_leaf_log_ents(tp, bp, index, index);
+	xfs_dir3_leaf_log_ents(tp, bp, index, index);
+
 	/*
 	 * Make the data entry free.  Keep track of the longest freespace
 	 * in the data block in case it changes.
@@ -1028,7 +1252,8 @@ xfs_dir2_leafn_remove(
 	dbp = dblk->bp;
 	hdr = dbp->b_addr;
 	dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
-	longest = be16_to_cpu(hdr->bestfree[0].length);
+	bf = xfs_dir3_data_bestfree_p(hdr);
+	longest = be16_to_cpu(bf[0].length);
 	needlog = needscan = 0;
 	xfs_dir2_data_make_free(tp, dbp, off,
 		xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan);
@@ -1040,12 +1265,12 @@ xfs_dir2_leafn_remove(
 		xfs_dir2_data_freescan(mp, hdr, &needlog);
 	if (needlog)
 		xfs_dir2_data_log_header(tp, dbp);
-	xfs_dir2_data_check(dp, dbp);
+	xfs_dir3_data_check(dp, dbp);
 	/*
 	 * If the longest data block freespace changes, need to update
 	 * the corresponding freeblock entry.
 	 */
-	if (longest < be16_to_cpu(hdr->bestfree[0].length)) {
+	if (longest < be16_to_cpu(bf[0].length)) {
 		int		error;		/* error return value */
 		struct xfs_buf	*fbp;		/* freeblock buffer */
 		xfs_dir2_db_t	fdb;		/* freeblock block number */
@@ -1062,20 +1287,25 @@ xfs_dir2_leafn_remove(
 		if (error)
 			return error;
 		free = fbp->b_addr;
-		ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
-		ASSERT(be32_to_cpu(free->hdr.firstdb) ==
-		       xfs_dir2_free_max_bests(mp) *
-		       (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
+#ifdef DEBUG
+	{
+		struct xfs_dir3_icfree_hdr freehdr;
+		xfs_dir3_free_hdr_from_disk(&freehdr, free);
+		ASSERT(freehdr.firstdb == xfs_dir3_free_max_bests(mp) *
+					  (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
+	}
+#endif
 		/*
 		 * Calculate which entry we need to fix.
 		 */
 		findex = xfs_dir2_db_to_fdindex(mp, db);
-		longest = be16_to_cpu(hdr->bestfree[0].length);
+		longest = be16_to_cpu(bf[0].length);
 		/*
 		 * If the data block is now empty we can get rid of it
 		 * (usually).
 		 */
-		if (longest == mp->m_dirblksize - (uint)sizeof(*hdr)) {
+		if (longest == mp->m_dirblksize -
+			       xfs_dir3_data_entry_offset(hdr)) {
 			/*
 			 * Try to punch out the data block.
 			 */
@@ -1096,21 +1326,19 @@ xfs_dir2_leafn_remove(
 		 * If we got rid of the data block, we can eliminate that entry
 		 * in the free block.
 		 */
-		error = xfs_dir2_data_block_free(args, hdr, free,
+		error = xfs_dir3_data_block_free(args, hdr, free,
 						 fdb, findex, fbp, longest);
 		if (error)
 			return error;
 	}
 
-	xfs_dir2_leafn_check(dp, bp);
+	xfs_dir3_leaf_check(mp, bp);
 	/*
 	 * Return indication of whether this leaf block is empty enough
 	 * to justify trying to join it with a neighbor.
 	 */
-	*rval =
-		((uint)sizeof(leaf->hdr) +
-		 (uint)sizeof(leaf->ents[0]) *
-		 (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale))) <
+	*rval = (xfs_dir3_leaf_hdr_size(leaf) +
+		 (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
 		mp->m_dir_magicpct;
 	return 0;
 }
@@ -1143,11 +1371,11 @@ xfs_dir2_leafn_split(
 	/*
 	 * Initialize the new leaf block.
 	 */
-	error = xfs_dir2_leaf_init(args, xfs_dir2_da_to_db(mp, blkno),
-		&newblk->bp, XFS_DIR2_LEAFN_MAGIC);
-	if (error) {
+	error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(mp, blkno),
+				      &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
+	if (error)
 		return error;
-	}
+
 	newblk->blkno = blkno;
 	newblk->magic = XFS_DIR2_LEAFN_MAGIC;
 	/*
@@ -1155,7 +1383,7 @@ xfs_dir2_leafn_split(
 	 * block into the leaves.
 	 */
 	xfs_dir2_leafn_rebalance(state, oldblk, newblk);
-	error = xfs_da_blk_link(state, oldblk, newblk);
+	error = xfs_da3_blk_link(state, oldblk, newblk);
 	if (error) {
 		return error;
 	}
@@ -1171,8 +1399,8 @@ xfs_dir2_leafn_split(
 	 */
 	oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL);
 	newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL);
-	xfs_dir2_leafn_check(args->dp, oldblk->bp);
-	xfs_dir2_leafn_check(args->dp, newblk->bp);
+	xfs_dir3_leaf_check(mp, oldblk->bp);
+	xfs_dir3_leaf_check(mp, newblk->bp);
 	return error;
 }
 
@@ -1198,9 +1426,10 @@ xfs_dir2_leafn_toosmall(
 	int			error;		/* error return value */
 	int			forward;	/* sibling block direction */
 	int			i;		/* sibling counter */
-	xfs_da_blkinfo_t	*info;		/* leaf block header */
 	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
 	int			rval;		/* result from path_shift */
+	struct xfs_dir3_icleaf_hdr leafhdr;
+	struct xfs_dir2_leaf_entry *ents;
 
 	/*
 	 * Check for the degenerate case of the block being over 50% full.
@@ -1208,11 +1437,13 @@ xfs_dir2_leafn_toosmall(
 	 * to coalesce with a sibling.
 	 */
 	blk = &state->path.blk[state->path.active - 1];
-	info = blk->bp->b_addr;
-	ASSERT(info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
-	leaf = (xfs_dir2_leaf_t *)info;
-	count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
-	bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]);
+	leaf = blk->bp->b_addr;
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = xfs_dir3_leaf_ents_p(leaf);
+	xfs_dir3_leaf_check(state->args->dp->i_mount, blk->bp);
+
+	count = leafhdr.count - leafhdr.stale;
+	bytes = xfs_dir3_leaf_hdr_size(leaf) + count * sizeof(ents[0]);
 	if (bytes > (state->blocksize >> 1)) {
 		/*
 		 * Blk over 50%, don't try to join.
@@ -1231,9 +1462,9 @@ xfs_dir2_leafn_toosmall(
 		 * Make altpath point to the block we want to keep and
 		 * path point to the block we want to drop (this one).
 		 */
-		forward = (info->forw != 0);
+		forward = (leafhdr.forw != 0);
 		memcpy(&state->altpath, &state->path, sizeof(state->path));
-		error = xfs_da_path_shift(state, &state->altpath, forward, 0,
+		error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
 			&rval);
 		if (error)
 			return error;
@@ -1247,15 +1478,17 @@ xfs_dir2_leafn_toosmall(
 	 * We prefer coalescing with the lower numbered sibling so as
 	 * to shrink a directory over time.
 	 */
-	forward = be32_to_cpu(info->forw) < be32_to_cpu(info->back);
+	forward = leafhdr.forw < leafhdr.back;
 	for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
-		blkno = forward ? be32_to_cpu(info->forw) : be32_to_cpu(info->back);
+		struct xfs_dir3_icleaf_hdr hdr2;
+
+		blkno = forward ? leafhdr.forw : leafhdr.back;
 		if (blkno == 0)
 			continue;
 		/*
 		 * Read the sibling leaf block.
 		 */
-		error = xfs_dir2_leafn_read(state->args->trans, state->args->dp,
+		error = xfs_dir3_leafn_read(state->args->trans, state->args->dp,
 					    blkno, -1, &bp);
 		if (error)
 			return error;
@@ -1263,13 +1496,15 @@ xfs_dir2_leafn_toosmall(
 		/*
 		 * Count bytes in the two blocks combined.
 		 */
-		leaf = (xfs_dir2_leaf_t *)info;
-		count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
+		count = leafhdr.count - leafhdr.stale;
 		bytes = state->blocksize - (state->blocksize >> 2);
+
 		leaf = bp->b_addr;
-		ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
-		count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale);
-		bytes -= count * (uint)sizeof(leaf->ents[0]);
+		xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf);
+		ents = xfs_dir3_leaf_ents_p(leaf);
+		count += hdr2.count - hdr2.stale;
+		bytes -= count * sizeof(ents[0]);
+
 		/*
 		 * Fits with at least 25% to spare.
 		 */
@@ -1291,10 +1526,10 @@ xfs_dir2_leafn_toosmall(
 	 */
 	memcpy(&state->altpath, &state->path, sizeof(state->path));
 	if (blkno < blk->blkno)
-		error = xfs_da_path_shift(state, &state->altpath, forward, 0,
+		error = xfs_da3_path_shift(state, &state->altpath, forward, 0,
 			&rval);
 	else
-		error = xfs_da_path_shift(state, &state->path, forward, 0,
+		error = xfs_da3_path_shift(state, &state->path, forward, 0,
 			&rval);
 	if (error) {
 		return error;
@@ -1316,34 +1551,53 @@ xfs_dir2_leafn_unbalance(
 	xfs_da_args_t		*args;		/* operation arguments */
 	xfs_dir2_leaf_t		*drop_leaf;	/* dead leaf structure */
 	xfs_dir2_leaf_t		*save_leaf;	/* surviving leaf structure */
+	struct xfs_dir3_icleaf_hdr savehdr;
+	struct xfs_dir3_icleaf_hdr drophdr;
+	struct xfs_dir2_leaf_entry *sents;
+	struct xfs_dir2_leaf_entry *dents;
 
 	args = state->args;
 	ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
 	ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
 	drop_leaf = drop_blk->bp->b_addr;
 	save_leaf = save_blk->bp->b_addr;
-	ASSERT(drop_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
-	ASSERT(save_leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+
+	xfs_dir3_leaf_hdr_from_disk(&savehdr, save_leaf);
+	xfs_dir3_leaf_hdr_from_disk(&drophdr, drop_leaf);
+	sents = xfs_dir3_leaf_ents_p(save_leaf);
+	dents = xfs_dir3_leaf_ents_p(drop_leaf);
+
 	/*
 	 * If there are any stale leaf entries, take this opportunity
 	 * to purge them.
 	 */
-	if (drop_leaf->hdr.stale)
-		xfs_dir2_leaf_compact(args, drop_blk->bp);
-	if (save_leaf->hdr.stale)
-		xfs_dir2_leaf_compact(args, save_blk->bp);
+	if (drophdr.stale)
+		xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp);
+	if (savehdr.stale)
+		xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp);
+
 	/*
 	 * Move the entries from drop to the appropriate end of save.
 	 */
-	drop_blk->hashval = be32_to_cpu(drop_leaf->ents[be16_to_cpu(drop_leaf->hdr.count) - 1].hashval);
+	drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
 	if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp))
-		xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0,
-			be16_to_cpu(drop_leaf->hdr.count));
+		xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+					save_blk->bp, &savehdr, sents, 0,
+					drophdr.count);
 	else
-		xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp,
-			be16_to_cpu(save_leaf->hdr.count), be16_to_cpu(drop_leaf->hdr.count));
-	save_blk->hashval = be32_to_cpu(save_leaf->ents[be16_to_cpu(save_leaf->hdr.count) - 1].hashval);
-	xfs_dir2_leafn_check(args->dp, save_blk->bp);
+		xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
+					save_blk->bp, &savehdr, sents,
+					savehdr.count, drophdr.count);
+	save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
+
+	/* log the changes made when moving the entries */
+	xfs_dir3_leaf_hdr_to_disk(save_leaf, &savehdr);
+	xfs_dir3_leaf_hdr_to_disk(drop_leaf, &drophdr);
+	xfs_dir3_leaf_log_header(args->trans, save_blk->bp);
+	xfs_dir3_leaf_log_header(args->trans, drop_blk->bp);
+
+	xfs_dir3_leaf_check(args->dp->i_mount, save_blk->bp);
+	xfs_dir3_leaf_check(args->dp->i_mount, drop_blk->bp);
 }
 
 /*
@@ -1372,7 +1626,7 @@ xfs_dir2_node_addname(
 	 * Look up the name.  We're not supposed to find it, but
 	 * this gives us the insertion point.
 	 */
-	error = xfs_da_node_lookup_int(state, &rval);
+	error = xfs_da3_node_lookup_int(state, &rval);
 	if (error)
 		rval = error;
 	if (rval != ENOENT) {
@@ -1398,7 +1652,7 @@ xfs_dir2_node_addname(
 		 * It worked, fix the hash values up the btree.
 		 */
 		if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
-			xfs_da_fixhashpath(state, &state->path);
+			xfs_da3_fixhashpath(state, &state->path);
 	} else {
 		/*
 		 * It didn't work, we need to split the leaf block.
@@ -1410,7 +1664,7 @@ xfs_dir2_node_addname(
 		/*
 		 * Split the leaf block and insert the new entry.
 		 */
-		rval = xfs_da_split(state);
+		rval = xfs_da3_split(state);
 	}
 done:
 	xfs_da_state_free(state);
@@ -1447,6 +1701,9 @@ xfs_dir2_node_addname_int(
 	int			needscan;	/* need to rescan data frees */
 	__be16			*tagp;		/* data entry tag pointer */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	__be16			*bests;
+	struct xfs_dir3_icfree_hdr freehdr;
+	struct xfs_dir2_data_free *bf;
 
 	dp = args->dp;
 	mp = dp->i_mount;
@@ -1464,36 +1721,37 @@ xfs_dir2_node_addname_int(
 		 */
 		ifbno = fblk->blkno;
 		free = fbp->b_addr;
-		ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
 		findex = fblk->index;
+		bests = xfs_dir3_free_bests_p(mp, free);
+		xfs_dir3_free_hdr_from_disk(&freehdr, free);
+
 		/*
 		 * This means the free entry showed that the data block had
 		 * space for our entry, so we remembered it.
 		 * Use that data block.
 		 */
 		if (findex >= 0) {
-			ASSERT(findex < be32_to_cpu(free->hdr.nvalid));
-			ASSERT(be16_to_cpu(free->bests[findex]) != NULLDATAOFF);
-			ASSERT(be16_to_cpu(free->bests[findex]) >= length);
-			dbno = be32_to_cpu(free->hdr.firstdb) + findex;
-		}
-		/*
-		 * The data block looked at didn't have enough room.
-		 * We'll start at the beginning of the freespace entries.
-		 */
-		else {
+			ASSERT(findex < freehdr.nvalid);
+			ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
+			ASSERT(be16_to_cpu(bests[findex]) >= length);
+			dbno = freehdr.firstdb + findex;
+		} else {
+			/*
+			 * The data block looked at didn't have enough room.
+			 * We'll start at the beginning of the freespace entries.
+			 */
 			dbno = -1;
 			findex = 0;
 		}
-	}
-	/*
-	 * Didn't come in with a freespace block, so don't have a data block.
-	 */
-	else {
+	} else {
+		/*
+		 * Didn't come in with a freespace block, so no data block.
+		 */
 		ifbno = dbno = -1;
 		fbp = NULL;
 		findex = 0;
 	}
+
 	/*
 	 * If we don't have a data block yet, we're going to scan the
 	 * freespace blocks looking for one.  Figure out what the
@@ -1547,20 +1805,26 @@ xfs_dir2_node_addname_int(
 			if (!fbp)
 				continue;
 			free = fbp->b_addr;
-			ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
 			findex = 0;
 		}
 		/*
 		 * Look at the current free entry.  Is it good enough?
+		 *
+		 * The bests initialisation should be where the bufer is read in
+		 * the above branch. But gcc is too stupid to realise that bests
+		 * and the freehdr are actually initialised if they are placed
+		 * there, so we have to do it here to avoid warnings. Blech.
 		 */
-		if (be16_to_cpu(free->bests[findex]) != NULLDATAOFF &&
-		    be16_to_cpu(free->bests[findex]) >= length)
-			dbno = be32_to_cpu(free->hdr.firstdb) + findex;
+		bests = xfs_dir3_free_bests_p(mp, free);
+		xfs_dir3_free_hdr_from_disk(&freehdr, free);
+		if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
+		    be16_to_cpu(bests[findex]) >= length)
+			dbno = freehdr.firstdb + findex;
 		else {
 			/*
 			 * Are we done with the freeblock?
 			 */
-			if (++findex == be32_to_cpu(free->hdr.nvalid)) {
+			if (++findex == freehdr.nvalid) {
 				/*
 				 * Drop the block.
 				 */
@@ -1588,7 +1852,7 @@ xfs_dir2_node_addname_int(
 		if (unlikely((error = xfs_dir2_grow_inode(args,
 							 XFS_DIR2_DATA_SPACE,
 							 &dbno)) ||
-		    (error = xfs_dir2_data_init(args, dbno, &dbp))))
+		    (error = xfs_dir3_data_init(args, dbno, &dbp))))
 			return error;
 
 		/*
@@ -1614,11 +1878,11 @@ xfs_dir2_node_addname_int(
 		 * If there wasn't a freespace block, the read will
 		 * return a NULL fbp.  Allocate and initialize a new one.
 		 */
-		if( fbp == NULL ) {
-			if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
-							&fbno))) {
+		if (!fbp) {
+			error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
+						    &fbno);
+			if (error)
 				return error;
-			}
 
 			if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
 				xfs_alert(mp,
@@ -1646,27 +1910,22 @@ xfs_dir2_node_addname_int(
 			/*
 			 * Get a buffer for the new block.
 			 */
-			error = xfs_da_get_buf(tp, dp,
-					       xfs_dir2_db_to_da(mp, fbno),
-					       -1, &fbp, XFS_DATA_FORK);
+			error = xfs_dir3_free_get_buf(tp, dp, fbno, &fbp);
 			if (error)
 				return error;
-			fbp->b_ops = &xfs_dir2_free_buf_ops;
+			free = fbp->b_addr;
+			bests = xfs_dir3_free_bests_p(mp, free);
+			xfs_dir3_free_hdr_from_disk(&freehdr, free);
 
 			/*
-			 * Initialize the new block to be empty, and remember
-			 * its first slot as our empty slot.
+			 * Remember the first slot as our empty slot.
 			 */
-			free = fbp->b_addr;
-			free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC);
-			free->hdr.firstdb = cpu_to_be32(
-				(fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
-				xfs_dir2_free_max_bests(mp));
-			free->hdr.nvalid = 0;
-			free->hdr.nused = 0;
+			freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
+					xfs_dir3_free_max_bests(mp);
 		} else {
 			free = fbp->b_addr;
-			ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
+			bests = xfs_dir3_free_bests_p(mp, free);
+			xfs_dir3_free_hdr_from_disk(&freehdr, free);
 		}
 
 		/*
@@ -1677,20 +1936,21 @@ xfs_dir2_node_addname_int(
 		 * If it's after the end of the current entries in the
 		 * freespace block, extend that table.
 		 */
-		if (findex >= be32_to_cpu(free->hdr.nvalid)) {
-			ASSERT(findex < xfs_dir2_free_max_bests(mp));
-			free->hdr.nvalid = cpu_to_be32(findex + 1);
+		if (findex >= freehdr.nvalid) {
+			ASSERT(findex < xfs_dir3_free_max_bests(mp));
+			freehdr.nvalid = findex + 1;
 			/*
 			 * Tag new entry so nused will go up.
 			 */
-			free->bests[findex] = cpu_to_be16(NULLDATAOFF);
+			bests[findex] = cpu_to_be16(NULLDATAOFF);
 		}
 		/*
 		 * If this entry was for an empty data block
 		 * (this should always be true) then update the header.
 		 */
-		if (free->bests[findex] == cpu_to_be16(NULLDATAOFF)) {
-			be32_add_cpu(&free->hdr.nused, 1);
+		if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
+			freehdr.nused++;
+			xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr);
 			xfs_dir2_free_log_header(tp, fbp);
 		}
 		/*
@@ -1699,7 +1959,8 @@ xfs_dir2_node_addname_int(
 		 * change again.
 		 */
 		hdr = dbp->b_addr;
-		free->bests[findex] = hdr->bestfree[0].length;
+		bf = xfs_dir3_data_bestfree_p(hdr);
+		bests[findex] = bf[0].length;
 		logfree = 1;
 	}
 	/*
@@ -1715,19 +1976,20 @@ xfs_dir2_node_addname_int(
 		/*
 		 * Read the data block in.
 		 */
-		error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
+		error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
 					   -1, &dbp);
 		if (error)
 			return error;
 		hdr = dbp->b_addr;
+		bf = xfs_dir3_data_bestfree_p(hdr);
 		logfree = 0;
 	}
-	ASSERT(be16_to_cpu(hdr->bestfree[0].length) >= length);
+	ASSERT(be16_to_cpu(bf[0].length) >= length);
 	/*
 	 * Point to the existing unused space.
 	 */
 	dup = (xfs_dir2_data_unused_t *)
-	      ((char *)hdr + be16_to_cpu(hdr->bestfree[0].offset));
+	      ((char *)hdr + be16_to_cpu(bf[0].offset));
 	needscan = needlog = 0;
 	/*
 	 * Mark the first part of the unused space, inuse for us.
@@ -1758,8 +2020,9 @@ xfs_dir2_node_addname_int(
 	/*
 	 * If the freespace entry is now wrong, update it.
 	 */
-	if (be16_to_cpu(free->bests[findex]) != be16_to_cpu(hdr->bestfree[0].length)) {
-		free->bests[findex] = hdr->bestfree[0].length;
+	bests = xfs_dir3_free_bests_p(mp, free); /* gcc is so stupid */
+	if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
+		bests[findex] = bf[0].length;
 		logfree = 1;
 	}
 	/*
@@ -1777,7 +2040,7 @@ xfs_dir2_node_addname_int(
 
 /*
  * Lookup an entry in a node-format directory.
- * All the real work happens in xfs_da_node_lookup_int.
+ * All the real work happens in xfs_da3_node_lookup_int.
  * The only real output is the inode number of the entry.
  */
 int						/* error */
@@ -1802,7 +2065,7 @@ xfs_dir2_node_lookup(
 	/*
 	 * Fill in the path to the entry in the cursor.
 	 */
-	error = xfs_da_node_lookup_int(state, &rval);
+	error = xfs_da3_node_lookup_int(state, &rval);
 	if (error)
 		rval = error;
 	else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) {
@@ -1857,7 +2120,7 @@ xfs_dir2_node_removename(
 	/*
 	 * Look up the entry we're deleting, set up the cursor.
 	 */
-	error = xfs_da_node_lookup_int(state, &rval);
+	error = xfs_da3_node_lookup_int(state, &rval);
 	if (error)
 		rval = error;
 	/*
@@ -1881,12 +2144,12 @@ xfs_dir2_node_removename(
 	/*
 	 * Fix the hash values up the btree.
 	 */
-	xfs_da_fixhashpath(state, &state->path);
+	xfs_da3_fixhashpath(state, &state->path);
 	/*
 	 * If we need to join leaf blocks, do it.
 	 */
 	if (rval && state->path.active > 1)
-		error = xfs_da_join(state);
+		error = xfs_da3_join(state);
 	/*
 	 * If no errors so far, try conversion to leaf format.
 	 */
@@ -1928,7 +2191,7 @@ xfs_dir2_node_replace(
 	/*
 	 * Lookup the entry to change in the btree.
 	 */
-	error = xfs_da_node_lookup_int(state, &rval);
+	error = xfs_da3_node_lookup_int(state, &rval);
 	if (error) {
 		rval = error;
 	}
@@ -1937,19 +2200,22 @@ xfs_dir2_node_replace(
 	 * and locked it.  But paranoia is good.
 	 */
 	if (rval == EEXIST) {
+		struct xfs_dir2_leaf_entry *ents;
 		/*
 		 * Find the leaf entry.
 		 */
 		blk = &state->path.blk[state->path.active - 1];
 		ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
 		leaf = blk->bp->b_addr;
-		lep = &leaf->ents[blk->index];
+		ents = xfs_dir3_leaf_ents_p(leaf);
+		lep = &ents[blk->index];
 		ASSERT(state->extravalid);
 		/*
 		 * Point to the data entry.
 		 */
 		hdr = state->extrablk.bp->b_addr;
-		ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
+		ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+		       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
 		dep = (xfs_dir2_data_entry_t *)
 		      ((char *)hdr +
 		       xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address)));
@@ -1995,6 +2261,7 @@ xfs_dir2_node_trim_free(
 	xfs_dir2_free_t		*free;		/* freespace structure */
 	xfs_mount_t		*mp;		/* filesystem mount point */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir3_icfree_hdr freehdr;
 
 	dp = args->dp;
 	mp = dp->i_mount;
@@ -2012,11 +2279,12 @@ xfs_dir2_node_trim_free(
 	if (!bp)
 		return 0;
 	free = bp->b_addr;
-	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
+	xfs_dir3_free_hdr_from_disk(&freehdr, free);
+
 	/*
 	 * If there are used entries, there's nothing to do.
 	 */
-	if (be32_to_cpu(free->hdr.nused) > 0) {
+	if (freehdr.nused > 0) {
 		xfs_trans_brelse(tp, bp);
 		*rvalp = 0;
 		return 0;
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 7da79f6515f..0511cda4a71 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -30,11 +30,11 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
 				const unsigned char *name, int len);
 
 /* xfs_dir2_block.c */
-extern const struct xfs_buf_ops xfs_dir2_block_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
 
 extern int xfs_dir2_block_addname(struct xfs_da_args *args);
-extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
-		xfs_off_t *offset, filldir_t filldir);
+extern int xfs_dir2_block_getdents(struct xfs_inode *dp,
+		struct dir_context *ctx);
 extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
 extern int xfs_dir2_block_removename(struct xfs_da_args *args);
 extern int xfs_dir2_block_replace(struct xfs_da_args *args);
@@ -43,17 +43,18 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
 
 /* xfs_dir2_data.c */
 #ifdef DEBUG
-#define	xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp);
+#define	xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp);
 #else
-#define	xfs_dir2_data_check(dp,bp)
+#define	xfs_dir3_data_check(dp,bp)
 #endif
 
-extern const struct xfs_buf_ops xfs_dir2_data_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
 
-extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
-extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
 		xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
-extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
+extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
 		xfs_dablk_t bno, xfs_daddr_t mapped_bno);
 
 extern struct xfs_dir2_data_free *
@@ -61,7 +62,7 @@ xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
 		struct xfs_dir2_data_unused *dup, int *loghead);
 extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
 		struct xfs_dir2_data_hdr *hdr, int *loghead);
-extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
+extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
 		struct xfs_buf **bpp);
 extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp,
 		struct xfs_dir2_data_entry *dep);
@@ -77,24 +78,26 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
 		xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
 
 /* xfs_dir2_leaf.c */
-extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
 
-extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
 		xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
 extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
 		struct xfs_buf *dbp);
 extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
-extern void xfs_dir2_leaf_compact(struct xfs_da_args *args,
-		struct xfs_buf *bp);
-extern void xfs_dir2_leaf_compact_x1(struct xfs_buf *bp, int *indexp,
+extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
+		struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp);
+extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
+		struct xfs_dir2_leaf_entry *ents, int *indexp,
 		int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
-extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent,
-		size_t bufsize, xfs_off_t *offset, filldir_t filldir);
-extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno,
-		struct xfs_buf **bpp, int magic);
-extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
+extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx,
+		size_t bufsize);
+extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
+		struct xfs_buf **bpp, __uint16_t magic);
+extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
 		int first, int last);
-extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp,
+extern void xfs_dir3_leaf_log_header(struct xfs_trans *tp,
 		struct xfs_buf *bp);
 extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
 extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
@@ -104,11 +107,18 @@ extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
 extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args,
 		struct xfs_buf *lbp, xfs_dir2_db_t db);
 extern struct xfs_dir2_leaf_entry *
-xfs_dir2_leaf_find_entry(struct xfs_dir2_leaf *leaf, int index, int compact,
-		int lowstale, int highstale,
-		int *lfloglow, int *lfloghigh);
+xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
+		struct xfs_dir2_leaf_entry *ents, int index, int compact,
+		int lowstale, int highstale, int *lfloglow, int *lfloghigh);
 extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
 
+extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to,
+		struct xfs_dir2_leaf *from);
+extern void xfs_dir3_leaf_hdr_to_disk(struct xfs_dir2_leaf *to,
+		struct xfs_dir3_icleaf_hdr *from);
+extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp,
+		struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
+
 /* xfs_dir2_node.c */
 extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
 		struct xfs_buf *lbp);
@@ -143,8 +153,7 @@ extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
 		int size, xfs_dir2_sf_hdr_t *sfhp);
 extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
 extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
-extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent,
-		xfs_off_t *offset, filldir_t filldir);
+extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx);
 extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
 extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
 extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 1b9fc3ec7e4..97676a347da 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -278,7 +278,7 @@ xfs_dir2_block_to_sf(
 	 * Set up to loop over the block's entries.
 	 */
 	btp = xfs_dir2_block_tail_p(mp, hdr);
-	ptr = (char *)(hdr + 1);
+	ptr = (char *)xfs_dir3_data_entry_p(hdr);
 	endptr = (char *)xfs_dir2_block_leaf_p(btp);
 	sfep = xfs_dir2_sf_firstentry(sfp);
 	/*
@@ -535,7 +535,7 @@ xfs_dir2_sf_addname_hard(
 	 * to insert the new entry.
 	 * If it's going to end up at the end then oldsfep will point there.
 	 */
-	for (offset = XFS_DIR2_DATA_FIRST_OFFSET,
+	for (offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount),
 	      oldsfep = xfs_dir2_sf_firstentry(oldsfp),
 	      add_datasize = xfs_dir2_data_entsize(args->namelen),
 	      eof = (char *)oldsfep == &buf[old_isize];
@@ -617,7 +617,7 @@ xfs_dir2_sf_addname_pick(
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 	size = xfs_dir2_data_entsize(args->namelen);
-	offset = XFS_DIR2_DATA_FIRST_OFFSET;
+	offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
 	sfep = xfs_dir2_sf_firstentry(sfp);
 	holefit = 0;
 	/*
@@ -688,7 +688,7 @@ xfs_dir2_sf_check(
 	dp = args->dp;
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	offset = XFS_DIR2_DATA_FIRST_OFFSET;
+	offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount);
 	ino = xfs_dir2_sf_get_parent_ino(sfp);
 	i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
 
@@ -768,9 +768,7 @@ xfs_dir2_sf_create(
 int						/* error */
 xfs_dir2_sf_getdents(
 	xfs_inode_t		*dp,		/* incore directory inode */
-	void			*dirent,
-	xfs_off_t		*offset,
-	filldir_t		filldir)
+	struct dir_context	*ctx)
 {
 	int			i;		/* shortform entry number */
 	xfs_mount_t		*mp;		/* filesystem mount point */
@@ -802,7 +800,7 @@ xfs_dir2_sf_getdents(
 	/*
 	 * If the block number in the offset is out of range, we're done.
 	 */
-	if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
+	if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
 		return 0;
 
 	/*
@@ -812,29 +810,27 @@ xfs_dir2_sf_getdents(
 	 * mp->m_dirdatablk.
 	 */
 	dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-					     XFS_DIR2_DATA_DOT_OFFSET);
+					     XFS_DIR3_DATA_DOT_OFFSET(mp));
 	dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
-						XFS_DIR2_DATA_DOTDOT_OFFSET);
+						XFS_DIR3_DATA_DOTDOT_OFFSET(mp));
 
 	/*
 	 * Put . entry unless we're starting past it.
 	 */
-	if (*offset <= dot_offset) {
-		if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) {
-			*offset = dot_offset & 0x7fffffff;
+	if (ctx->pos <= dot_offset) {
+		ctx->pos = dot_offset & 0x7fffffff;
+		if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
 			return 0;
-		}
 	}
 
 	/*
 	 * Put .. entry unless we're starting past it.
 	 */
-	if (*offset <= dotdot_offset) {
+	if (ctx->pos <= dotdot_offset) {
 		ino = xfs_dir2_sf_get_parent_ino(sfp);
-		if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
-			*offset = dotdot_offset & 0x7fffffff;
+		ctx->pos = dotdot_offset & 0x7fffffff;
+		if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
 			return 0;
-		}
 	}
 
 	/*
@@ -845,21 +841,20 @@ xfs_dir2_sf_getdents(
 		off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
 				xfs_dir2_sf_get_offset(sfep));
 
-		if (*offset > off) {
+		if (ctx->pos > off) {
 			sfep = xfs_dir2_sf_nextentry(sfp, sfep);
 			continue;
 		}
 
 		ino = xfs_dir2_sfe_get_ino(sfp, sfep);
-		if (filldir(dirent, (char *)sfep->name, sfep->namelen,
-			    off & 0x7fffffff, ino, DT_UNKNOWN)) {
-			*offset = off & 0x7fffffff;
+		ctx->pos = off & 0x7fffffff;
+		if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen,
+			    ino, DT_UNKNOWN))
 			return 0;
-		}
 		sfep = xfs_dir2_sf_nextentry(sfp, sfep);
 	}
 
-	*offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+	ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
 			0x7fffffff;
 	return 0;
 }
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 9e1bf5294c9..f01012de06d 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -36,6 +36,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_trans_priv.h"
 #include "xfs_qm.h"
+#include "xfs_cksum.h"
 #include "xfs_trace.h"
 
 /*
@@ -85,17 +86,23 @@ xfs_qm_dqdestroy(
  */
 void
 xfs_qm_adjust_dqlimits(
-	xfs_mount_t		*mp,
-	xfs_disk_dquot_t	*d)
+	struct xfs_mount	*mp,
+	struct xfs_dquot	*dq)
 {
-	xfs_quotainfo_t		*q = mp->m_quotainfo;
+	struct xfs_quotainfo	*q = mp->m_quotainfo;
+	struct xfs_disk_dquot	*d = &dq->q_core;
+	int			prealloc = 0;
 
 	ASSERT(d->d_id);
 
-	if (q->qi_bsoftlimit && !d->d_blk_softlimit)
+	if (q->qi_bsoftlimit && !d->d_blk_softlimit) {
 		d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit);
-	if (q->qi_bhardlimit && !d->d_blk_hardlimit)
+		prealloc = 1;
+	}
+	if (q->qi_bhardlimit && !d->d_blk_hardlimit) {
 		d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit);
+		prealloc = 1;
+	}
 	if (q->qi_isoftlimit && !d->d_ino_softlimit)
 		d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit);
 	if (q->qi_ihardlimit && !d->d_ino_hardlimit)
@@ -104,6 +111,9 @@ xfs_qm_adjust_dqlimits(
 		d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit);
 	if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
 		d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit);
+
+	if (prealloc)
+		xfs_dquot_set_prealloc_limits(dq);
 }
 
 /*
@@ -239,6 +249,11 @@ xfs_qm_init_dquot_blk(
 		d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
 		d->dd_diskdq.d_id = cpu_to_be32(curid);
 		d->dd_diskdq.d_flags = type;
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+			xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+					 XFS_DQUOT_CRC_OFF);
+		}
 	}
 
 	xfs_trans_dquot_buf(tp, bp,
@@ -248,25 +263,95 @@ xfs_qm_init_dquot_blk(
 	xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
 }
 
-static void
+/*
+ * Initialize the dynamic speculative preallocation thresholds. The lo/hi
+ * watermarks correspond to the soft and hard limits by default. If a soft limit
+ * is not specified, we use 95% of the hard limit.
+ */
+void
+xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
+{
+	__uint64_t space;
+
+	dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+	dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit);
+	if (!dqp->q_prealloc_lo_wmark) {
+		dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark;
+		do_div(dqp->q_prealloc_lo_wmark, 100);
+		dqp->q_prealloc_lo_wmark *= 95;
+	}
+
+	space = dqp->q_prealloc_hi_wmark;
+
+	do_div(space, 100);
+	dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space;
+	dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3;
+	dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
+}
+
+STATIC bool
+xfs_dquot_buf_verify_crc(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
+	int			ndquots;
+	int			i;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return true;
+
+	/*
+	 * if we are in log recovery, the quota subsystem has not been
+	 * initialised so we have no quotainfo structure. In that case, we need
+	 * to manually calculate the number of dquots in the buffer.
+	 */
+	if (mp->m_quotainfo)
+		ndquots = mp->m_quotainfo->qi_dqperchunk;
+	else
+		ndquots = xfs_qm_calc_dquots_per_chunk(mp,
+					XFS_BB_TO_FSB(mp, bp->b_length));
+
+	for (i = 0; i < ndquots; i++, d++) {
+		if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
+				 XFS_DQUOT_CRC_OFF))
+			return false;
+		if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
+			return false;
+	}
+	return true;
+}
+
+STATIC bool
 xfs_dquot_buf_verify(
+	struct xfs_mount	*mp,
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
 	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
-	struct xfs_disk_dquot	*ddq;
 	xfs_dqid_t		id = 0;
+	int			ndquots;
 	int			i;
 
 	/*
+	 * if we are in log recovery, the quota subsystem has not been
+	 * initialised so we have no quotainfo structure. In that case, we need
+	 * to manually calculate the number of dquots in the buffer.
+	 */
+	if (mp->m_quotainfo)
+		ndquots = mp->m_quotainfo->qi_dqperchunk;
+	else
+		ndquots = xfs_qm_calc_dquots_per_chunk(mp, bp->b_length);
+
+	/*
 	 * On the first read of the buffer, verify that each dquot is valid.
 	 * We don't know what the id of the dquot is supposed to be, just that
 	 * they should be increasing monotonically within the buffer. If the
 	 * first id is corrupt, then it will fail on the second dquot in the
 	 * buffer so corruptions could point to the wrong dquot in this case.
 	 */
-	for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
-		int	error;
+	for (i = 0; i < ndquots; i++) {
+		struct xfs_disk_dquot	*ddq;
+		int			error;
 
 		ddq = &d[i].dd_diskdq;
 
@@ -274,27 +359,41 @@ xfs_dquot_buf_verify(
 			id = be32_to_cpu(ddq->d_id);
 
 		error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
-					"xfs_dquot_read_verify");
-		if (error) {
-			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d);
-			xfs_buf_ioerror(bp, EFSCORRUPTED);
-			break;
-		}
+				       "xfs_dquot_buf_verify");
+		if (error)
+			return false;
 	}
+	return true;
 }
 
 static void
 xfs_dquot_buf_read_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_dquot_buf_verify(bp);
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
 }
 
+/*
+ * we don't calculate the CRC here as that is done when the dquot is flushed to
+ * the buffer after the update is done. This ensures that the dquot in the
+ * buffer always has an up-to-date CRC value.
+ */
 void
 xfs_dquot_buf_write_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_dquot_buf_verify(bp);
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (!xfs_dquot_buf_verify(mp, bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
 }
 
 const struct xfs_buf_ops xfs_dquot_buf_ops = {
@@ -471,13 +570,13 @@ xfs_qm_dqtobp(
 	xfs_buf_t		**O_bpp,
 	uint			flags)
 {
-	xfs_bmbt_irec_t map;
-	int		nmaps = 1, error;
-	xfs_buf_t	*bp;
-	xfs_inode_t	*quotip = XFS_DQ_TO_QIP(dqp);
-	xfs_mount_t	*mp = dqp->q_mount;
-	xfs_dqid_t	id = be32_to_cpu(dqp->q_core.d_id);
-	xfs_trans_t	*tp = (tpp ? *tpp : NULL);
+	struct xfs_bmbt_irec	map;
+	int			nmaps = 1, error;
+	struct xfs_buf		*bp;
+	struct xfs_inode	*quotip = xfs_dq_to_quota_inode(dqp);
+	struct xfs_mount	*mp = dqp->q_mount;
+	xfs_dqid_t		id = be32_to_cpu(dqp->q_core.d_id);
+	struct xfs_trans	*tp = (tpp ? *tpp : NULL);
 
 	dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
 
@@ -612,15 +711,9 @@ xfs_qm_dqread(
 	if (flags & XFS_QMOPT_DQALLOC) {
 		tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
 		error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
-				XFS_WRITE_LOG_RES(mp) +
-				/*
-				 * Round the chunklen up to the next multiple
-				 * of 128 (buf log item chunk size)).
-				 */
-				BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128,
-				0,
-				XFS_TRANS_PERM_LOG_RES,
-				XFS_WRITE_LOG_COUNT);
+					  XFS_QM_DQALLOC_LOG_RES(mp), 0,
+					  XFS_TRANS_PERM_LOG_RES,
+					  XFS_WRITE_LOG_COUNT);
 		if (error)
 			goto error1;
 		cancelflags = XFS_TRANS_RELEASE_LOG_RES;
@@ -654,6 +747,9 @@ xfs_qm_dqread(
 	dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
 	dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
 
+	/* initialize the dquot speculative prealloc thresholds */
+	xfs_dquot_set_prealloc_limits(dqp);
+
 	/* Mark the buf so that this will stay incore a little longer */
 	xfs_buf_set_ref(bp, XFS_DQUOT_REF);
 
@@ -708,7 +804,7 @@ xfs_qm_dqget(
 	xfs_dquot_t	**O_dqpp) /* OUT : locked incore dquot */
 {
 	struct xfs_quotainfo	*qi = mp->m_quotainfo;
-	struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+	struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
 	struct xfs_dquot	*dqp;
 	int			error;
 
@@ -1041,6 +1137,23 @@ xfs_qm_dqflush(
 					&dqp->q_logitem.qli_item.li_lsn);
 
 	/*
+	 * copy the lsn into the on-disk dquot now while we have the in memory
+	 * dquot here. This can't be done later in the write verifier as we
+	 * can't get access to the log item at that point in time.
+	 *
+	 * We also calculate the CRC here so that the on-disk dquot in the
+	 * buffer always has a valid CRC. This ensures there is no possibility
+	 * of a dquot without an up-to-date CRC getting to disk.
+	 */
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp;
+
+		dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
+		xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
+				 XFS_DQUOT_CRC_OFF);
+	}
+
+	/*
 	 * Attach an iodone routine so that we can remove this dquot from the
 	 * AIL and release the flush lock once the dquot is synced to disk.
 	 */
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index c694a8469c4..b596626249b 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -32,6 +32,13 @@
 struct xfs_mount;
 struct xfs_trans;
 
+enum {
+	XFS_QLOWSP_1_PCNT = 0,
+	XFS_QLOWSP_3_PCNT,
+	XFS_QLOWSP_5_PCNT,
+	XFS_QLOWSP_MAX
+};
+
 /*
  * The incore dquot structure
  */
@@ -51,6 +58,9 @@ typedef struct xfs_dquot {
 	xfs_qcnt_t	 q_res_bcount;	/* total regular nblks used+reserved */
 	xfs_qcnt_t	 q_res_icount;	/* total inos allocd+reserved */
 	xfs_qcnt_t	 q_res_rtbcount;/* total realtime blks used+reserved */
+	xfs_qcnt_t	 q_prealloc_lo_wmark;/* prealloc throttle wmark */
+	xfs_qcnt_t	 q_prealloc_hi_wmark;/* prealloc disabled wmark */
+	int64_t		 q_low_space[XFS_QLOWSP_MAX];
 	struct mutex	 q_qlock;	/* quota lock */
 	struct completion q_flush;	/* flush completion queue */
 	atomic_t          q_pincount;	/* dquot pin count */
@@ -133,10 +143,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
 #define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
 #define XFS_QM_ISPDQ(dqp)	((dqp)->dq_flags & XFS_DQ_PROJ)
 #define XFS_QM_ISGDQ(dqp)	((dqp)->dq_flags & XFS_DQ_GROUP)
-#define XFS_DQ_TO_QINF(dqp)	((dqp)->q_mount->m_quotainfo)
-#define XFS_DQ_TO_QIP(dqp)	(XFS_QM_ISUDQ(dqp) ? \
-				 XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
-				 XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
 
 extern int		xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
 					uint, struct xfs_dquot	**);
@@ -145,14 +151,16 @@ extern int		xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
 extern void		xfs_qm_dqunpin_wait(xfs_dquot_t *);
 extern void		xfs_qm_adjust_dqtimers(xfs_mount_t *,
 					xfs_disk_dquot_t *);
-extern void		xfs_qm_adjust_dqlimits(xfs_mount_t *,
-					xfs_disk_dquot_t *);
+extern void		xfs_qm_adjust_dqlimits(struct xfs_mount *,
+					       struct xfs_dquot *);
 extern int		xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
 					xfs_dqid_t, uint, uint, xfs_dquot_t **);
 extern void		xfs_qm_dqput(xfs_dquot_t *);
 
 extern void		xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
 
+extern void		xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
+
 static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
 {
 	xfs_dqlock(dqp);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 610456054dc..35d3f5b041d 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -66,7 +66,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
 	int i;
 	int64_t fsid;
 
-	if (random32() % randfactor)
+	if (prandom_u32() % randfactor)
 		return 0;
 
 	memcpy(&fsid, fsidp, sizeof(xfs_fsid_t));
@@ -178,7 +178,7 @@ xfs_corruption_error(
 	inst_t			*ra)
 {
 	if (level <= xfs_error_level)
-		xfs_hex_dump(p, 16);
+		xfs_hex_dump(p, 64);
 	xfs_error_report(tag, level, mp, filename, linenum, ra);
 	xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
 }
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index a83611849ce..c585bc64639 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -48,7 +48,7 @@ static int xfs_fileid_length(int fileid_type)
 	case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
 		return 6;
 	}
-	return 255; /* invalid */
+	return FILEID_INVALID;
 }
 
 STATIC int
@@ -90,7 +90,7 @@ xfs_fs_encode_fh(
 	len = xfs_fileid_length(fileid_type);
 	if (*max_len < len) {
 		*max_len = len;
-		return 255;
+		return FILEID_INVALID;
 	}
 	*max_len = len;
 
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index feb36d7551a..452920a3f03 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -50,9 +50,8 @@ xfs_efi_item_free(
  * Freeing the efi requires that we remove it from the AIL if it has already
  * been placed there. However, the EFI may not yet have been placed in the AIL
  * when called by xfs_efi_release() from EFD processing due to the ordering of
- * committed vs unpin operations in bulk insert operations. Hence the
- * test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
- * the EFI.
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the EFI.
  */
 STATIC void
 __xfs_efi_release(
@@ -60,7 +59,7 @@ __xfs_efi_release(
 {
 	struct xfs_ail		*ailp = efip->efi_item.li_ailp;
 
-	if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
+	if (atomic_dec_and_test(&efip->efi_refcount)) {
 		spin_lock(&ailp->xa_lock);
 		/* xfs_trans_ail_delete() drops the AIL lock. */
 		xfs_trans_ail_delete(ailp, &efip->efi_item,
@@ -126,8 +125,8 @@ xfs_efi_item_pin(
  * which the EFI is manipulated during a transaction.  If we are being asked to
  * remove the EFI it's because the transaction has been cancelled and by
  * definition that means the EFI cannot be in the AIL so remove it from the
- * transaction and free it.  Otherwise coordinate with xfs_efi_release() (via
- * XFS_EFI_COMMITTED) to determine who gets to free the EFI.
+ * transaction and free it.  Otherwise coordinate with xfs_efi_release()
+ * to determine who gets to free the EFI.
  */
 STATIC void
 xfs_efi_item_unpin(
@@ -171,19 +170,13 @@ xfs_efi_item_unlock(
 
 /*
  * The EFI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged.  For bulk transaction committed
- * processing, the EFI may be processed but not yet unpinned prior to the EFD
- * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
- * when processing the EFD.
+ * the lsn at which it's been logged.
  */
 STATIC xfs_lsn_t
 xfs_efi_item_committed(
 	struct xfs_log_item	*lip,
 	xfs_lsn_t		lsn)
 {
-	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
-
-	set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
 	return lsn;
 }
 
@@ -241,6 +234,7 @@ xfs_efi_init(
 	efip->efi_format.efi_nextents = nextents;
 	efip->efi_format.efi_id = (__psint_t)(void*)efip;
 	atomic_set(&efip->efi_next_extent, 0);
+	atomic_set(&efip->efi_refcount, 2);
 
 	return efip;
 }
@@ -310,8 +304,14 @@ xfs_efi_release(xfs_efi_log_item_t	*efip,
 		uint			nextents)
 {
 	ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
-	if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
+	if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) {
+		/* recovery needs us to drop the EFI reference, too */
+		if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
+			__xfs_efi_release(efip);
+
 		__xfs_efi_release(efip);
+		/* efip may now have been freed, do not reference it again. */
+	}
 }
 
 static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 375f68e4253..432222418c5 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -114,16 +114,20 @@ typedef struct xfs_efd_log_format_64 {
  * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
  */
 #define	XFS_EFI_RECOVERED	1
-#define	XFS_EFI_COMMITTED	2
 
 /*
- * This is the "extent free intention" log item.  It is used
- * to log the fact that some extents need to be free.  It is
- * used in conjunction with the "extent free done" log item
- * described below.
+ * This is the "extent free intention" log item.  It is used to log the fact
+ * that some extents need to be free.  It is used in conjunction with the
+ * "extent free done" log item described below.
+ *
+ * The EFI is reference counted so that it is not freed prior to both the EFI
+ * and EFD being committed and unpinned. This ensures that when the last
+ * reference goes away the EFI will always be in the AIL as it has been
+ * unpinned, regardless of whether the EFD is processed before or after the EFI.
  */
 typedef struct xfs_efi_log_item {
 	xfs_log_item_t		efi_item;
+	atomic_t		efi_refcount;
 	atomic_t		efi_next_extent;
 	unsigned long		efi_flags;	/* misc flags */
 	xfs_efi_log_format_t	efi_format;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 67284edb84d..de3dc98f4e8 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -36,6 +36,7 @@
 #include "xfs_ioctl.h"
 #include "xfs_trace.h"
 
+#include <linux/aio.h>
 #include <linux/dcache.h>
 #include <linux/falloc.h>
 #include <linux/pagevec.h>
@@ -775,8 +776,6 @@ xfs_file_aio_write(
 	if (ocount == 0)
 		return 0;
 
-	sb_start_write(inode->i_sb);
-
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		ret = -EIO;
 		goto out;
@@ -800,7 +799,6 @@ xfs_file_aio_write(
 	}
 
 out:
-	sb_end_write(inode->i_sb);
 	return ret;
 }
 
@@ -811,7 +809,7 @@ xfs_file_fallocate(
 	loff_t		offset,
 	loff_t		len)
 {
-	struct inode	*inode = file->f_path.dentry->d_inode;
+	struct inode	*inode = file_inode(file);
 	long		error;
 	loff_t		new_size = 0;
 	xfs_flock64_t	bf;
@@ -893,7 +891,7 @@ xfs_dir_open(
 	 */
 	mode = xfs_ilock_map_shared(ip);
 	if (ip->i_d.di_nextents > 0)
-		xfs_dir2_data_readahead(NULL, ip, 0, -1);
+		xfs_dir3_data_readahead(NULL, ip, 0, -1);
 	xfs_iunlock(ip, mode);
 	return 0;
 }
@@ -908,11 +906,10 @@ xfs_file_release(
 
 STATIC int
 xfs_file_readdir(
-	struct file	*filp,
-	void		*dirent,
-	filldir_t	filldir)
+	struct file	*file,
+	struct dir_context *ctx)
 {
-	struct inode	*inode = filp->f_path.dentry->d_inode;
+	struct inode	*inode = file_inode(file);
 	xfs_inode_t	*ip = XFS_I(inode);
 	int		error;
 	size_t		bufsize;
@@ -931,8 +928,7 @@ xfs_file_readdir(
 	 */
 	bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
 
-	error = xfs_readdir(ip, dirent, bufsize,
-				(xfs_off_t *)&filp->f_pos, filldir);
+	error = xfs_readdir(ip, ctx, bufsize);
 	if (error)
 		return -error;
 	return 0;
@@ -1272,8 +1268,7 @@ xfs_seek_data(
 	}
 
 out:
-	if (offset != file->f_pos)
-		file->f_pos = offset;
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out_unlock:
 	xfs_iunlock_map_shared(ip, lock);
@@ -1381,8 +1376,7 @@ out:
 	 * situation in particular.
 	 */
 	offset = min_t(loff_t, offset, isize);
-	if (offset != file->f_pos)
-		file->f_pos = offset;
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out_unlock:
 	xfs_iunlock_map_shared(ip, lock);
@@ -1434,7 +1428,7 @@ const struct file_operations xfs_file_operations = {
 const struct file_operations xfs_dir_file_operations = {
 	.open		= xfs_dir_open,
 	.read		= generic_read_dir,
-	.readdir	= xfs_file_readdir,
+	.iterate	= xfs_file_readdir,
 	.llseek		= generic_file_llseek,
 	.unlocked_ioctl	= xfs_file_ioctl,
 #ifdef CONFIG_COMPAT
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 6dda3f949b0..d0469554539 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_PROJID32	0x0800  /* 32-bit project IDs	*/
 #define XFS_FSOP_GEOM_FLAGS_DIRV2CI	0x1000	/* ASCII only CI names	*/
 #define XFS_FSOP_GEOM_FLAGS_LAZYSB	0x4000	/* lazy superblock counters */
+#define XFS_FSOP_GEOM_FLAGS_V5SB	0x8000	/* version 5 superblock */
 
 
 /*
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 94eaeedc549..614eb0cc360 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -99,7 +99,9 @@ xfs_fs_geometry(
 			(xfs_sb_version_hasattr2(&mp->m_sb) ?
 				XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
 			(xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
-				XFS_FSOP_GEOM_FLAGS_PROJID32 : 0);
+				XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) |
+			(xfs_sb_version_hascrc(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_V5SB : 0);
 		geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
 				mp->m_sb.sb_logsectsize : BBSIZE;
 		geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -174,7 +176,7 @@ xfs_growfs_data_private(
 	if (!bp)
 		return EIO;
 	if (bp->b_error) {
-		int	error = bp->b_error;
+		error = bp->b_error;
 		xfs_buf_relse(bp);
 		return error;
 	}
@@ -247,6 +249,9 @@ xfs_growfs_data_private(
 		tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
 		agf->agf_freeblks = cpu_to_be32(tmpsize);
 		agf->agf_longest = cpu_to_be32(tmpsize);
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid);
+
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
 		if (error)
@@ -265,6 +270,11 @@ xfs_growfs_data_private(
 		}
 
 		agfl = XFS_BUF_TO_AGFL(bp);
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+			agfl->agfl_seqno = cpu_to_be32(agno);
+			uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid);
+		}
 		for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
 			agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
 
@@ -296,8 +306,11 @@ xfs_growfs_data_private(
 		agi->agi_freecount = 0;
 		agi->agi_newino = cpu_to_be32(NULLAGINO);
 		agi->agi_dirino = cpu_to_be32(NULLAGINO);
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid);
 		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
 			agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
 		if (error)
@@ -316,7 +329,13 @@ xfs_growfs_data_private(
 			goto error0;
 		}
 
-		xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1,
+						agno, XFS_BTREE_CRC_BLOCKS);
+		else
+			xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1,
+						agno, 0);
+
 		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
 		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
 		arec->ar_blockcount = cpu_to_be32(
@@ -339,7 +358,13 @@ xfs_growfs_data_private(
 			goto error0;
 		}
 
-		xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0);
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1,
+						agno, XFS_BTREE_CRC_BLOCKS);
+		else
+			xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1,
+						agno, 0);
+
 		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
 		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
 		arec->ar_blockcount = cpu_to_be32(
@@ -363,7 +388,12 @@ xfs_growfs_data_private(
 			goto error0;
 		}
 
-		xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0);
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0,
+						agno, XFS_BTREE_CRC_BLOCKS);
+		else
+			xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0,
+						agno, 0);
 
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
@@ -709,8 +739,8 @@ xfs_fs_log_dummy(
 	int		error;
 
 	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
-	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
-					XFS_DEFAULT_LOG_COUNT);
+	error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0,
+				  XFS_DEFAULT_LOG_COUNT);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
 		return error;
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index a815412eab8..7a0c17d7ec0 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -36,6 +36,9 @@
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_bmap.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+#include "xfs_icreate_item.h"
 
 
 /*
@@ -148,12 +151,16 @@ xfs_check_agi_freecount(
 #endif
 
 /*
- * Initialise a new set of inodes.
+ * Initialise a new set of inodes. When called without a transaction context
+ * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
+ * than logging them (which in a transaction context puts them into the AIL
+ * for writeback rather than the xfsbufd queue).
  */
-STATIC int
+int
 xfs_ialloc_inode_init(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
+	struct list_head	*buffer_list,
 	xfs_agnumber_t		agno,
 	xfs_agblock_t		agbno,
 	xfs_agblock_t		length,
@@ -165,6 +172,7 @@ xfs_ialloc_inode_init(
 	int			version;
 	int			i, j;
 	xfs_daddr_t		d;
+	xfs_ino_t		ino = 0;
 
 	/*
 	 * Loop over the new block(s), filling in the inodes.
@@ -183,13 +191,41 @@ xfs_ialloc_inode_init(
 	}
 
 	/*
-	 * Figure out what version number to use in the inodes we create.
-	 * If the superblock version has caught up to the one that supports
-	 * the new inode format, then use the new inode version.  Otherwise
-	 * use the old version so that old kernels will continue to be
-	 * able to use the file system.
+	 * Figure out what version number to use in the inodes we create.  If
+	 * the superblock version has caught up to the one that supports the new
+	 * inode format, then use the new inode version.  Otherwise use the old
+	 * version so that old kernels will continue to be able to use the file
+	 * system.
+	 *
+	 * For v3 inodes, we also need to write the inode number into the inode,
+	 * so calculate the first inode number of the chunk here as
+	 * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
+	 * across multiple filesystem blocks (such as a cluster) and so cannot
+	 * be used in the cluster buffer loop below.
+	 *
+	 * Further, because we are writing the inode directly into the buffer
+	 * and calculating a CRC on the entire inode, we have ot log the entire
+	 * inode so that the entire range the CRC covers is present in the log.
+	 * That means for v3 inode we log the entire buffer rather than just the
+	 * inode cores.
 	 */
-	if (xfs_sb_version_hasnlink(&mp->m_sb))
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		version = 3;
+		ino = XFS_AGINO_TO_INO(mp, agno,
+				       XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
+
+		/*
+		 * log the initialisation that is about to take place as an
+		 * logical operation. This means the transaction does not
+		 * need to log the physical changes to the inode buffers as log
+		 * recovery will know what initialisation is actually needed.
+		 * Hence we only need to log the buffers as "ordered" buffers so
+		 * they track in the AIL as if they were physically logged.
+		 */
+		if (tp)
+			xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp),
+					mp->m_sb.sb_inodesize, length, gen);
+	} else if (xfs_sb_version_hasnlink(&mp->m_sb))
 		version = 2;
 	else
 		version = 1;
@@ -204,27 +240,58 @@ xfs_ialloc_inode_init(
 					 XBF_UNMAPPED);
 		if (!fbuf)
 			return ENOMEM;
-		/*
-		 * Initialize all inodes in this buffer and then log them.
-		 *
-		 * XXX: It would be much better if we had just one transaction
-		 *	to log a whole cluster of inodes instead of all the
-		 *	individual transactions causing a lot of log traffic.
-		 */
+
+		/* Initialize the inode buffers and log them appropriately. */
 		fbuf->b_ops = &xfs_inode_buf_ops;
-		xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
+		xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
 		for (i = 0; i < ninodes; i++) {
 			int	ioffset = i << mp->m_sb.sb_inodelog;
-			uint	isize = sizeof(struct xfs_dinode);
+			uint	isize = xfs_dinode_size(version);
 
 			free = xfs_make_iptr(mp, fbuf, i);
 			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
 			free->di_version = version;
 			free->di_gen = cpu_to_be32(gen);
 			free->di_next_unlinked = cpu_to_be32(NULLAGINO);
-			xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
+
+			if (version == 3) {
+				free->di_ino = cpu_to_be64(ino);
+				ino++;
+				uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
+				xfs_dinode_calc_crc(mp, free);
+			} else if (tp) {
+				/* just log the inode core */
+				xfs_trans_log_buf(tp, fbuf, ioffset,
+						  ioffset + isize - 1);
+			}
+		}
+
+		if (tp) {
+			/*
+			 * Mark the buffer as an inode allocation buffer so it
+			 * sticks in AIL at the point of this allocation
+			 * transaction. This ensures the they are on disk before
+			 * the tail of the log can be moved past this
+			 * transaction (i.e. by preventing relogging from moving
+			 * it forward in the log).
+			 */
+			xfs_trans_inode_alloc_buf(tp, fbuf);
+			if (version == 3) {
+				/*
+				 * Mark the buffer as ordered so that they are
+				 * not physically logged in the transaction but
+				 * still tracked in the AIL as part of the
+				 * transaction and pin the log appropriately.
+				 */
+				xfs_trans_ordered_buf(tp, fbuf);
+				xfs_trans_log_buf(tp, fbuf, 0,
+						  BBTOB(fbuf->b_length) - 1);
+			}
+		} else {
+			fbuf->b_flags |= XBF_DONE;
+			xfs_buf_delwri_queue(fbuf, buffer_list);
+			xfs_buf_relse(fbuf);
 		}
-		xfs_trans_inode_alloc_buf(tp, fbuf);
 	}
 	return 0;
 }
@@ -269,7 +336,7 @@ xfs_ialloc_ag_alloc(
 	 * First try to allocate inodes contiguous with the last-allocated
 	 * chunk of inodes.  If the filesystem is striped, this will fill
 	 * an entire stripe unit with inodes.
- 	 */
+	 */
 	agi = XFS_BUF_TO_AGI(agbp);
 	newino = be32_to_cpu(agi->agi_newino);
 	agno = be32_to_cpu(agi->agi_seqno);
@@ -279,8 +346,6 @@ xfs_ialloc_ag_alloc(
 		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
 		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 		args.type = XFS_ALLOCTYPE_THIS_BNO;
-		args.mod = args.total = args.wasdel = args.isfl =
-			args.userdata = args.minalignslop = 0;
 		args.prod = 1;
 
 		/*
@@ -333,8 +398,6 @@ xfs_ialloc_ag_alloc(
 		 * Allocate a fixed-size extent of inodes.
 		 */
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
-		args.mod = args.total = args.wasdel = args.isfl =
-			args.userdata = args.minalignslop = 0;
 		args.prod = 1;
 		/*
 		 * Allow space for the inode btree to split.
@@ -372,8 +435,8 @@ xfs_ialloc_ag_alloc(
 	 * rather than a linear progression to prevent the next generation
 	 * number from being easily guessable.
 	 */
-	error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno,
-			args.len, random32());
+	error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
+			args.len, prandom_u32());
 
 	if (error)
 		return error;
@@ -585,8 +648,7 @@ xfs_ialloc_get_rec(
 	struct xfs_btree_cur	*cur,
 	xfs_agino_t		agino,
 	xfs_inobt_rec_incore_t	*rec,
-	int			*done,
-	int			left)
+	int			*done)
 {
 	int                     error;
 	int			i;
@@ -694,12 +756,12 @@ xfs_dialloc_ag(
 		    pag->pagl_leftrec != NULLAGINO &&
 		    pag->pagl_rightrec != NULLAGINO) {
 			error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
-						   &trec, &doneleft, 1);
+						   &trec, &doneleft);
 			if (error)
 				goto error1;
 
 			error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
-						   &rec, &doneright, 0);
+						   &rec, &doneright);
 			if (error)
 				goto error1;
 		} else {
@@ -1457,6 +1519,7 @@ xfs_ialloc_log_agi(
 	/*
 	 * Log the allocation group inode header buffer.
 	 */
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
 	xfs_trans_log_buf(tp, bp, first, last);
 }
 
@@ -1474,19 +1537,23 @@ xfs_check_agi_unlinked(
 #define xfs_check_agi_unlinked(agi)
 #endif
 
-static void
+static bool
 xfs_agi_verify(
 	struct xfs_buf	*bp)
 {
 	struct xfs_mount *mp = bp->b_target->bt_mount;
 	struct xfs_agi	*agi = XFS_BUF_TO_AGI(bp);
-	int		agi_ok;
 
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
+			return false;
 	/*
 	 * Validate the magic number of the agi block.
 	 */
-	agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
-		XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
+	if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+		return false;
+	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
+		return false;
 
 	/*
 	 * during growfs operations, the perag is not fully initialised,
@@ -1494,30 +1561,52 @@ xfs_agi_verify(
 	 * use it by using uncached buffers that don't have the perag attached
 	 * so we can detect and avoid this problem.
 	 */
-	if (bp->b_pag)
-		agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) ==
-						bp->b_pag->pag_agno;
+	if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
+		return false;
 
-	if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
-			XFS_RANDOM_IALLOC_READ_AGI))) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
 	xfs_check_agi_unlinked(agi);
+	return true;
 }
 
 static void
 xfs_agi_read_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_agi_verify(bp);
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	int		agi_ok = 1;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		agi_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+					  offsetof(struct xfs_agi, agi_crc));
+	agi_ok = agi_ok && xfs_agi_verify(bp);
+
+	if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
+			XFS_RANDOM_IALLOC_READ_AGI))) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
 }
 
 static void
 xfs_agi_write_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_agi_verify(bp);
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!xfs_agi_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+			 offsetof(struct xfs_agi, agi_crc));
 }
 
 const struct xfs_buf_ops xfs_agi_buf_ops = {
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index c8da3df271e..68c07320f09 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,6 +150,14 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
 int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
 		xfs_inobt_rec_incore_t *rec, int *stat);
 
+/*
+ * Inode chunk initialisation routine
+ */
+int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
+			  struct list_head *buffer_list,
+			  xfs_agnumber_t agno, xfs_agblock_t agbno,
+			  xfs_agblock_t length, unsigned int gen);
+
 extern const struct xfs_buf_ops xfs_agi_buf_ops;
 
 #endif	/* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index bec344b3650..5448eb6b8c1 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -34,6 +34,7 @@
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
+#include "xfs_cksum.h"
 
 
 STATIC int
@@ -182,52 +183,88 @@ xfs_inobt_key_diff(
 			  cur->bc_rec.i.ir_startino;
 }
 
-void
+static int
 xfs_inobt_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_perag	*pag = bp->b_pag;
 	unsigned int		level;
-	int			sblock_ok; /* block passes checks */
 
-	/* magic number and level verification */
-	level = be16_to_cpu(block->bb_level);
-	sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) &&
-		    level < mp->m_in_maxlevels;
+	/*
+	 * During growfs operations, we can't verify the exact owner as the
+	 * perag is not fully initialised and hence not attached to the buffer.
+	 *
+	 * Similarly, during log recovery we will have a perag structure
+	 * attached, but the agi information will not yet have been initialised
+	 * from the on disk AGI. We don't currently use any of this information,
+	 * but beware of the landmine (i.e. need to check pag->pagi_init) if we
+	 * ever do.
+	 */
+	switch (block->bb_magic) {
+	case cpu_to_be32(XFS_IBT_CRC_MAGIC):
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			return false;
+		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+			return false;
+		if (pag &&
+		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+			return false;
+		/* fall through */
+	case cpu_to_be32(XFS_IBT_MAGIC):
+		break;
+	default:
+		return 0;
+	}
 
-	/* numrecs verification */
-	sblock_ok = sblock_ok &&
-		be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0];
+	/* numrecs and level verification */
+	level = be16_to_cpu(block->bb_level);
+	if (level >= mp->m_in_maxlevels)
+		return false;
+	if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
+		return false;
 
 	/* sibling pointer verification */
-	sblock_ok = sblock_ok &&
-		(block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
-		 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
-		block->bb_u.s.bb_leftsib &&
-		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
-		 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
-		block->bb_u.s.bb_rightsib;
-
-	if (!sblock_ok) {
-		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
+	if (!block->bb_u.s.bb_leftsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+	if (!block->bb_u.s.bb_rightsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+
+	return true;
 }
 
 static void
 xfs_inobt_read_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_inobt_verify(bp);
+	if (!(xfs_btree_sblock_verify_crc(bp) &&
+	      xfs_inobt_verify(bp))) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+				     bp->b_target->bt_mount, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
 }
 
 static void
 xfs_inobt_write_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_inobt_verify(bp);
+	if (!xfs_inobt_verify(bp)) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+				     bp->b_target->bt_mount, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+	xfs_btree_sblock_calc_crc(bp);
+
 }
 
 const struct xfs_buf_ops xfs_inobt_buf_ops = {
@@ -235,7 +272,7 @@ const struct xfs_buf_ops xfs_inobt_buf_ops = {
 	.verify_write = xfs_inobt_write_verify,
 };
 
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 STATIC int
 xfs_inobt_keys_inorder(
 	struct xfs_btree_cur	*cur,
@@ -273,7 +310,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
 	.init_ptr_from_cur	= xfs_inobt_init_ptr_from_cur,
 	.key_diff		= xfs_inobt_key_diff,
 	.buf_ops		= &xfs_inobt_buf_ops,
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 	.keys_inorder		= xfs_inobt_keys_inorder,
 	.recs_inorder		= xfs_inobt_recs_inorder,
 #endif
@@ -301,6 +338,8 @@ xfs_inobt_init_cursor(
 	cur->bc_blocklog = mp->m_sb.sb_blocklog;
 
 	cur->bc_ops = &xfs_inobt_ops;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
 
 	cur->bc_private.a.agbp = agbp;
 	cur->bc_private.a.agno = agno;
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 25c0239a8ea..3ac36b7642e 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -29,7 +29,8 @@ struct xfs_mount;
 /*
  * There is a btree for the inode map per allocation group.
  */
-#define	XFS_IBT_MAGIC	0x49414254	/* 'IABT' */
+#define	XFS_IBT_MAGIC		0x49414254	/* 'IABT' */
+#define	XFS_IBT_CRC_MAGIC	0x49414233	/* 'IAB3' */
 
 typedef	__uint64_t	xfs_inofree_t;
 #define	XFS_INODES_PER_CHUNK		(NBBY * sizeof(xfs_inofree_t))
@@ -76,10 +77,10 @@ typedef __be32 xfs_inobt_ptr_t;
 
 /*
  * Btree block header size depends on a superblock flag.
- *
- * (not quite yet, but soon)
  */
-#define XFS_INOBT_BLOCK_LEN(mp)	XFS_BTREE_SBLOCK_LEN
+#define XFS_INOBT_BLOCK_LEN(mp) \
+	(xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+		XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
 
 /*
  * Record, key, and pointer address macros for btree blocks.
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 96e344e3e92..9560dc1f15a 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -335,7 +335,8 @@ xfs_iget_cache_miss(
 	iflags = XFS_INEW;
 	if (flags & XFS_IGET_DONTCACHE)
 		iflags |= XFS_IDONTCACHE;
-	ip->i_udquot = ip->i_gdquot = NULL;
+	ip->i_udquot = NULL;
+	ip->i_gdquot = NULL;
 	xfs_iflags_set(ip, iflags);
 
 	/* insert the new inode */
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index e0f138c70a2..a01afbb3909 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -40,7 +40,6 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
 int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
 void xfs_eofblocks_worker(struct work_struct *);
 
-int xfs_sync_inode_grab(struct xfs_inode *ip);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
 		int flags, void *args),
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
new file mode 100644
index 00000000000..7716a4e7375
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2008-2010, 2013 Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_trans_priv.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_error.h"
+#include "xfs_icreate_item.h"
+
+kmem_zone_t	*xfs_icreate_zone;		/* inode create item zone */
+
+static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_icreate_item, ic_item);
+}
+
+/*
+ * This returns the number of iovecs needed to log the given inode item.
+ *
+ * We only need one iovec for the icreate log structure.
+ */
+STATIC uint
+xfs_icreate_item_size(
+	struct xfs_log_item	*lip)
+{
+	return 1;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given inode create log item.
+ */
+STATIC void
+xfs_icreate_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_iovec	*log_vector)
+{
+	struct xfs_icreate_item	*icp = ICR_ITEM(lip);
+
+	log_vector->i_addr = (xfs_caddr_t)&icp->ic_format;
+	log_vector->i_len  = sizeof(struct xfs_icreate_log);
+	log_vector->i_type = XLOG_REG_TYPE_ICREATE;
+}
+
+
+/* Pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_pin(
+	struct xfs_log_item	*lip)
+{
+}
+
+
+/* pinning has no meaning for the create item, so just return. */
+STATIC void
+xfs_icreate_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+}
+
+STATIC void
+xfs_icreate_item_unlock(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_icreate_item	*icp = ICR_ITEM(lip);
+
+	if (icp->ic_item.li_flags & XFS_LI_ABORTED)
+		kmem_zone_free(xfs_icreate_zone, icp);
+	return;
+}
+
+/*
+ * Because we have ordered buffers being tracked in the AIL for the inode
+ * creation, we don't need the create item after this. Hence we can free
+ * the log item and return -1 to tell the caller we're done with the item.
+ */
+STATIC xfs_lsn_t
+xfs_icreate_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	struct xfs_icreate_item	*icp = ICR_ITEM(lip);
+
+	kmem_zone_free(xfs_icreate_zone, icp);
+	return (xfs_lsn_t)-1;
+}
+
+/* item can never get into the AIL */
+STATIC uint
+xfs_icreate_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	ASSERT(0);
+	return XFS_ITEM_SUCCESS;
+}
+
+/* Ordered buffers do the dependency tracking here, so this does nothing. */
+STATIC void
+xfs_icreate_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all buf log items.
+ */
+static struct xfs_item_ops xfs_icreate_item_ops = {
+	.iop_size	= xfs_icreate_item_size,
+	.iop_format	= xfs_icreate_item_format,
+	.iop_pin	= xfs_icreate_item_pin,
+	.iop_unpin	= xfs_icreate_item_unpin,
+	.iop_push	= xfs_icreate_item_push,
+	.iop_unlock	= xfs_icreate_item_unlock,
+	.iop_committed	= xfs_icreate_item_committed,
+	.iop_committing = xfs_icreate_item_committing,
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ *
+ * Inode extents can only reside within an AG. Hence specify the starting
+ * block for the inode chunk by offset within an AG as well as the
+ * length of the allocated extent.
+ *
+ * This joins the item to the transaction and marks it dirty so
+ * that we don't need a separate call to do this, nor does the
+ * caller need to know anything about the icreate item.
+ */
+void
+xfs_icreate_log(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		agbno,
+	unsigned int		count,
+	unsigned int		inode_size,
+	xfs_agblock_t		length,
+	unsigned int		generation)
+{
+	struct xfs_icreate_item	*icp;
+
+	icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
+
+	xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
+			  &xfs_icreate_item_ops);
+
+	icp->ic_format.icl_type = XFS_LI_ICREATE;
+	icp->ic_format.icl_size = 1;	/* single vector */
+	icp->ic_format.icl_ag = cpu_to_be32(agno);
+	icp->ic_format.icl_agbno = cpu_to_be32(agbno);
+	icp->ic_format.icl_count = cpu_to_be32(count);
+	icp->ic_format.icl_isize = cpu_to_be32(inode_size);
+	icp->ic_format.icl_length = cpu_to_be32(length);
+	icp->ic_format.icl_gen = cpu_to_be32(generation);
+
+	xfs_trans_add_item(tp, &icp->ic_item);
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
new file mode 100644
index 00000000000..88ba8aa0bc4
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2008-2010, Dave Chinner
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef XFS_ICREATE_ITEM_H
+#define XFS_ICREATE_ITEM_H	1
+
+/*
+ * on disk log item structure
+ *
+ * Log recovery assumes the first two entries are the type and size and they fit
+ * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
+ * decoding can be done correctly.
+ */
+struct xfs_icreate_log {
+	__uint16_t	icl_type;	/* type of log format structure */
+	__uint16_t	icl_size;	/* size of log format structure */
+	__be32		icl_ag;		/* ag being allocated in */
+	__be32		icl_agbno;	/* start block of inode range */
+	__be32		icl_count;	/* number of inodes to initialise */
+	__be32		icl_isize;	/* size of inodes */
+	__be32		icl_length;	/* length of extent to initialise */
+	__be32		icl_gen;	/* inode generation number to use */
+};
+
+/* in memory log item structure */
+struct xfs_icreate_item {
+	struct xfs_log_item	ic_item;
+	struct xfs_icreate_log	ic_format;
+};
+
+extern kmem_zone_t *xfs_icreate_zone;	/* inode create item zone */
+
+void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
+			xfs_agblock_t agbno, unsigned int count,
+			unsigned int inode_size, xfs_agblock_t length,
+			unsigned int generation);
+
+#endif	/* XFS_ICREATE_ITEM_H */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 66282dcb821..9ecfe1e559f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -44,6 +44,7 @@
 #include "xfs_quota.h"
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
+#include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 
@@ -286,7 +287,7 @@ xfs_ilock_demote(
 	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
 }
 
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 int
 xfs_isilocked(
 	xfs_inode_t		*ip,
@@ -786,6 +787,7 @@ xfs_iformat_btree(
 	xfs_dinode_t		*dip,
 	int			whichfork)
 {
+	struct xfs_mount	*mp = ip->i_mount;
 	xfs_bmdr_block_t	*dfp;
 	xfs_ifork_t		*ifp;
 	/* REFERENCED */
@@ -794,7 +796,7 @@ xfs_iformat_btree(
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
-	size = XFS_BMAP_BROOT_SPACE(dfp);
+	size = XFS_BMAP_BROOT_SPACE(mp, dfp);
 	nrecs = be16_to_cpu(dfp->bb_numrecs);
 
 	/*
@@ -805,14 +807,14 @@ xfs_iformat_btree(
 	 * blocks.
 	 */
 	if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
-			XFS_IFORK_MAXEXT(ip, whichfork) ||
+					XFS_IFORK_MAXEXT(ip, whichfork) ||
 		     XFS_BMDR_SPACE_CALC(nrecs) >
-			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
+					XFS_DFORK_SIZE(dip, mp, whichfork) ||
 		     XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
-		xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
-			(unsigned long long) ip->i_ino);
+		xfs_warn(mp, "corrupt inode %Lu (btree).",
+					(unsigned long long) ip->i_ino);
 		XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
-				 ip->i_mount, dip);
+					 mp, dip);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
@@ -823,8 +825,7 @@ xfs_iformat_btree(
 	 * Copy and convert from the on-disk structure
 	 * to the in-memory structure.
 	 */
-	xfs_bmdr_to_bmbt(ip->i_mount, dfp,
-			 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+	xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
 			 ifp->if_broot, size);
 	ifp->if_flags &= ~XFS_IFEXTENTS;
 	ifp->if_flags |= XFS_IFBROOT;
@@ -866,6 +867,17 @@ xfs_dinode_from_disk(
 	to->di_dmstate	= be16_to_cpu(from->di_dmstate);
 	to->di_flags	= be16_to_cpu(from->di_flags);
 	to->di_gen	= be32_to_cpu(from->di_gen);
+
+	if (to->di_version == 3) {
+		to->di_changecount = be64_to_cpu(from->di_changecount);
+		to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
+		to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+		to->di_flags2 = be64_to_cpu(from->di_flags2);
+		to->di_ino = be64_to_cpu(from->di_ino);
+		to->di_lsn = be64_to_cpu(from->di_lsn);
+		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+		uuid_copy(&to->di_uuid, &from->di_uuid);
+	}
 }
 
 void
@@ -902,6 +914,17 @@ xfs_dinode_to_disk(
 	to->di_dmstate = cpu_to_be16(from->di_dmstate);
 	to->di_flags = cpu_to_be16(from->di_flags);
 	to->di_gen = cpu_to_be32(from->di_gen);
+
+	if (from->di_version == 3) {
+		to->di_changecount = cpu_to_be64(from->di_changecount);
+		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
+		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+		to->di_flags2 = cpu_to_be64(from->di_flags2);
+		to->di_ino = cpu_to_be64(from->di_ino);
+		to->di_lsn = cpu_to_be64(from->di_lsn);
+		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+		uuid_copy(&to->di_uuid, &from->di_uuid);
+	}
 }
 
 STATIC uint
@@ -962,8 +985,54 @@ xfs_dic2xflags(
 				(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
 }
 
+static bool
+xfs_dinode_verify(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip)
+{
+	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
+		return false;
+
+	/* only version 3 or greater inodes are extensively verified here */
+	if (dip->di_version < 3)
+		return true;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return false;
+	if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+			      offsetof(struct xfs_dinode, di_crc)))
+		return false;
+	if (be64_to_cpu(dip->di_ino) != ip->i_ino)
+		return false;
+	if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	return true;
+}
+
+void
+xfs_dinode_calc_crc(
+	struct xfs_mount	*mp,
+	struct xfs_dinode	*dip)
+{
+	__uint32_t		crc;
+
+	if (dip->di_version < 3)
+		return;
+
+	ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
+	crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
+			      offsetof(struct xfs_dinode, di_crc));
+	dip->di_crc = xfs_end_cksum(crc);
+}
+
 /*
  * Read the disk inode attributes into the in-core inode structure.
+ *
+ * If we are initialising a new inode and we are not utilising the
+ * XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new inode core
+ * with a random generation number. If we are keeping inodes around, we need to
+ * read the inode cluster to get the existing generation number off disk.
  */
 int
 xfs_iread(
@@ -983,6 +1052,22 @@ xfs_iread(
 	if (error)
 		return error;
 
+	/* shortcut IO on inode allocation if possible */
+	if ((iget_flags & XFS_IGET_CREATE) &&
+	    !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+		/* initialise the on-disk inode core */
+		memset(&ip->i_d, 0, sizeof(ip->i_d));
+		ip->i_d.di_magic = XFS_DINODE_MAGIC;
+		ip->i_d.di_gen = prandom_u32();
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			ip->i_d.di_version = 3;
+			ip->i_d.di_ino = ip->i_ino;
+			uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+		} else
+			ip->i_d.di_version = 2;
+		return 0;
+	}
+
 	/*
 	 * Get pointers to the on-disk inode and the buffer containing it.
 	 */
@@ -990,17 +1075,13 @@ xfs_iread(
 	if (error)
 		return error;
 
-	/*
-	 * If we got something that isn't an inode it means someone
-	 * (nfs or dmi) has a stale handle.
-	 */
-	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) {
-#ifdef DEBUG
-		xfs_alert(mp,
-			"%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
-			__func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
-#endif /* DEBUG */
-		error = XFS_ERROR(EINVAL);
+	/* even unallocated inodes are verified */
+	if (!xfs_dinode_verify(mp, ip, dip)) {
+		xfs_alert(mp, "%s: validation failed for inode %lld failed",
+				__func__, ip->i_ino);
+
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
+		error = XFS_ERROR(EFSCORRUPTED);
 		goto out_brelse;
 	}
 
@@ -1022,10 +1103,20 @@ xfs_iread(
 			goto out_brelse;
 		}
 	} else {
+		/*
+		 * Partial initialisation of the in-core inode. Just the bits
+		 * that xfs_ialloc won't overwrite or relies on being correct.
+		 */
 		ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
 		ip->i_d.di_version = dip->di_version;
 		ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
 		ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
+
+		if (dip->di_version == 3) {
+			ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
+			uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
+		}
+
 		/*
 		 * Make sure to pull in the mode here as well in
 		 * case the inode is released without being used.
@@ -1063,17 +1154,16 @@ xfs_iread(
 	xfs_buf_set_ref(bp, XFS_INO_REF);
 
 	/*
-	 * Use xfs_trans_brelse() to release the buffer containing the
-	 * on-disk inode, because it was acquired with xfs_trans_read_buf()
-	 * in xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
+	 * Use xfs_trans_brelse() to release the buffer containing the on-disk
+	 * inode, because it was acquired with xfs_trans_read_buf() in
+	 * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
 	 * brelse().  If we're within a transaction, then xfs_trans_brelse()
 	 * will only release the buffer if it is not dirty within the
 	 * transaction.  It will be OK to release the buffer in this case,
-	 * because inodes on disk are never destroyed and we will be
-	 * locking the new in-core inode before putting it in the hash
-	 * table where other processes can find it.  Thus we don't have
-	 * to worry about the inode being changed just because we released
-	 * the buffer.
+	 * because inodes on disk are never destroyed and we will be locking the
+	 * new in-core inode before putting it in the cache where other
+	 * processes can find it.  Thus we don't have to worry about the inode
+	 * being changed just because we released the buffer.
 	 */
  out_brelse:
 	xfs_trans_brelse(tp, bp);
@@ -1161,6 +1251,7 @@ xfs_ialloc(
 	xfs_buf_t	**ialloc_context,
 	xfs_inode_t	**ipp)
 {
+	struct xfs_mount *mp = tp->t_mountp;
 	xfs_ino_t	ino;
 	xfs_inode_t	*ip;
 	uint		flags;
@@ -1187,7 +1278,7 @@ xfs_ialloc(
 	 * This is because we're setting fields here we need
 	 * to prevent others from looking at until we're done.
 	 */
-	error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
+	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
 			 XFS_ILOCK_EXCL, &ip);
 	if (error)
 		return error;
@@ -1208,7 +1299,7 @@ xfs_ialloc(
 	 * the inode version number now.  This way we only do the conversion
 	 * here rather than here and in the flush/logging code.
 	 */
-	if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
+	if (xfs_sb_version_hasnlink(&mp->m_sb) &&
 	    ip->i_d.di_version == 1) {
 		ip->i_d.di_version = 2;
 		/*
@@ -1258,6 +1349,19 @@ xfs_ialloc(
 	ip->i_d.di_dmevmask = 0;
 	ip->i_d.di_dmstate = 0;
 	ip->i_d.di_flags = 0;
+
+	if (ip->i_d.di_version == 3) {
+		ASSERT(ip->i_d.di_ino == ino);
+		ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
+		ip->i_d.di_crc = 0;
+		ip->i_d.di_changecount = 1;
+		ip->i_d.di_lsn = 0;
+		ip->i_d.di_flags2 = 0;
+		memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2));
+		ip->i_d.di_crtime = ip->i_d.di_mtime;
+	}
+
+
 	flags = XFS_ILOG_CORE;
 	switch (mode & S_IFMT) {
 	case S_IFIFO:
@@ -1554,6 +1658,10 @@ xfs_iunlink(
 		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
 		offset = ip->i_imap.im_boffset +
 			offsetof(xfs_dinode_t, di_next_unlinked);
+
+		/* need to recalc the inode CRC if appropriate */
+		xfs_dinode_calc_crc(mp, dip);
+
 		xfs_trans_inode_buf(tp, ibp);
 		xfs_trans_log_buf(tp, ibp, offset,
 				  (offset + sizeof(xfs_agino_t) - 1));
@@ -1639,6 +1747,10 @@ xfs_iunlink_remove(
 			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
 			offset = ip->i_imap.im_boffset +
 				offsetof(xfs_dinode_t, di_next_unlinked);
+
+			/* need to recalc the inode CRC if appropriate */
+			xfs_dinode_calc_crc(mp, dip);
+
 			xfs_trans_inode_buf(tp, ibp);
 			xfs_trans_log_buf(tp, ibp, offset,
 					  (offset + sizeof(xfs_agino_t) - 1));
@@ -1712,6 +1824,10 @@ xfs_iunlink_remove(
 			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
 			offset = ip->i_imap.im_boffset +
 				offsetof(xfs_dinode_t, di_next_unlinked);
+
+			/* need to recalc the inode CRC if appropriate */
+			xfs_dinode_calc_crc(mp, dip);
+
 			xfs_trans_inode_buf(tp, ibp);
 			xfs_trans_log_buf(tp, ibp, offset,
 					  (offset + sizeof(xfs_agino_t) - 1));
@@ -1725,6 +1841,10 @@ xfs_iunlink_remove(
 		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
 		ASSERT(next_agino != 0);
 		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
+
+		/* need to recalc the inode CRC if appropriate */
+		xfs_dinode_calc_crc(mp, last_dip);
+
 		xfs_trans_inode_buf(tp, last_ibp);
 		xfs_trans_log_buf(tp, last_ibp, offset,
 				  (offset + sizeof(xfs_agino_t) - 1));
@@ -1928,8 +2048,6 @@ xfs_ifree(
 	int			error;
 	int			delete;
 	xfs_ino_t		first_ino;
-	xfs_dinode_t    	*dip;
-	xfs_buf_t       	*ibp;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(ip->i_d.di_nlink == 0);
@@ -1942,14 +2060,13 @@ xfs_ifree(
 	 * Pull the on-disk inode from the AGI unlinked list.
 	 */
 	error = xfs_iunlink_remove(tp, ip);
-	if (error != 0) {
+	if (error)
 		return error;
-	}
 
 	error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
-	if (error != 0) {
+	if (error)
 		return error;
-	}
+
 	ip->i_d.di_mode = 0;		/* mark incore inode as free */
 	ip->i_d.di_flags = 0;
 	ip->i_d.di_dmevmask = 0;
@@ -1961,31 +2078,10 @@ xfs_ifree(
 	 * by reincarnations of this inode.
 	 */
 	ip->i_d.di_gen++;
-
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-	error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp,
-			       0, 0);
-	if (error)
-		return error;
-
-        /*
-	* Clear the on-disk di_mode. This is to prevent xfs_bulkstat
-	* from picking up this inode when it is reclaimed (its incore state
-	* initialzed but not flushed to disk yet). The in-core di_mode is
-	* already cleared  and a corresponding transaction logged.
-	* The hack here just synchronizes the in-core to on-disk
-	* di_mode value in advance before the actual inode sync to disk.
-	* This is OK because the inode is already unlinked and would never
-	* change its di_mode again for this inode generation.
-	* This is a temporary hack that would require a proper fix
-	* in the future.
-	*/
-	dip->di_mode = 0;
-
-	if (delete) {
+	if (delete)
 		error = xfs_ifree_cluster(ip, tp, first_ino);
-	}
 
 	return error;
 }
@@ -2037,7 +2133,7 @@ xfs_iroot_realloc(
 		 * allocate it now and get out.
 		 */
 		if (ifp->if_broot_bytes == 0) {
-			new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
+			new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
 			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
 			ifp->if_broot_bytes = (int)new_size;
 			return;
@@ -2051,9 +2147,9 @@ xfs_iroot_realloc(
 		 */
 		cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
 		new_max = cur_max + rec_diff;
-		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
+		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
 		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
-				(size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
+				XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
 				KM_SLEEP | KM_NOFS);
 		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
 						     ifp->if_broot_bytes);
@@ -2061,7 +2157,7 @@ xfs_iroot_realloc(
 						     (int)new_size);
 		ifp->if_broot_bytes = (int)new_size;
 		ASSERT(ifp->if_broot_bytes <=
-			XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
+			XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip));
 		memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
 		return;
 	}
@@ -2076,7 +2172,7 @@ xfs_iroot_realloc(
 	new_max = cur_max + rec_diff;
 	ASSERT(new_max >= 0);
 	if (new_max > 0)
-		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
+		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
 	else
 		new_size = 0;
 	if (new_size > 0) {
@@ -2084,7 +2180,8 @@ xfs_iroot_realloc(
 		/*
 		 * First copy over the btree block header.
 		 */
-		memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
+		memcpy(new_broot, ifp->if_broot,
+			XFS_BMBT_BLOCK_LEN(ip->i_mount));
 	} else {
 		new_broot = NULL;
 		ifp->if_flags &= ~XFS_IFBROOT;
@@ -2114,7 +2211,7 @@ xfs_iroot_realloc(
 	ifp->if_broot = new_broot;
 	ifp->if_broot_bytes = (int)new_size;
 	ASSERT(ifp->if_broot_bytes <=
-		XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
+		XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip));
 	return;
 }
 
@@ -2379,9 +2476,6 @@ xfs_iflush_fork(
 	char			*cp;
 	xfs_ifork_t		*ifp;
 	xfs_mount_t		*mp;
-#ifdef XFS_TRANS_DEBUG
-	int			first;
-#endif
 	static const short	brootflag[2] =
 		{ XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
 	static const short	dataflag[2] =
@@ -2430,7 +2524,7 @@ xfs_iflush_fork(
 			ASSERT(ifp->if_broot != NULL);
 			ASSERT(ifp->if_broot_bytes <=
 			       (XFS_IFORK_SIZE(ip, whichfork) +
-				XFS_BROOT_SIZE_ADJ));
+				XFS_BROOT_SIZE_ADJ(ip)));
 			xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
 				(xfs_bmdr_block_t *)cp,
 				XFS_DFORK_SIZE(dip, mp, whichfork));
@@ -2718,23 +2812,18 @@ abort_out:
 
 STATIC int
 xfs_iflush_int(
-	xfs_inode_t		*ip,
-	xfs_buf_t		*bp)
+	struct xfs_inode	*ip,
+	struct xfs_buf		*bp)
 {
-	xfs_inode_log_item_t	*iip;
-	xfs_dinode_t		*dip;
-	xfs_mount_t		*mp;
-#ifdef XFS_TRANS_DEBUG
-	int			first;
-#endif
+	struct xfs_inode_log_item *iip = ip->i_itemp;
+	struct xfs_dinode	*dip;
+	struct xfs_mount	*mp = ip->i_mount;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
 	ASSERT(xfs_isiflocked(ip));
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
-
-	iip = ip->i_itemp;
-	mp = ip->i_mount;
+	ASSERT(iip != NULL && iip->ili_fields != 0);
 
 	/* set *dip = inode's place in the buffer */
 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -2795,9 +2884,9 @@ xfs_iflush_int(
 	}
 	/*
 	 * bump the flush iteration count, used to detect flushes which
-	 * postdate a log record during recovery.
+	 * postdate a log record during recovery. This is redundant as we now
+	 * log every change and hence this can't happen. Still, it doesn't hurt.
 	 */
-
 	ip->i_d.di_flushiter++;
 
 	/*
@@ -2873,41 +2962,30 @@ xfs_iflush_int(
 	 * need the AIL lock, because it is a 64 bit value that cannot be read
 	 * atomically.
 	 */
-	if (iip != NULL && iip->ili_fields != 0) {
-		iip->ili_last_fields = iip->ili_fields;
-		iip->ili_fields = 0;
-		iip->ili_logged = 1;
+	iip->ili_last_fields = iip->ili_fields;
+	iip->ili_fields = 0;
+	iip->ili_logged = 1;
 
-		xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
-					&iip->ili_item.li_lsn);
+	xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+				&iip->ili_item.li_lsn);
 
-		/*
-		 * Attach the function xfs_iflush_done to the inode's
-		 * buffer.  This will remove the inode from the AIL
-		 * and unlock the inode's flush lock when the inode is
-		 * completely written to disk.
-		 */
-		xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
+	/*
+	 * Attach the function xfs_iflush_done to the inode's
+	 * buffer.  This will remove the inode from the AIL
+	 * and unlock the inode's flush lock when the inode is
+	 * completely written to disk.
+	 */
+	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
 
-		ASSERT(bp->b_fspriv != NULL);
-		ASSERT(bp->b_iodone != NULL);
-	} else {
-		/*
-		 * We're flushing an inode which is not in the AIL and has
-		 * not been logged.  For this case we can immediately drop
-		 * the inode flush lock because we can avoid the whole
-		 * AIL state thing.  It's OK to drop the flush lock now,
-		 * because we've already locked the buffer and to do anything
-		 * you really need both.
-		 */
-		if (iip != NULL) {
-			ASSERT(iip->ili_logged == 0);
-			ASSERT(iip->ili_last_fields == 0);
-			ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
-		}
-		xfs_ifunlock(ip);
-	}
+	/* update the lsn in the on disk inode if required */
+	if (ip->i_d.di_version == 3)
+		dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn);
+
+	/* generate the checksum. */
+	xfs_dinode_calc_crc(mp, dip);
 
+	ASSERT(bp->b_fspriv != NULL);
+	ASSERT(bp->b_iodone != NULL);
 	return 0;
 
 corrupt_out:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 22baf6ea4fa..91129794aae 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -150,13 +150,38 @@ typedef struct xfs_icdinode {
 	__uint16_t	di_dmstate;	/* DMIG state info */
 	__uint16_t	di_flags;	/* random flags, XFS_DIFLAG_... */
 	__uint32_t	di_gen;		/* generation number */
+
+	/* di_next_unlinked is the only non-core field in the old dinode */
+	xfs_agino_t	di_next_unlinked;/* agi unlinked list ptr */
+
+	/* start of the extended dinode, writable fields */
+	__uint32_t	di_crc;		/* CRC of the inode */
+	__uint64_t	di_changecount;	/* number of attribute changes */
+	xfs_lsn_t	di_lsn;		/* flush sequence */
+	__uint64_t	di_flags2;	/* more random flags */
+	__uint8_t	di_pad2[16];	/* more padding for future expansion */
+
+	/* fields only written to during inode creation */
+	xfs_ictimestamp_t di_crtime;	/* time created */
+	xfs_ino_t	di_ino;		/* inode number */
+	uuid_t		di_uuid;	/* UUID of the filesystem */
+
+	/* structure must be padded to 64 bit alignment */
 } xfs_icdinode_t;
 
+static inline uint xfs_icdinode_size(int version)
+{
+	if (version == 3)
+		return sizeof(struct xfs_icdinode);
+	return offsetof(struct xfs_icdinode, di_next_unlinked);
+}
+
 /*
  * Flags for xfs_ichgtime().
  */
 #define	XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
 #define	XFS_ICHGTIME_CHG	0x2	/* inode field change timestamp */
+#define	XFS_ICHGTIME_CREATE	0x4	/* inode create timestamp */
 
 /*
  * Per-fork incore inode flags.
@@ -180,10 +205,11 @@ typedef struct xfs_icdinode {
 #define XFS_IFORK_DSIZE(ip) \
 	(XFS_IFORK_Q(ip) ? \
 		XFS_IFORK_BOFF(ip) : \
-		XFS_LITINO((ip)->i_mount))
+		XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
 #define XFS_IFORK_ASIZE(ip) \
 	(XFS_IFORK_Q(ip) ? \
-		XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
+		XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
+			XFS_IFORK_BOFF(ip) : \
 		0)
 #define XFS_IFORK_SIZE(ip,w) \
 	((w) == XFS_DATA_FORK ? \
@@ -419,6 +445,7 @@ static inline void xfs_iflock(struct xfs_inode *ip)
 static inline void xfs_ifunlock(struct xfs_inode *ip)
 {
 	xfs_iflags_clear(ip, XFS_IFLOCK);
+	smp_mb();
 	wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
 }
 
@@ -554,6 +581,7 @@ int		xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
 			       struct xfs_buf **, uint, uint);
 int		xfs_iread(struct xfs_mount *, struct xfs_trans *,
 			  struct xfs_inode *, uint);
+void		xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
 void		xfs_dinode_to_disk(struct xfs_dinode *,
 				   struct xfs_icdinode *);
 void		xfs_idestroy_fork(struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d041d47d9d8..f76ff52e43c 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -179,7 +179,7 @@ xfs_inode_item_format(
 	nvecs	     = 1;
 
 	vecp->i_addr = &ip->i_d;
-	vecp->i_len  = sizeof(struct xfs_icdinode);
+	vecp->i_len  = xfs_icdinode_size(ip->i_d.di_version);
 	vecp->i_type = XLOG_REG_TYPE_ICORE;
 	vecp++;
 	nvecs++;
@@ -269,17 +269,6 @@ xfs_inode_item_format(
 		} else {
 			ASSERT(!(iip->ili_fields &
 				 XFS_ILOG_DBROOT));
-#ifdef XFS_TRANS_DEBUG
-			if (iip->ili_root_size > 0) {
-				ASSERT(iip->ili_root_size ==
-				       ip->i_df.if_broot_bytes);
-				ASSERT(memcmp(iip->ili_orig_root,
-					    ip->i_df.if_broot,
-					    iip->ili_root_size) == 0);
-			} else {
-				ASSERT(ip->i_df.if_broot_bytes == 0);
-			}
-#endif
 			iip->ili_fields &= ~XFS_ILOG_DBROOT;
 		}
 		break;
@@ -678,11 +667,6 @@ void
 xfs_inode_item_destroy(
 	xfs_inode_t	*ip)
 {
-#ifdef XFS_TRANS_DEBUG
-	if (ip->i_itemp->ili_root_size != 0) {
-		kmem_free(ip->i_itemp->ili_orig_root);
-	}
-#endif
 	kmem_zone_free(xfs_ili_zone, ip->i_itemp);
 }
 
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 376d4d0b263..779812fb3d8 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -148,10 +148,6 @@ typedef struct xfs_inode_log_item {
 						      data exts */
 	struct xfs_bmbt_rec	*ili_aextents_buf; /* array of logged
 						      attr exts */
-#ifdef XFS_TRANS_DEBUG
-	int			ili_root_size;
-	char			*ili_orig_root;
-#endif
 	xfs_inode_log_format_t	ili_format;	   /* logged structure */
 } xfs_inode_log_item_t;
 
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c1c3ef88a26..5e999680094 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -80,7 +80,7 @@ xfs_find_handle(
 		f = fdget(hreq->fd);
 		if (!f.file)
 			return -EBADF;
-		inode = f.file->f_path.dentry->d_inode;
+		inode = file_inode(f.file);
 	} else {
 		error = user_lpath((const char __user *)hreq->path, &path);
 		if (error)
@@ -168,7 +168,7 @@ xfs_handle_to_dentry(
 	/*
 	 * Only allow handle opens under a directory.
 	 */
-	if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode))
+	if (!S_ISDIR(file_inode(parfilp)->i_mode))
 		return ERR_PTR(-ENOTDIR);
 
 	if (hlen != sizeof(xfs_handle_t))
@@ -422,9 +422,12 @@ xfs_attrlist_by_handle(
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
-	if (!kbuf)
-		goto out_dput;
+	kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL);
+	if (!kbuf) {
+		kbuf = kmem_zalloc_large(al_hreq.buflen);
+		if (!kbuf)
+			goto out_dput;
+	}
 
 	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
 	error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
@@ -436,7 +439,10 @@ xfs_attrlist_by_handle(
 		error = -EFAULT;
 
  out_kfree:
-	kfree(kbuf);
+	if (is_vmalloc_addr(kbuf))
+		kmem_free_large(kbuf);
+	else
+		kmem_free(kbuf);
  out_dput:
 	dput(dentry);
 	return error;
@@ -1334,7 +1340,7 @@ xfs_file_ioctl(
 	unsigned int		cmd,
 	unsigned long		p)
 {
-	struct inode		*inode = filp->f_path.dentry->d_inode;
+	struct inode		*inode = file_inode(filp);
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 	void			__user *arg = (void __user *)p;
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 1244274a567..c0c66259cc9 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -373,9 +373,12 @@ xfs_compat_attrlist_by_handle(
 		return PTR_ERR(dentry);
 
 	error = -ENOMEM;
-	kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
-	if (!kbuf)
-		goto out_dput;
+	kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL);
+	if (!kbuf) {
+		kbuf = kmem_zalloc_large(al_hreq.buflen);
+		if (!kbuf)
+			goto out_dput;
+	}
 
 	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
 	error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
@@ -387,7 +390,10 @@ xfs_compat_attrlist_by_handle(
 		error = -EFAULT;
 
  out_kfree:
-	kfree(kbuf);
+	if (is_vmalloc_addr(kbuf))
+		kmem_free_large(kbuf);
+	else
+		kmem_free(kbuf);
  out_dput:
 	dput(dentry);
 	return error;
@@ -530,7 +536,7 @@ xfs_file_compat_ioctl(
 	unsigned		cmd,
 	unsigned long		p)
 {
-	struct inode		*inode = filp->f_path.dentry->d_inode;
+	struct inode		*inode = file_inode(filp);
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 	void			__user *arg = (void __user *)p;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 364818eef40..6a709642229 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -42,6 +42,8 @@
 #include "xfs_iomap.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
 
 
 #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
@@ -282,6 +284,15 @@ xfs_iomap_eof_want_preallocate(
 		return 0;
 
 	/*
+	 * If the file is smaller than the minimum prealloc and we are using
+	 * dynamic preallocation, don't do any preallocation at all as it is
+	 * likely this is the only write to the file that is going to be done.
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
+	    XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
+		return 0;
+
+	/*
 	 * If there are any real blocks past eof, then don't
 	 * do any speculative allocation.
 	 */
@@ -311,6 +322,121 @@ xfs_iomap_eof_want_preallocate(
 }
 
 /*
+ * Determine the initial size of the preallocation. We are beyond the current
+ * EOF here, but we need to take into account whether this is a sparse write or
+ * an extending write when determining the preallocation size.  Hence we need to
+ * look up the extent that ends at the current write offset and use the result
+ * to determine the preallocation size.
+ *
+ * If the extent is a hole, then preallocation is essentially disabled.
+ * Otherwise we take the size of the preceeding data extent as the basis for the
+ * preallocation size. If the size of the extent is greater than half the
+ * maximum extent length, then use the current offset as the basis. This ensures
+ * that for large files the preallocation size always extends to MAXEXTLEN
+ * rather than falling short due to things like stripe unit/width alignment of
+ * real extents.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_eof_prealloc_initial_size(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_bmbt_irec_t		*imap,
+	int			nimaps)
+{
+	xfs_fileoff_t   start_fsb;
+	int		imaps = 1;
+	int		error;
+
+	ASSERT(nimaps >= imaps);
+
+	/* if we are using a specific prealloc size, return now */
+	if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+		return 0;
+
+	/* If the file is small, then use the minimum prealloc */
+	if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
+		return 0;
+
+	/*
+	 * As we write multiple pages, the offset will always align to the
+	 * start of a page and hence point to a hole at EOF. i.e. if the size is
+	 * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096)
+	 * will return FSB 1. Hence if there are blocks in the file, we want to
+	 * point to the block prior to the EOF block and not the hole that maps
+	 * directly at @offset.
+	 */
+	start_fsb = XFS_B_TO_FSB(mp, offset);
+	if (start_fsb)
+		start_fsb--;
+	error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE);
+	if (error)
+		return 0;
+
+	ASSERT(imaps == 1);
+	if (imap[0].br_startblock == HOLESTARTBLOCK)
+		return 0;
+	if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
+		return imap[0].br_blockcount << 1;
+	return XFS_B_TO_FSB(mp, offset);
+}
+
+STATIC bool
+xfs_quota_need_throttle(
+	struct xfs_inode *ip,
+	int type,
+	xfs_fsblock_t alloc_blocks)
+{
+	struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+
+	if (!dq || !xfs_this_quota_on(ip->i_mount, type))
+		return false;
+
+	/* no hi watermark, no throttle */
+	if (!dq->q_prealloc_hi_wmark)
+		return false;
+
+	/* under the lo watermark, no throttle */
+	if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark)
+		return false;
+
+	return true;
+}
+
+STATIC void
+xfs_quota_calc_throttle(
+	struct xfs_inode *ip,
+	int type,
+	xfs_fsblock_t *qblocks,
+	int *qshift)
+{
+	int64_t freesp;
+	int shift = 0;
+	struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+
+	/* over hi wmark, squash the prealloc completely */
+	if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
+		*qblocks = 0;
+		return;
+	}
+
+	freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount;
+	if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
+		shift = 2;
+		if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
+			shift += 2;
+		if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT])
+			shift += 2;
+	}
+
+	/* only overwrite the throttle values if we are more aggressive */
+	if ((freesp >> shift) < (*qblocks >> *qshift)) {
+		*qblocks = freesp;
+		*qshift = shift;
+	}
+}
+
+/*
  * If we don't have a user specified preallocation size, dynamically increase
  * the preallocation size as the size of the file grows. Cap the maximum size
  * at a single extent or less if the filesystem is near full. The closer the
@@ -319,52 +445,95 @@ xfs_iomap_eof_want_preallocate(
 STATIC xfs_fsblock_t
 xfs_iomap_prealloc_size(
 	struct xfs_mount	*mp,
-	struct xfs_inode	*ip)
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	struct xfs_bmbt_irec	*imap,
+	int			nimaps)
 {
 	xfs_fsblock_t		alloc_blocks = 0;
+	int			shift = 0;
+	int64_t			freesp;
+	xfs_fsblock_t		qblocks;
+	int			qshift = 0;
 
-	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
-		int shift = 0;
-		int64_t freesp;
+	alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset,
+							   imap, nimaps);
+	if (!alloc_blocks)
+		goto check_writeio;
+	qblocks = alloc_blocks;
 
-		/*
-		 * rounddown_pow_of_two() returns an undefined result
-		 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
-		 * ensure we always pass in a non-zero value.
-		 */
-		alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
-		alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
-					rounddown_pow_of_two(alloc_blocks));
-
-		xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
-		freesp = mp->m_sb.sb_fdblocks;
-		if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
-			shift = 2;
-			if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
-				shift++;
-			if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
-				shift++;
-			if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
-				shift++;
-			if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
-				shift++;
-		}
-		if (shift)
-			alloc_blocks >>= shift;
-
-		/*
-		 * If we are still trying to allocate more space than is
-		 * available, squash the prealloc hard. This can happen if we
-		 * have a large file on a small filesystem and the above
-		 * lowspace thresholds are smaller than MAXEXTLEN.
-		 */
-		while (alloc_blocks >= freesp)
-			alloc_blocks >>= 4;
+	/*
+	 * MAXEXTLEN is not a power of two value but we round the prealloc down
+	 * to the nearest power of two value after throttling. To prevent the
+	 * round down from unconditionally reducing the maximum supported prealloc
+	 * size, we round up first, apply appropriate throttling, round down and
+	 * cap the value to MAXEXTLEN.
+	 */
+	alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
+				       alloc_blocks);
+
+	xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+	freesp = mp->m_sb.sb_fdblocks;
+	if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+		shift = 2;
+		if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+			shift++;
+		if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+			shift++;
+		if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+			shift++;
+		if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+			shift++;
 	}
 
+	/*
+	 * Check each quota to cap the prealloc size and provide a shift
+	 * value to throttle with.
+	 */
+	if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
+		xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift);
+	if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
+		xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift);
+	if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
+		xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift);
+
+	/*
+	 * The final prealloc size is set to the minimum of free space available
+	 * in each of the quotas and the overall filesystem.
+	 *
+	 * The shift throttle value is set to the maximum value as determined by
+	 * the global low free space values and per-quota low free space values.
+	 */
+	alloc_blocks = MIN(alloc_blocks, qblocks);
+	shift = MAX(shift, qshift);
+
+	if (shift)
+		alloc_blocks >>= shift;
+	/*
+	 * rounddown_pow_of_two() returns an undefined result if we pass in
+	 * alloc_blocks = 0.
+	 */
+	if (alloc_blocks)
+		alloc_blocks = rounddown_pow_of_two(alloc_blocks);
+	if (alloc_blocks > MAXEXTLEN)
+		alloc_blocks = MAXEXTLEN;
+
+	/*
+	 * If we are still trying to allocate more space than is
+	 * available, squash the prealloc hard. This can happen if we
+	 * have a large file on a small filesystem and the above
+	 * lowspace thresholds are smaller than MAXEXTLEN.
+	 */
+	while (alloc_blocks && alloc_blocks >= freesp)
+		alloc_blocks >>= 4;
+
+check_writeio:
 	if (alloc_blocks < mp->m_writeio_blocks)
 		alloc_blocks = mp->m_writeio_blocks;
 
+	trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
+				      mp->m_writeio_blocks);
+
 	return alloc_blocks;
 }
 
@@ -399,7 +568,6 @@ xfs_iomap_write_delay(
 	extsz = xfs_get_extsz_hint(ip);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
-
 	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
 				imap, XFS_WRITE_IMAPS, &prealloc);
 	if (error)
@@ -407,7 +575,10 @@ xfs_iomap_write_delay(
 
 retry:
 	if (prealloc) {
-		xfs_fsblock_t	alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
+		xfs_fsblock_t	alloc_blocks;
+
+		alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap,
+						       XFS_WRITE_IMAPS);
 
 		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
 		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index d82efaa2ac7..c69bbc493cb 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -455,6 +455,28 @@ xfs_vn_getattr(
 	return 0;
 }
 
+static void
+xfs_setattr_mode(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct iattr		*iattr)
+{
+	struct inode	*inode = VFS_I(ip);
+	umode_t		mode = iattr->ia_mode;
+
+	ASSERT(tp);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+		mode &= ~S_ISGID;
+
+	ip->i_d.di_mode &= S_IFMT;
+	ip->i_d.di_mode |= mode & ~S_IFMT;
+
+	inode->i_mode &= S_IFMT;
+	inode->i_mode |= mode & ~S_IFMT;
+}
+
 int
 xfs_setattr_nonsize(
 	struct xfs_inode	*ip,
@@ -606,18 +628,8 @@ xfs_setattr_nonsize(
 	/*
 	 * Change file access modes.
 	 */
-	if (mask & ATTR_MODE) {
-		umode_t mode = iattr->ia_mode;
-
-		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
-			mode &= ~S_ISGID;
-
-		ip->i_d.di_mode &= S_IFMT;
-		ip->i_d.di_mode |= mode & ~S_IFMT;
-
-		inode->i_mode &= S_IFMT;
-		inode->i_mode |= mode & ~S_IFMT;
-	}
+	if (mask & ATTR_MODE)
+		xfs_setattr_mode(tp, ip, iattr);
 
 	/*
 	 * Change file access or modified times.
@@ -714,9 +726,8 @@ xfs_setattr_size(
 		return XFS_ERROR(error);
 
 	ASSERT(S_ISREG(ip->i_d.di_mode));
-	ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
-			ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID|
-			ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
+	ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
+			ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
 
 	if (!(flags & XFS_ATTR_NOLOCK)) {
 		lock_flags |= XFS_IOLOCK_EXCL;
@@ -860,6 +871,12 @@ xfs_setattr_size(
 		xfs_inode_clear_eofblocks_tag(ip);
 	}
 
+	/*
+	 * Change file access modes.
+	 */
+	if (mask & ATTR_MODE)
+		xfs_setattr_mode(tp, ip, iattr);
+
 	if (mask & ATTR_CTIME) {
 		inode->i_ctime = iattr->ia_ctime;
 		ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
@@ -970,7 +987,8 @@ xfs_fiemap_format(
 	if (bmv->bmv_oflags & BMV_OF_PREALLOC)
 		fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
 	else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
-		fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
+		fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
+				 FIEMAP_EXTENT_UNKNOWN);
 		physical = 0;   /* no block yet */
 	}
 	if (bmv->bmv_oflags & BMV_OF_LAST)
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 2ea7d402188..bc92c5306a1 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -43,7 +43,7 @@ xfs_internal_inum(
 {
 	return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
 		(xfs_sb_version_hasquota(&mp->m_sb) &&
-		 (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
+		 xfs_is_quota_inode(&mp->m_sb, ino)));
 }
 
 /*
@@ -383,11 +383,13 @@ xfs_bulkstat(
 			 * Also start read-ahead now for this chunk.
 			 */
 			if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
+				struct blk_plug	plug;
 				/*
 				 * Loop over all clusters in the next chunk.
 				 * Do a readahead if there are any allocated
 				 * inodes in that cluster.
 				 */
+				blk_start_plug(&plug);
 				agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
 				for (chunkidx = 0;
 				     chunkidx < XFS_INODES_PER_CHUNK;
@@ -399,6 +401,7 @@ xfs_bulkstat(
 							agbno, nbcluster,
 							&xfs_inode_buf_ops);
 				}
+				blk_finish_plug(&plug);
 				irbp->ir_startino = r.ir_startino;
 				irbp->ir_freecount = r.ir_freecount;
 				irbp->ir_free = r.ir_free;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index fe7e4df85a7..800f896a6cc 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -72,6 +72,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/list_sort.h>
+#include <linux/ratelimit.h>
 
 #include <asm/page.h>
 #include <asm/div64.h>
@@ -292,22 +293,34 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
 #define ASSERT_ALWAYS(expr)	\
 	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
 
-#ifndef DEBUG
-#define ASSERT(expr)	((void)0)
+#ifdef DEBUG
+#define ASSERT(expr)	\
+	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
 
 #ifndef STATIC
-# define STATIC static noinline
+# define STATIC noinline
 #endif
 
-#else /* DEBUG */
+#else	/* !DEBUG */
+
+#ifdef XFS_WARN
 
 #define ASSERT(expr)	\
-	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+	(unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__))
 
 #ifndef STATIC
-# define STATIC noinline
+# define STATIC static noinline
+#endif
+
+#else	/* !DEBUG && !XFS_WARN */
+
+#define ASSERT(expr)	((void)0)
+
+#ifndef STATIC
+# define STATIC static noinline
 #endif
 
+#endif /* XFS_WARN */
 #endif /* DEBUG */
 
 #endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 46bd9d52ab5..d852a2b3e1f 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -120,7 +120,7 @@ xlog_verify_iclog(
 	struct xlog		*log,
 	struct xlog_in_core	*iclog,
 	int			count,
-	boolean_t		syncing);
+	bool                    syncing);
 STATIC void
 xlog_verify_tail_lsn(
 	struct xlog		*log,
@@ -1737,7 +1737,7 @@ xlog_sync(
 	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
 	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
 
-	xlog_verify_iclog(log, iclog, count, B_TRUE);
+	xlog_verify_iclog(log, iclog, count, true);
 
 	/* account for log which doesn't start at block #0 */
 	XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
@@ -1963,6 +1963,10 @@ xlog_write_calc_vec_length(
 		headers++;
 
 	for (lv = log_vector; lv; lv = lv->lv_next) {
+		/* we don't write ordered log vectors */
+		if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
+			continue;
+
 		headers += lv->lv_niovecs;
 
 		for (i = 0; i < lv->lv_niovecs; i++) {
@@ -2216,7 +2220,7 @@ xlog_write(
 	index = 0;
 	lv = log_vector;
 	vecp = lv->lv_iovecp;
-	while (lv && index < lv->lv_niovecs) {
+	while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
 		void		*ptr;
 		int		log_offset;
 
@@ -2236,13 +2240,22 @@ xlog_write(
 		 * This loop writes out as many regions as can fit in the amount
 		 * of space which was allocated by xlog_state_get_iclog_space().
 		 */
-		while (lv && index < lv->lv_niovecs) {
-			struct xfs_log_iovec	*reg = &vecp[index];
+		while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
+			struct xfs_log_iovec	*reg;
 			struct xlog_op_header	*ophdr;
 			int			start_rec_copy;
 			int			copy_len;
 			int			copy_off;
+			bool			ordered = false;
+
+			/* ordered log vectors have no regions to write */
+			if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
+				ASSERT(lv->lv_niovecs == 0);
+				ordered = true;
+				goto next_lv;
+			}
 
+			reg = &vecp[index];
 			ASSERT(reg->i_len % sizeof(__int32_t) == 0);
 			ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
 
@@ -2302,12 +2315,13 @@ xlog_write(
 				break;
 
 			if (++index == lv->lv_niovecs) {
+next_lv:
 				lv = lv->lv_next;
 				index = 0;
 				if (lv)
 					vecp = lv->lv_iovecp;
 			}
-			if (record_cnt == 0) {
+			if (record_cnt == 0 && ordered == false) {
 				if (!lv)
 					return 0;
 				break;
@@ -3485,7 +3499,7 @@ xlog_ticket_alloc(
 	tic->t_curr_res		= unit_bytes;
 	tic->t_cnt		= cnt;
 	tic->t_ocnt		= cnt;
-	tic->t_tid		= random32();
+	tic->t_tid		= prandom_u32();
 	tic->t_clientid		= client;
 	tic->t_flags		= XLOG_TIC_INITED;
 	tic->t_trans_type	= 0;
@@ -3611,7 +3625,7 @@ xlog_verify_iclog(
 	struct xlog		*log,
 	struct xlog_in_core	*iclog,
 	int			count,
-	boolean_t		syncing)
+	bool                    syncing)
 {
 	xlog_op_header_t	*ophead;
 	xlog_in_core_t		*icptr;
@@ -3659,7 +3673,7 @@ xlog_verify_iclog(
 		/* clientid is only 1 byte */
 		field_offset = (__psint_t)
 			       ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
-		if (syncing == B_FALSE || (field_offset & 0x1ff)) {
+		if (!syncing || (field_offset & 0x1ff)) {
 			clientid = ophead->oh_clientid;
 		} else {
 			idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
@@ -3682,7 +3696,7 @@ xlog_verify_iclog(
 		/* check length */
 		field_offset = (__psint_t)
 			       ((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
-		if (syncing == B_FALSE || (field_offset & 0x1ff)) {
+		if (!syncing || (field_offset & 0x1ff)) {
 			op_len = be32_to_cpu(ophead->oh_len);
 		} else {
 			idx = BTOBBT((__psint_t)&ophead->oh_len -
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 5caee96059d..fb630e496c1 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -88,7 +88,8 @@ static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 #define XLOG_REG_TYPE_UNMOUNT		17
 #define XLOG_REG_TYPE_COMMIT		18
 #define XLOG_REG_TYPE_TRANSHDR		19
-#define XLOG_REG_TYPE_MAX		19
+#define XLOG_REG_TYPE_ICREATE		20
+#define XLOG_REG_TYPE_MAX		20
 
 typedef struct xfs_log_iovec {
 	void		*i_addr;	/* beginning address of region */
@@ -105,6 +106,8 @@ struct xfs_log_vec {
 	int			lv_buf_len;	/* size of formatted buffer */
 };
 
+#define XFS_LOG_VEC_ORDERED	(-1)
+
 /*
  * Structure used to pass callback function and the function's argument
  * to the log manager.
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ddc4529d07d..02b9cf3f825 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -127,6 +127,7 @@ xlog_cil_prepare_log_vecs(
 		int	index;
 		int	len = 0;
 		uint	niovecs;
+		bool	ordered = false;
 
 		/* Skip items which aren't dirty in this transaction. */
 		if (!(lidp->lid_flags & XFS_LID_DIRTY))
@@ -137,14 +138,30 @@ xlog_cil_prepare_log_vecs(
 		if (!niovecs)
 			continue;
 
+		/*
+		 * Ordered items need to be tracked but we do not wish to write
+		 * them. We need a logvec to track the object, but we do not
+		 * need an iovec or buffer to be allocated for copying data.
+		 */
+		if (niovecs == XFS_LOG_VEC_ORDERED) {
+			ordered = true;
+			niovecs = 0;
+		}
+
 		new_lv = kmem_zalloc(sizeof(*new_lv) +
 				niovecs * sizeof(struct xfs_log_iovec),
-				KM_SLEEP);
+				KM_SLEEP|KM_NOFS);
+
+		new_lv->lv_item = lidp->lid_item;
+		new_lv->lv_niovecs = niovecs;
+		if (ordered) {
+			/* track as an ordered logvec */
+			new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+			goto next;
+		}
 
 		/* The allocated iovec region lies beyond the log vector. */
 		new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
-		new_lv->lv_niovecs = niovecs;
-		new_lv->lv_item = lidp->lid_item;
 
 		/* build the vector array and calculate it's length */
 		IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
@@ -165,6 +182,7 @@ xlog_cil_prepare_log_vecs(
 		}
 		ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
 
+next:
 		if (!ret_lv)
 			ret_lv = new_lv;
 		else
@@ -191,8 +209,18 @@ xfs_cil_prepare_item(
 
 	if (old) {
 		/* existing lv on log item, space used is a delta */
-		ASSERT(!list_empty(&lv->lv_item->li_cil));
-		ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+		ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) ||
+			old->lv_buf_len == XFS_LOG_VEC_ORDERED);
+
+		/*
+		 * If the new item is ordered, keep the old one that is already
+		 * tracking dirty or ordered regions
+		 */
+		if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
+			ASSERT(!lv->lv_buf);
+			kmem_free(lv);
+			return;
+		}
 
 		*len += lv->lv_buf_len - old->lv_buf_len;
 		*diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
@@ -201,10 +229,11 @@ xfs_cil_prepare_item(
 	} else {
 		/* new lv, must pin the log item */
 		ASSERT(!lv->lv_item->li_lv);
-		ASSERT(list_empty(&lv->lv_item->li_cil));
 
-		*len += lv->lv_buf_len;
-		*diff_iovecs += lv->lv_niovecs;
+		if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+			*len += lv->lv_buf_len;
+			*diff_iovecs += lv->lv_niovecs;
+		}
 		IOP_PIN(lv->lv_item);
 
 	}
@@ -259,18 +288,24 @@ xlog_cil_insert_items(
 	 * We can do this safely because the context can't checkpoint until we
 	 * are done so it doesn't matter exactly how we update the CIL.
 	 */
-	for (lv = log_vector; lv; lv = lv->lv_next)
-		xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
-
-	/* account for space used by new iovec headers  */
-	len += diff_iovecs * sizeof(xlog_op_header_t);
-
 	spin_lock(&cil->xc_cil_lock);
+	for (lv = log_vector; lv; ) {
+		struct xfs_log_vec *next = lv->lv_next;
 
-	/* move the items to the tail of the CIL */
-	for (lv = log_vector; lv; lv = lv->lv_next)
+		ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil));
+		lv->lv_next = NULL;
+
+		/*
+		 * xfs_cil_prepare_item() may free the lv, so move the item on
+		 * the CIL first.
+		 */
 		list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
+		xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
+		lv = next;
+	}
 
+	/* account for space used by new iovec headers  */
+	len += diff_iovecs * sizeof(xlog_op_header_t);
 	ctx->nvecs += diff_iovecs;
 
 	/*
@@ -381,9 +416,7 @@ xlog_cil_push(
 	struct xfs_cil_ctx	*new_ctx;
 	struct xlog_in_core	*commit_iclog;
 	struct xlog_ticket	*tic;
-	int			num_lv;
 	int			num_iovecs;
-	int			len;
 	int			error = 0;
 	struct xfs_trans_header thdr;
 	struct xfs_log_iovec	lhdr;
@@ -428,12 +461,9 @@ xlog_cil_push(
 	 * side which is currently locked out by the flush lock.
 	 */
 	lv = NULL;
-	num_lv = 0;
 	num_iovecs = 0;
-	len = 0;
 	while (!list_empty(&cil->xc_cil)) {
 		struct xfs_log_item	*item;
-		int			i;
 
 		item = list_first_entry(&cil->xc_cil,
 					struct xfs_log_item, li_cil);
@@ -444,11 +474,7 @@ xlog_cil_push(
 			lv->lv_next = item->li_lv;
 		lv = item->li_lv;
 		item->li_lv = NULL;
-
-		num_lv++;
 		num_iovecs += lv->lv_niovecs;
-		for (i = 0; i < lv->lv_niovecs; i++)
-			len += lv->lv_iovecp[i].i_len;
 	}
 
 	/*
@@ -668,10 +694,6 @@ xlog_cil_push_foreground(
  * transaction to the checkpoint context so we carry the busy extents through
  * to checkpoint completion, and then unlock all the items in the transaction.
  *
- * For more specific information about the order of operations in
- * xfs_log_commit_cil() please refer to the comments in
- * xfs_trans_commit_iclog().
- *
  * Called with the context lock already held in read mode to lock out
  * background commit, returns without it held once background commits are
  * allowed again.
@@ -705,6 +727,7 @@ xfs_log_commit_cil(
 	if (commit_lsn)
 		*commit_lsn = log->l_cilp->xc_ctx->sequence;
 
+	/* xlog_cil_insert_items() destroys log_vector list */
 	xlog_cil_insert_items(log, log_vector, tp->t_ticket);
 
 	/* check we didn't blow the reservation */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 16d8d12ea3b..b9ea262dd1c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -468,7 +468,6 @@ struct xfs_cil {
  * threshold, yet give us plenty of space for aggregation on large logs.
  */
 #define XLOG_CIL_SPACE_LIMIT(log)	(log->l_logsize >> 3)
-#define XLOG_CIL_HARD_SPACE_LIMIT(log)	(3 * (log->l_logsize >> 4))
 
 /*
  * ticket grant locks, queues and accounting have their own cachlines
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 96fcbb85ff8..6fcc910a50b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -29,6 +29,7 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
@@ -44,6 +45,15 @@
 #include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_icreate_item.h"
+
+/* Need all the magic numbers and buffer ops structures from these headers */
+#include "xfs_symlink.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_attr_remote.h"
 
 STATIC int
 xlog_find_zeroed(
@@ -1442,9 +1452,8 @@ xlog_recover_find_tid(
 	xlog_tid_t		tid)
 {
 	xlog_recover_t		*trans;
-	struct hlist_node	*n;
 
-	hlist_for_each_entry(trans, n, head, r_list) {
+	hlist_for_each_entry(trans, head, r_list) {
 		if (trans->r_log_tid == tid)
 			return trans;
 	}
@@ -1591,10 +1600,53 @@ xlog_recover_add_to_trans(
 }
 
 /*
- * Sort the log items in the transaction. Cancelled buffers need
- * to be put first so they are processed before any items that might
- * modify the buffers. If they are cancelled, then the modifications
- * don't need to be replayed.
+ * Sort the log items in the transaction.
+ *
+ * The ordering constraints are defined by the inode allocation and unlink
+ * behaviour. The rules are:
+ *
+ *	1. Every item is only logged once in a given transaction. Hence it
+ *	   represents the last logged state of the item. Hence ordering is
+ *	   dependent on the order in which operations need to be performed so
+ *	   required initial conditions are always met.
+ *
+ *	2. Cancelled buffers are recorded in pass 1 in a separate table and
+ *	   there's nothing to replay from them so we can simply cull them
+ *	   from the transaction. However, we can't do that until after we've
+ *	   replayed all the other items because they may be dependent on the
+ *	   cancelled buffer and replaying the cancelled buffer can remove it
+ *	   form the cancelled buffer table. Hence they have tobe done last.
+ *
+ *	3. Inode allocation buffers must be replayed before inode items that
+ *	   read the buffer and replay changes into it. For filesystems using the
+ *	   ICREATE transactions, this means XFS_LI_ICREATE objects need to get
+ *	   treated the same as inode allocation buffers as they create and
+ *	   initialise the buffers directly.
+ *
+ *	4. Inode unlink buffers must be replayed after inode items are replayed.
+ *	   This ensures that inodes are completely flushed to the inode buffer
+ *	   in a "free" state before we remove the unlinked inode list pointer.
+ *
+ * Hence the ordering needs to be inode allocation buffers first, inode items
+ * second, inode unlink buffers third and cancelled buffers last.
+ *
+ * But there's a problem with that - we can't tell an inode allocation buffer
+ * apart from a regular buffer, so we can't separate them. We can, however,
+ * tell an inode unlink buffer from the others, and so we can separate them out
+ * from all the other buffers and move them to last.
+ *
+ * Hence, 4 lists, in order from head to tail:
+ *	- buffer_list for all buffers except cancelled/inode unlink buffers
+ *	- item_list for all non-buffer items
+ *	- inode_buffer_list for inode unlink buffers
+ *	- cancel_list for the cancelled buffers
+ *
+ * Note that we add objects to the tail of the lists so that first-to-last
+ * ordering is preserved within the lists. Adding objects to the head of the
+ * list means when we traverse from the head we walk them in last-to-first
+ * order. For cancelled buffers and inode unlink buffers this doesn't matter,
+ * but for all other items there may be specific ordering that we need to
+ * preserve.
  */
 STATIC int
 xlog_recover_reorder_trans(
@@ -1604,19 +1656,32 @@ xlog_recover_reorder_trans(
 {
 	xlog_recover_item_t	*item, *n;
 	LIST_HEAD(sort_list);
+	LIST_HEAD(cancel_list);
+	LIST_HEAD(buffer_list);
+	LIST_HEAD(inode_buffer_list);
+	LIST_HEAD(inode_list);
 
 	list_splice_init(&trans->r_itemq, &sort_list);
 	list_for_each_entry_safe(item, n, &sort_list, ri_list) {
 		xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
 
 		switch (ITEM_TYPE(item)) {
+		case XFS_LI_ICREATE:
+			list_move_tail(&item->ri_list, &buffer_list);
+			break;
 		case XFS_LI_BUF:
-			if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
+			if (buf_f->blf_flags & XFS_BLF_CANCEL) {
 				trace_xfs_log_recover_item_reorder_head(log,
 							trans, item, pass);
-				list_move(&item->ri_list, &trans->r_itemq);
+				list_move(&item->ri_list, &cancel_list);
+				break;
+			}
+			if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+				list_move(&item->ri_list, &inode_buffer_list);
 				break;
 			}
+			list_move_tail(&item->ri_list, &buffer_list);
+			break;
 		case XFS_LI_INODE:
 		case XFS_LI_DQUOT:
 		case XFS_LI_QUOTAOFF:
@@ -1624,7 +1689,7 @@ xlog_recover_reorder_trans(
 		case XFS_LI_EFI:
 			trace_xfs_log_recover_item_reorder_tail(log,
 							trans, item, pass);
-			list_move_tail(&item->ri_list, &trans->r_itemq);
+			list_move_tail(&item->ri_list, &inode_list);
 			break;
 		default:
 			xfs_warn(log->l_mp,
@@ -1635,6 +1700,14 @@ xlog_recover_reorder_trans(
 		}
 	}
 	ASSERT(list_empty(&sort_list));
+	if (!list_empty(&buffer_list))
+		list_splice(&buffer_list, &trans->r_itemq);
+	if (!list_empty(&inode_list))
+		list_splice_tail(&inode_list, &trans->r_itemq);
+	if (!list_empty(&inode_buffer_list))
+		list_splice_tail(&inode_buffer_list, &trans->r_itemq);
+	if (!list_empty(&cancel_list))
+		list_splice_tail(&cancel_list, &trans->r_itemq);
 	return 0;
 }
 
@@ -1787,6 +1860,13 @@ xlog_recover_do_inode_buffer(
 
 	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
 
+	/*
+	 * Post recovery validation only works properly on CRC enabled
+	 * filesystems.
+	 */
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		bp->b_ops = &xfs_inode_buf_ops;
+
 	inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
 	for (i = 0; i < inodes_per_buf; i++) {
 		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
@@ -1852,12 +1932,216 @@ xlog_recover_do_inode_buffer(
 		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
 					      next_unlinked_offset);
 		*buffer_nextp = *logged_nextp;
+
+		/*
+		 * If necessary, recalculate the CRC in the on-disk inode. We
+		 * have to leave the inode in a consistent state for whoever
+		 * reads it next....
+		 */
+		xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
+				xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+
 	}
 
 	return 0;
 }
 
 /*
+ * Validate the recovered buffer is of the correct type and attach the
+ * appropriate buffer operations to them for writeback. Magic numbers are in a
+ * few places:
+ *	the first 16 bits of the buffer (inode buffer, dquot buffer),
+ *	the first 32 bits of the buffer (most blocks),
+ *	inside a struct xfs_da_blkinfo at the start of the buffer.
+ */
+static void
+xlog_recovery_validate_buf_type(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	xfs_buf_log_format_t	*buf_f)
+{
+	struct xfs_da_blkinfo	*info = bp->b_addr;
+	__uint32_t		magic32;
+	__uint16_t		magic16;
+	__uint16_t		magicda;
+
+	magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
+	magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
+	magicda = be16_to_cpu(info->magic);
+	switch (xfs_blft_from_flags(buf_f)) {
+	case XFS_BLFT_BTREE_BUF:
+		switch (magic32) {
+		case XFS_ABTB_CRC_MAGIC:
+		case XFS_ABTC_CRC_MAGIC:
+		case XFS_ABTB_MAGIC:
+		case XFS_ABTC_MAGIC:
+			bp->b_ops = &xfs_allocbt_buf_ops;
+			break;
+		case XFS_IBT_CRC_MAGIC:
+		case XFS_IBT_MAGIC:
+			bp->b_ops = &xfs_inobt_buf_ops;
+			break;
+		case XFS_BMAP_CRC_MAGIC:
+		case XFS_BMAP_MAGIC:
+			bp->b_ops = &xfs_bmbt_buf_ops;
+			break;
+		default:
+			xfs_warn(mp, "Bad btree block magic!");
+			ASSERT(0);
+			break;
+		}
+		break;
+	case XFS_BLFT_AGF_BUF:
+		if (magic32 != XFS_AGF_MAGIC) {
+			xfs_warn(mp, "Bad AGF block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_agf_buf_ops;
+		break;
+	case XFS_BLFT_AGFL_BUF:
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			break;
+		if (magic32 != XFS_AGFL_MAGIC) {
+			xfs_warn(mp, "Bad AGFL block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_agfl_buf_ops;
+		break;
+	case XFS_BLFT_AGI_BUF:
+		if (magic32 != XFS_AGI_MAGIC) {
+			xfs_warn(mp, "Bad AGI block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_agi_buf_ops;
+		break;
+	case XFS_BLFT_UDQUOT_BUF:
+	case XFS_BLFT_PDQUOT_BUF:
+	case XFS_BLFT_GDQUOT_BUF:
+#ifdef CONFIG_XFS_QUOTA
+		if (magic16 != XFS_DQUOT_MAGIC) {
+			xfs_warn(mp, "Bad DQUOT block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_dquot_buf_ops;
+#else
+		xfs_alert(mp,
+	"Trying to recover dquots without QUOTA support built in!");
+		ASSERT(0);
+#endif
+		break;
+	case XFS_BLFT_DINO_BUF:
+		/*
+		 * we get here with inode allocation buffers, not buffers that
+		 * track unlinked list changes.
+		 */
+		if (magic16 != XFS_DINODE_MAGIC) {
+			xfs_warn(mp, "Bad INODE block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_inode_buf_ops;
+		break;
+	case XFS_BLFT_SYMLINK_BUF:
+		if (magic32 != XFS_SYMLINK_MAGIC) {
+			xfs_warn(mp, "Bad symlink block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_symlink_buf_ops;
+		break;
+	case XFS_BLFT_DIR_BLOCK_BUF:
+		if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
+		    magic32 != XFS_DIR3_BLOCK_MAGIC) {
+			xfs_warn(mp, "Bad dir block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_dir3_block_buf_ops;
+		break;
+	case XFS_BLFT_DIR_DATA_BUF:
+		if (magic32 != XFS_DIR2_DATA_MAGIC &&
+		    magic32 != XFS_DIR3_DATA_MAGIC) {
+			xfs_warn(mp, "Bad dir data magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_dir3_data_buf_ops;
+		break;
+	case XFS_BLFT_DIR_FREE_BUF:
+		if (magic32 != XFS_DIR2_FREE_MAGIC &&
+		    magic32 != XFS_DIR3_FREE_MAGIC) {
+			xfs_warn(mp, "Bad dir3 free magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_dir3_free_buf_ops;
+		break;
+	case XFS_BLFT_DIR_LEAF1_BUF:
+		if (magicda != XFS_DIR2_LEAF1_MAGIC &&
+		    magicda != XFS_DIR3_LEAF1_MAGIC) {
+			xfs_warn(mp, "Bad dir leaf1 magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+		break;
+	case XFS_BLFT_DIR_LEAFN_BUF:
+		if (magicda != XFS_DIR2_LEAFN_MAGIC &&
+		    magicda != XFS_DIR3_LEAFN_MAGIC) {
+			xfs_warn(mp, "Bad dir leafn magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_dir3_leafn_buf_ops;
+		break;
+	case XFS_BLFT_DA_NODE_BUF:
+		if (magicda != XFS_DA_NODE_MAGIC &&
+		    magicda != XFS_DA3_NODE_MAGIC) {
+			xfs_warn(mp, "Bad da node magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_da3_node_buf_ops;
+		break;
+	case XFS_BLFT_ATTR_LEAF_BUF:
+		if (magicda != XFS_ATTR_LEAF_MAGIC &&
+		    magicda != XFS_ATTR3_LEAF_MAGIC) {
+			xfs_warn(mp, "Bad attr leaf magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_attr3_leaf_buf_ops;
+		break;
+	case XFS_BLFT_ATTR_RMT_BUF:
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			break;
+		if (magic32 != XFS_ATTR3_RMT_MAGIC) {
+			xfs_warn(mp, "Bad attr remote magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_attr3_rmt_buf_ops;
+		break;
+	case XFS_BLFT_SB_BUF:
+		if (magic32 != XFS_SB_MAGIC) {
+			xfs_warn(mp, "Bad SB block magic!");
+			ASSERT(0);
+			break;
+		}
+		bp->b_ops = &xfs_sb_buf_ops;
+		break;
+	default:
+		xfs_warn(mp, "Unknown buffer type %d!",
+			 xfs_blft_from_flags(buf_f));
+		break;
+	}
+}
+
+/*
  * Perform a 'normal' buffer recovery.  Each logged region of the
  * buffer should be copied over the corresponding region in the
  * given buffer.  The bitmap in the buf log format structure indicates
@@ -1893,6 +2177,17 @@ xlog_recover_do_reg_buffer(
 		       ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
 
 		/*
+		 * The dirty regions logged in the buffer, even though
+		 * contiguous, may span multiple chunks. This is because the
+		 * dirty region may span a physical page boundary in a buffer
+		 * and hence be split into two separate vectors for writing into
+		 * the log. Hence we need to trim nbits back to the length of
+		 * the current region being copied out of the log.
+		 */
+		if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
+			nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
+
+		/*
 		 * Do a sanity check if this is a dquot buffer. Just checking
 		 * the first dquot in the buffer should do. XXXThis is
 		 * probably a good thing to do for other buf types also.
@@ -1929,6 +2224,17 @@ xlog_recover_do_reg_buffer(
 
 	/* Shouldn't be any more regions */
 	ASSERT(i == item->ri_total);
+
+	/*
+	 * We can only do post recovery validation on items on CRC enabled
+	 * fielsystems as we need to know when the buffer was written to be able
+	 * to determine if we should have replayed the item. If we replay old
+	 * metadata over a newer buffer, then it will enter a temporarily
+	 * inconsistent state resulting in verification failures. Hence for now
+	 * just avoid the verification stage for non-crc filesystems
+	 */
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xlog_recovery_validate_buf_type(mp, bp, buf_f);
 }
 
 /*
@@ -2049,6 +2355,12 @@ xfs_qm_dqcheck(
 	d->dd_diskdq.d_flags = type;
 	d->dd_diskdq.d_id = cpu_to_be32(id);
 
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+		xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+				 XFS_DQUOT_CRC_OFF);
+	}
+
 	return errs;
 }
 
@@ -2214,6 +2526,7 @@ xlog_recover_inode_pass2(
 	int			attr_index;
 	uint			fields;
 	xfs_icdinode_t		*dicp;
+	uint			isize;
 	int			need_free = 0;
 
 	if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
@@ -2239,7 +2552,7 @@ xlog_recover_inode_pass2(
 	trace_xfs_log_recover_inode_recover(log, in_f);
 
 	bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
-			  NULL);
+			  &xfs_inode_buf_ops);
 	if (!bp) {
 		error = ENOMEM;
 		goto error;
@@ -2350,7 +2663,8 @@ xlog_recover_inode_pass2(
 		error = EFSCORRUPTED;
 		goto error;
 	}
-	if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
+	isize = xfs_icdinode_size(dicp->di_version);
+	if (unlikely(item->ri_buf[1].i_len > isize)) {
 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
 				     XFS_ERRLEVEL_LOW, mp, dicp);
 		xfs_buf_relse(bp);
@@ -2362,13 +2676,13 @@ xlog_recover_inode_pass2(
 	}
 
 	/* The core is in in-core format */
-	xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
+	xfs_dinode_to_disk(dip, dicp);
 
 	/* the rest is in on-disk format */
-	if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
-		memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
-			item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
-			item->ri_buf[1].i_len  - sizeof(struct xfs_icdinode));
+	if (item->ri_buf[1].i_len > isize) {
+		memcpy((char *)dip + isize,
+			item->ri_buf[1].i_addr + isize,
+			item->ri_buf[1].i_len - isize);
 	}
 
 	fields = in_f->ilf_fields;
@@ -2452,6 +2766,9 @@ xlog_recover_inode_pass2(
 	}
 
 write_inode_buffer:
+	/* re-generate the checksum. */
+	xfs_dinode_calc_crc(log->l_mp, dip);
+
 	ASSERT(bp->b_target->bt_mount == mp);
 	bp->b_iodone = xlog_recover_iodone;
 	xfs_buf_delwri_queue(bp, buffer_list);
@@ -2571,6 +2888,10 @@ xlog_recover_dquot_pass2(
 	}
 
 	memcpy(ddq, recddq, item->ri_buf[1].i_len);
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
+				 XFS_DQUOT_CRC_OFF);
+	}
 
 	ASSERT(dq_f->qlf_size == 2);
 	ASSERT(bp->b_target->bt_mount == mp);
@@ -2675,6 +2996,93 @@ xlog_recover_efd_pass2(
 }
 
 /*
+ * This routine is called when an inode create format structure is found in a
+ * committed transaction in the log.  It's purpose is to initialise the inodes
+ * being allocated on disk. This requires us to get inode cluster buffers that
+ * match the range to be intialised, stamped with inode templates and written
+ * by delayed write so that subsequent modifications will hit the cached buffer
+ * and only need writing out at the end of recovery.
+ */
+STATIC int
+xlog_recover_do_icreate_pass2(
+	struct xlog		*log,
+	struct list_head	*buffer_list,
+	xlog_recover_item_t	*item)
+{
+	struct xfs_mount	*mp = log->l_mp;
+	struct xfs_icreate_log	*icl;
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+	unsigned int		count;
+	unsigned int		isize;
+	xfs_agblock_t		length;
+
+	icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+	if (icl->icl_type != XFS_LI_ICREATE) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
+		return EINVAL;
+	}
+
+	if (icl->icl_size != 1) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
+		return EINVAL;
+	}
+
+	agno = be32_to_cpu(icl->icl_ag);
+	if (agno >= mp->m_sb.sb_agcount) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
+		return EINVAL;
+	}
+	agbno = be32_to_cpu(icl->icl_agbno);
+	if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
+		return EINVAL;
+	}
+	isize = be32_to_cpu(icl->icl_isize);
+	if (isize != mp->m_sb.sb_inodesize) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
+		return EINVAL;
+	}
+	count = be32_to_cpu(icl->icl_count);
+	if (!count) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
+		return EINVAL;
+	}
+	length = be32_to_cpu(icl->icl_length);
+	if (!length || length >= mp->m_sb.sb_agblocks) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
+		return EINVAL;
+	}
+
+	/* existing allocation is fixed value */
+	ASSERT(count == XFS_IALLOC_INODES(mp));
+	ASSERT(length == XFS_IALLOC_BLOCKS(mp));
+	if (count != XFS_IALLOC_INODES(mp) ||
+	     length != XFS_IALLOC_BLOCKS(mp)) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+		return EINVAL;
+	}
+
+	/*
+	 * Inode buffers can be freed. Do not replay the inode initialisation as
+	 * we could be overwriting something written after this inode buffer was
+	 * cancelled.
+	 *
+	 * XXX: we need to iterate all buffers and only init those that are not
+	 * cancelled. I think that a more fine grained factoring of
+	 * xfs_ialloc_inode_init may be appropriate here to enable this to be
+	 * done easily.
+	 */
+	if (xlog_check_buffer_cancelled(log,
+			XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
+		return 0;
+
+	xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
+					be32_to_cpu(icl->icl_gen));
+	return 0;
+}
+
+/*
  * Free up any resources allocated by the transaction
  *
  * Remember that EFIs, EFDs, and IUNLINKs are handled later.
@@ -2716,6 +3124,7 @@ xlog_recover_commit_pass1(
 	case XFS_LI_EFI:
 	case XFS_LI_EFD:
 	case XFS_LI_DQUOT:
+	case XFS_LI_ICREATE:
 		/* nothing to do in pass 1 */
 		return 0;
 	default:
@@ -2746,6 +3155,8 @@ xlog_recover_commit_pass2(
 		return xlog_recover_efd_pass2(log, item);
 	case XFS_LI_DQUOT:
 		return xlog_recover_dquot_pass2(log, buffer_list, item);
+	case XFS_LI_ICREATE:
+		return xlog_recover_do_icreate_pass2(log, buffer_list, item);
 	case XFS_LI_QUOTAOFF:
 		/* nothing to do in pass2 */
 		return 0;
@@ -2949,6 +3360,7 @@ xlog_recover_process_efi(
 			 * This will pull the EFI from the AIL and
 			 * free the memory associated with it.
 			 */
+			set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
 			xfs_efi_release(efip, efip->efi_format.efi_nextents);
 			return XFS_ERROR(EIO);
 		}
@@ -3752,6 +4164,25 @@ xlog_recover(
 			return error;
 		}
 
+		/*
+		 * Version 5 superblock log feature mask validation. We know the
+		 * log is dirty so check if there are any unknown log features
+		 * in what we need to recover. If there are unknown features
+		 * (e.g. unsupported transactions, then simply reject the
+		 * attempt at recovery before touching anything.
+		 */
+		if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
+		    xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
+					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
+			xfs_warn(log->l_mp,
+"Superblock has unknown incompatible log features (0x%x) enabled.\n"
+"The log can not be fully and/or safely recovered by this kernel.\n"
+"Please recover the log on a kernel that supports the unknown features.",
+				(log->l_mp->m_sb.sb_features_log_incompat &
+					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+			return EINVAL;
+		}
+
 		xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
 				log->l_mp->m_logname ? log->l_mp->m_logname
 						     : "internal");
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 331cd9f83a7..9163dc14053 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -93,6 +93,14 @@ xfs_alert_tag(
 }
 
 void
+asswarn(char *expr, char *file, int line)
+{
+	xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d",
+		expr, file, line);
+	WARN_ON(1);
+}
+
+void
 assfail(char *expr, char *file, int line)
 {
 	xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 56dc0c17f16..85401155750 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -30,7 +30,34 @@ void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
 }
 #endif
 
+#define xfs_printk_ratelimited(func, dev, fmt, ...)		\
+do {									\
+	static DEFINE_RATELIMIT_STATE(_rs,				\
+				      DEFAULT_RATELIMIT_INTERVAL,	\
+				      DEFAULT_RATELIMIT_BURST);		\
+	if (__ratelimit(&_rs))						\
+		func(dev, fmt, ##__VA_ARGS__);			\
+} while (0)
+
+#define xfs_emerg_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__)
+#define xfs_alert_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_alert, dev, fmt, ##__VA_ARGS__)
+#define xfs_crit_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_crit, dev, fmt, ##__VA_ARGS__)
+#define xfs_err_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_err, dev, fmt, ##__VA_ARGS__)
+#define xfs_warn_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_warn, dev, fmt, ##__VA_ARGS__)
+#define xfs_notice_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_notice, dev, fmt, ##__VA_ARGS__)
+#define xfs_info_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_info, dev, fmt, ##__VA_ARGS__)
+#define xfs_debug_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__)
+
 extern void assfail(char *expr, char *f, int l);
+extern void asswarn(char *expr, char *f, int l);
 
 extern void xfs_hex_dump(void *p, int length);
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 7d6df7c00c3..2b0ba358165 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,6 +43,8 @@
 #include "xfs_utils.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
 
 
 #ifdef HAVE_PERCPU_SB
@@ -109,6 +111,14 @@ static const struct {
     { offsetof(xfs_sb_t, sb_logsunit),	 0 },
     { offsetof(xfs_sb_t, sb_features2),	 0 },
     { offsetof(xfs_sb_t, sb_bad_features2), 0 },
+    { offsetof(xfs_sb_t, sb_features_compat), 0 },
+    { offsetof(xfs_sb_t, sb_features_ro_compat), 0 },
+    { offsetof(xfs_sb_t, sb_features_incompat), 0 },
+    { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
+    { offsetof(xfs_sb_t, sb_crc),	 0 },
+    { offsetof(xfs_sb_t, sb_pad),	 0 },
+    { offsetof(xfs_sb_t, sb_pquotino),	 0 },
+    { offsetof(xfs_sb_t, sb_lsn),	 0 },
     { sizeof(xfs_sb_t),			 0 }
 };
 
@@ -304,7 +314,8 @@ STATIC int
 xfs_mount_validate_sb(
 	xfs_mount_t	*mp,
 	xfs_sb_t	*sbp,
-	bool		check_inprogress)
+	bool		check_inprogress,
+	bool		check_version)
 {
 
 	/*
@@ -319,11 +330,63 @@ xfs_mount_validate_sb(
 		return XFS_ERROR(EWRONGFS);
 	}
 
+
 	if (!xfs_sb_good_version(sbp)) {
 		xfs_warn(mp, "bad version");
 		return XFS_ERROR(EWRONGFS);
 	}
 
+	if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) &&
+			(sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
+				XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) {
+		xfs_notice(mp,
+"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n");
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	/*
+	 * Version 5 superblock feature mask validation. Reject combinations the
+	 * kernel cannot support up front before checking anything else. For
+	 * write validation, we don't need to check feature masks.
+	 */
+	if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
+		xfs_alert(mp,
+"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
+"Use of these features in this kernel is at your own risk!");
+
+		if (xfs_sb_has_compat_feature(sbp,
+					XFS_SB_FEAT_COMPAT_UNKNOWN)) {
+			xfs_warn(mp,
+"Superblock has unknown compatible features (0x%x) enabled.\n"
+"Using a more recent kernel is recommended.",
+				(sbp->sb_features_compat &
+						XFS_SB_FEAT_COMPAT_UNKNOWN));
+		}
+
+		if (xfs_sb_has_ro_compat_feature(sbp,
+					XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+			xfs_alert(mp,
+"Superblock has unknown read-only compatible features (0x%x) enabled.",
+				(sbp->sb_features_ro_compat &
+						XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+			if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+				xfs_warn(mp,
+"Attempted to mount read-only compatible filesystem read-write.\n"
+"Filesystem can only be safely mounted read only.");
+				return XFS_ERROR(EINVAL);
+			}
+		}
+		if (xfs_sb_has_incompat_feature(sbp,
+					XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
+			xfs_warn(mp,
+"Superblock has unknown incompatible features (0x%x) enabled.\n"
+"Filesystem can not be safely mounted by this kernel.",
+				(sbp->sb_features_incompat &
+						XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+			return XFS_ERROR(EINVAL);
+		}
+	}
+
 	if (unlikely(
 	    sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
 		xfs_warn(mp,
@@ -506,6 +569,18 @@ out_unwind:
 	return error;
 }
 
+static void
+xfs_sb_quota_from_disk(struct xfs_sb *sbp)
+{
+	if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
+		sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+					XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
+	if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
+		sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
+					XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
+	sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
+}
+
 void
 xfs_sb_from_disk(
 	struct xfs_sb	*to,
@@ -557,6 +632,43 @@ xfs_sb_from_disk(
 	to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
 	to->sb_features2 = be32_to_cpu(from->sb_features2);
 	to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
+	to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
+	to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
+	to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
+	to->sb_features_log_incompat =
+				be32_to_cpu(from->sb_features_log_incompat);
+	to->sb_pad = 0;
+	to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
+	to->sb_lsn = be64_to_cpu(from->sb_lsn);
+}
+
+static inline void
+xfs_sb_quota_to_disk(
+	xfs_dsb_t	*to,
+	xfs_sb_t	*from,
+	__int64_t	*fields)
+{
+	__uint16_t	qflags = from->sb_qflags;
+
+	if (*fields & XFS_SB_QFLAGS) {
+		/*
+		 * The in-core version of sb_qflags do not have
+		 * XFS_OQUOTA_* flags, whereas the on-disk version
+		 * does.  So, convert incore XFS_{PG}QUOTA_* flags
+		 * to on-disk XFS_OQUOTA_* flags.
+		 */
+		qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+				XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
+
+		if (from->sb_qflags &
+				(XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+			qflags |= XFS_OQUOTA_ENFD;
+		if (from->sb_qflags &
+				(XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+			qflags |= XFS_OQUOTA_CHKD;
+		to->sb_qflags = cpu_to_be16(qflags);
+		*fields &= ~XFS_SB_QFLAGS;
+	}
 }
 
 /*
@@ -580,6 +692,7 @@ xfs_sb_to_disk(
 	if (!fields)
 		return;
 
+	xfs_sb_quota_to_disk(to, from, &fields);
 	while (fields) {
 		f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
 		first = xfs_sb_info[f].offset;
@@ -612,13 +725,13 @@ xfs_sb_to_disk(
 	}
 }
 
-static void
+static int
 xfs_sb_verify(
-	struct xfs_buf	*bp)
+	struct xfs_buf	*bp,
+	bool		check_version)
 {
 	struct xfs_mount *mp = bp->b_target->bt_mount;
 	struct xfs_sb	sb;
-	int		error;
 
 	xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
 
@@ -626,16 +739,47 @@ xfs_sb_verify(
 	 * Only check the in progress field for the primary superblock as
 	 * mkfs.xfs doesn't clear it from secondary superblocks.
 	 */
-	error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
-	if (error)
-		xfs_buf_ioerror(bp, error);
+	return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
+				     check_version);
 }
 
+/*
+ * If the superblock has the CRC feature bit set or the CRC field is non-null,
+ * check that the CRC is valid.  We check the CRC field is non-null because a
+ * single bit error could clear the feature bit and unused parts of the
+ * superblock are supposed to be zero. Hence a non-null crc field indicates that
+ * we've potentially lost a feature bit and we should check it anyway.
+ */
 static void
 xfs_sb_read_verify(
 	struct xfs_buf	*bp)
 {
-	xfs_sb_verify(bp);
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_dsb	*dsb = XFS_BUF_TO_SBP(bp);
+	int		error;
+
+	/*
+	 * open code the version check to avoid needing to convert the entire
+	 * superblock from disk order just to check the version number
+	 */
+	if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
+	    (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
+						XFS_SB_VERSION_5) ||
+	     dsb->sb_crc != 0)) {
+
+		if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize),
+				      offsetof(struct xfs_sb, sb_crc))) {
+			error = EFSCORRUPTED;
+			goto out_error;
+		}
+	}
+	error = xfs_sb_verify(bp, true);
+
+out_error:
+	if (error) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, error);
+	}
 }
 
 /*
@@ -648,11 +792,10 @@ static void
 xfs_sb_quiet_read_verify(
 	struct xfs_buf	*bp)
 {
-	struct xfs_sb	sb;
+	struct xfs_dsb	*dsb = XFS_BUF_TO_SBP(bp);
 
-	xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
 
-	if (sb.sb_magicnum == XFS_SB_MAGIC) {
+	if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
 		/* XFS filesystem, verify noisily! */
 		xfs_sb_read_verify(bp);
 		return;
@@ -663,9 +806,27 @@ xfs_sb_quiet_read_verify(
 
 static void
 xfs_sb_write_verify(
-	struct xfs_buf	*bp)
+	struct xfs_buf		*bp)
 {
-	xfs_sb_verify(bp);
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	int			error;
+
+	error = xfs_sb_verify(bp, false);
+	if (error) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, error);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+			 offsetof(struct xfs_sb, sb_crc));
 }
 
 const struct xfs_buf_ops xfs_sb_buf_ops = {
@@ -687,7 +848,8 @@ int
 xfs_readsb(xfs_mount_t *mp, int flags)
 {
 	unsigned int	sector_size;
-	xfs_buf_t	*bp;
+	struct xfs_buf	*bp;
+	struct xfs_sb	*sbp = &mp->m_sb;
 	int		error;
 	int		loud = !(flags & XFS_MFSI_QUIET);
 
@@ -714,7 +876,7 @@ reread:
 	if (bp->b_error) {
 		error = bp->b_error;
 		if (loud)
-			xfs_warn(mp, "SB validate failed");
+			xfs_warn(mp, "SB validate failed with error %d.", error);
 		goto release_buf;
 	}
 
@@ -723,13 +885,14 @@ reread:
 	 */
 	xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
 
+	xfs_sb_quota_from_disk(&mp->m_sb);
 	/*
 	 * We must be able to do sector-sized and sector-aligned IO.
 	 */
-	if (sector_size > mp->m_sb.sb_sectsize) {
+	if (sector_size > sbp->sb_sectsize) {
 		if (loud)
 			xfs_warn(mp, "device supports %u byte sectors (not %u)",
-				sector_size, mp->m_sb.sb_sectsize);
+				sector_size, sbp->sb_sectsize);
 		error = ENOSYS;
 		goto release_buf;
 	}
@@ -738,15 +901,18 @@ reread:
 	 * If device sector size is smaller than the superblock size,
 	 * re-read the superblock so the buffer is correctly sized.
 	 */
-	if (sector_size < mp->m_sb.sb_sectsize) {
+	if (sector_size < sbp->sb_sectsize) {
 		xfs_buf_relse(bp);
-		sector_size = mp->m_sb.sb_sectsize;
+		sector_size = sbp->sb_sectsize;
 		goto reread;
 	}
 
 	/* Initialize per-cpu counters */
 	xfs_icsb_reinit_counters(mp);
 
+	/* no need to be quiet anymore, so reset the buf ops */
+	bp->b_ops = &xfs_sb_buf_ops;
+
 	mp->m_sb_bp = bp;
 	xfs_buf_unlock(bp);
 	return 0;
@@ -872,42 +1038,27 @@ xfs_update_alignment(xfs_mount_t *mp)
 		 */
 		if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
 		    (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
-			if (mp->m_flags & XFS_MOUNT_RETERR) {
-				xfs_warn(mp, "alignment check failed: "
-					 "(sunit/swidth vs. blocksize)");
-				return XFS_ERROR(EINVAL);
-			}
-			mp->m_dalign = mp->m_swidth = 0;
+			xfs_warn(mp,
+		"alignment check failed: sunit/swidth vs. blocksize(%d)",
+				sbp->sb_blocksize);
+			return XFS_ERROR(EINVAL);
 		} else {
 			/*
 			 * Convert the stripe unit and width to FSBs.
 			 */
 			mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
 			if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
-				if (mp->m_flags & XFS_MOUNT_RETERR) {
-					xfs_warn(mp, "alignment check failed: "
-						 "(sunit/swidth vs. ag size)");
-					return XFS_ERROR(EINVAL);
-				}
 				xfs_warn(mp,
-		"stripe alignment turned off: sunit(%d)/swidth(%d) "
-		"incompatible with agsize(%d)",
-					mp->m_dalign, mp->m_swidth,
-					sbp->sb_agblocks);
-
-				mp->m_dalign = 0;
-				mp->m_swidth = 0;
+			"alignment check failed: sunit/swidth vs. agsize(%d)",
+					 sbp->sb_agblocks);
+				return XFS_ERROR(EINVAL);
 			} else if (mp->m_dalign) {
 				mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
 			} else {
-				if (mp->m_flags & XFS_MOUNT_RETERR) {
-					xfs_warn(mp, "alignment check failed: "
-						"sunit(%d) less than bsize(%d)",
-						mp->m_dalign,
-						mp->m_blockmask +1);
-					return XFS_ERROR(EINVAL);
-				}
-				mp->m_swidth = 0;
+				xfs_warn(mp,
+			"alignment check failed: sunit(%d) less than bsize(%d)",
+					 mp->m_dalign, sbp->sb_blocksize);
+				return XFS_ERROR(EINVAL);
 			}
 		}
 
@@ -924,6 +1075,10 @@ xfs_update_alignment(xfs_mount_t *mp)
 				sbp->sb_width = mp->m_swidth;
 				mp->m_update_flags |= XFS_SB_WIDTH;
 			}
+		} else {
+			xfs_warn(mp,
+	"cannot change alignment: superblock does not support data alignment");
+			return XFS_ERROR(EINVAL);
 		}
 	} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
 		    xfs_sb_version_hasdalign(&mp->m_sb)) {
@@ -1109,8 +1264,8 @@ xfs_mount_reset_sbqflags(
 		return 0;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
-				      XFS_DEFAULT_LOG_COUNT);
+	error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp),
+				  0, 0, XFS_DEFAULT_LOG_COUNT);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
 		xfs_alert(mp, "%s: Superblock update failed!", __func__);
@@ -1583,8 +1738,8 @@ xfs_log_sbcount(xfs_mount_t *mp)
 		return 0;
 
 	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
-	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
-					XFS_DEFAULT_LOG_COUNT);
+	error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0,
+				  XFS_DEFAULT_LOG_COUNT);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
 		return error;
@@ -1633,6 +1788,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
 	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
 	first = xfs_sb_info[f].offset;
 
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
 	xfs_trans_log_buf(tp, bp, first, last);
 }
 
@@ -1945,8 +2101,8 @@ xfs_mount_log_sb(
 			 XFS_SB_VERSIONNUM));
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
-	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
-				XFS_DEFAULT_LOG_COUNT);
+	error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0,
+				  XFS_DEFAULT_LOG_COUNT);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
 		return error;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index bab8314507e..4e374d4a918 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -34,12 +34,19 @@ typedef struct xfs_trans_reservations {
 	uint	tr_addafork;	/* cvt inode to attributed trans */
 	uint	tr_writeid;	/* write setuid/setgid file */
 	uint	tr_attrinval;	/* attr fork buffer invalidation */
-	uint	tr_attrset;	/* set/create an attribute */
+	uint	tr_attrsetm;	/* set/create an attribute at mount time */
+	uint	tr_attrsetrt;	/* set/create an attribute at runtime */
 	uint	tr_attrrm;	/* remove an attribute */
 	uint	tr_clearagi;	/* clear bad agi unlinked ino bucket */
 	uint	tr_growrtalloc;	/* grow realtime allocations */
 	uint	tr_growrtzero;	/* grow realtime zeroing */
 	uint	tr_growrtfree;	/* grow realtime freeing */
+	uint	tr_qm_sbchange;	/* change quota flags */
+	uint	tr_qm_setqlim;	/* adjust quota limits */
+	uint	tr_qm_dqalloc;	/* allocate quota on disk */
+	uint	tr_qm_quotaoff;	/* turn quota off */
+	uint	tr_qm_equotaoff;/* end of turn quota off */
+	uint	tr_sb;		/* modify superblock */
 } xfs_trans_reservations_t;
 
 #ifndef __KERNEL__
@@ -185,8 +192,6 @@ typedef struct xfs_mount {
 	xfs_dablk_t		m_dirleafblk;	/* blockno of dir non-data v2 */
 	xfs_dablk_t		m_dirfreeblk;	/* blockno of dirfreeindex v2 */
 	uint			m_chsize;	/* size of next field */
-	struct xfs_chash	*m_chash;	/* fs private inode per-cluster
-						 * hash table */
 	atomic_t		m_active_trans;	/* number trans frozen */
 #ifdef HAVE_PERCPU_SB
 	xfs_icsb_cnts_t __percpu *m_sb_cnts;	/* per-cpu superblock counters */
@@ -200,7 +205,6 @@ typedef struct xfs_mount {
 						     trimming */
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
-	struct shrinker		m_inode_shrink;	/* inode reclaim shrinker */
 	int64_t			m_low_space[XFS_LOWSP_MAX];
 						/* low free space thresholds */
 
@@ -223,8 +227,6 @@ typedef struct xfs_mount {
 						   operations, typically for
 						   disk errors in metadata */
 #define XFS_MOUNT_DISCARD	(1ULL << 5)	/* discard unused blocks */
-#define XFS_MOUNT_RETERR	(1ULL << 6)     /* return alignment errors to
-						   user */
 #define XFS_MOUNT_NOALIGN	(1ULL << 7)	/* turn off stripe alignment
 						   allocations */
 #define XFS_MOUNT_ATTR2		(1ULL << 8)	/* allow use of attr2 format */
@@ -385,6 +387,7 @@ extern void	xfs_set_low_space_thresholds(struct xfs_mount *);
 
 #endif	/* __KERNEL__ */
 
+extern void	xfs_sb_calc_crc(struct xfs_buf	*);
 extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);
 extern int	xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
 					xfs_agnumber_t *);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 60eff476315..7a3e007b49f 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -41,6 +41,7 @@
 #include "xfs_qm.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_cksum.h"
 
 /*
  * The global quota manager. There is only one of these for the entire
@@ -69,7 +70,7 @@ xfs_qm_dquot_walk(
 	void			*data)
 {
 	struct xfs_quotainfo	*qi = mp->m_quotainfo;
-	struct radix_tree_root	*tree = XFS_DQUOT_TREE(qi, type);
+	struct radix_tree_root	*tree = xfs_dquot_tree(qi, type);
 	uint32_t		next_index;
 	int			last_error = 0;
 	int			skipped;
@@ -188,7 +189,7 @@ xfs_qm_dqpurge(
 	xfs_dqfunlock(dqp);
 	xfs_dqunlock(dqp);
 
-	radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+	radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
 			  be32_to_cpu(dqp->q_core.d_id));
 	qi->qi_dquots--;
 
@@ -298,8 +299,10 @@ xfs_qm_mount_quotas(
 	 */
 	if (!XFS_IS_UQUOTA_ON(mp))
 		mp->m_qflags &= ~XFS_UQUOTA_CHKD;
-	if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
-		mp->m_qflags &= ~XFS_OQUOTA_CHKD;
+	if (!XFS_IS_GQUOTA_ON(mp))
+		mp->m_qflags &= ~XFS_GQUOTA_CHKD;
+	if (!XFS_IS_PQUOTA_ON(mp))
+		mp->m_qflags &= ~XFS_PQUOTA_CHKD;
 
  write_changes:
 	/*
@@ -488,8 +491,7 @@ xfs_qm_need_dqattach(
 		return false;
 	if (!XFS_NOT_DQATTACHED(mp, ip))
 		return false;
-	if (ip->i_ino == mp->m_sb.sb_uquotino ||
-	    ip->i_ino == mp->m_sb.sb_gquotino)
+	if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
 		return false;
 	return true;
 }
@@ -605,8 +607,7 @@ xfs_qm_dqdetach(
 
 	trace_xfs_dquot_dqdetach(ip);
 
-	ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
-	ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
+	ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino));
 	if (ip->i_udquot) {
 		xfs_qm_dqrele(ip->i_udquot);
 		ip->i_udquot = NULL;
@@ -617,6 +618,20 @@ xfs_qm_dqdetach(
 	}
 }
 
+int
+xfs_qm_calc_dquots_per_chunk(
+	struct xfs_mount	*mp,
+	unsigned int		nbblks)	/* basic block units */
+{
+	unsigned int	ndquots;
+
+	ASSERT(nbblks > 0);
+	ndquots = BBTOB(nbblks);
+	do_div(ndquots, sizeof(xfs_dqblk_t));
+
+	return ndquots;
+}
+
 /*
  * This initializes all the quota information that's kept in the
  * mount structure
@@ -656,9 +671,8 @@ xfs_qm_init_quotainfo(
 
 	/* Precalc some constants */
 	qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
-	ASSERT(qinf->qi_dqchunklen);
-	qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen);
-	do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t));
+	qinf->qi_dqperchunk = xfs_qm_calc_dquots_per_chunk(mp,
+							qinf->qi_dqchunklen);
 
 	mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
 
@@ -826,7 +840,7 @@ xfs_qm_reset_dqcounts(
 	xfs_dqid_t	id,
 	uint		type)
 {
-	xfs_disk_dquot_t	*ddq;
+	struct xfs_dqblk	*dqb;
 	int			j;
 
 	trace_xfs_reset_dqcounts(bp, _RET_IP_);
@@ -840,8 +854,12 @@ xfs_qm_reset_dqcounts(
 	do_div(j, sizeof(xfs_dqblk_t));
 	ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
 #endif
-	ddq = bp->b_addr;
+	dqb = bp->b_addr;
 	for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
+		struct xfs_disk_dquot	*ddq;
+
+		ddq = (struct xfs_disk_dquot *)&dqb[j];
+
 		/*
 		 * Do a sanity check, and if needed, repair the dqblk. Don't
 		 * output any warnings because it's perfectly possible to
@@ -858,7 +876,12 @@ xfs_qm_reset_dqcounts(
 		ddq->d_bwarns = 0;
 		ddq->d_iwarns = 0;
 		ddq->d_rtbwarns = 0;
-		ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
+
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			xfs_update_cksum((char *)&dqb[j],
+					 sizeof(struct xfs_dqblk),
+					 XFS_DQUOT_CRC_OFF);
+		}
 	}
 }
 
@@ -894,15 +917,29 @@ xfs_qm_dqiter_bufs(
 			      XFS_FSB_TO_DADDR(mp, bno),
 			      mp->m_quotainfo->qi_dqchunklen, 0, &bp,
 			      &xfs_dquot_buf_ops);
+
+		/*
+		 * CRC and validation errors will return a EFSCORRUPTED here. If
+		 * this occurs, re-read without CRC validation so that we can
+		 * repair the damage via xfs_qm_reset_dqcounts(). This process
+		 * will leave a trace in the log indicating corruption has
+		 * been detected.
+		 */
+		if (error == EFSCORRUPTED) {
+			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+				      XFS_FSB_TO_DADDR(mp, bno),
+				      mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+				      NULL);
+		}
+
 		if (error)
 			break;
 
 		xfs_qm_reset_dqcounts(mp, bp, firstid, type);
 		xfs_buf_delwri_queue(bp, buffer_list);
 		xfs_buf_relse(bp);
-		/*
-		 * goto the next block.
-		 */
+
+		/* goto the next block. */
 		bno++;
 		firstid += mp->m_quotainfo->qi_dqperchunk;
 	}
@@ -1057,7 +1094,7 @@ xfs_qm_quotacheck_dqadjust(
 	 * There are no timers for the default values set in the root dquot.
 	 */
 	if (dqp->q_core.d_id) {
-		xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
+		xfs_qm_adjust_dqlimits(mp, dqp);
 		xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
 	}
 
@@ -1115,7 +1152,7 @@ xfs_qm_dqusage_adjust(
 	 * rootino must have its resources accounted for, not so with the quota
 	 * inodes.
 	 */
-	if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
+	if (xfs_is_quota_inode(&mp->m_sb, ino)) {
 		*res = BULKSTAT_RV_NOTHING;
 		return XFS_ERROR(EINVAL);
 	}
@@ -1225,19 +1262,20 @@ int
 xfs_qm_quotacheck(
 	xfs_mount_t	*mp)
 {
-	int		done, count, error, error2;
-	xfs_ino_t	lastino;
-	size_t		structsz;
-	xfs_inode_t	*uip, *gip;
-	uint		flags;
-	LIST_HEAD	(buffer_list);
+	int			done, count, error, error2;
+	xfs_ino_t		lastino;
+	size_t			structsz;
+	uint			flags;
+	LIST_HEAD		(buffer_list);
+	struct xfs_inode	*uip = mp->m_quotainfo->qi_uquotaip;
+	struct xfs_inode	*gip = mp->m_quotainfo->qi_gquotaip;
 
 	count = INT_MAX;
 	structsz = 1;
 	lastino = 0;
 	flags = 0;
 
-	ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
+	ASSERT(uip || gip);
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
 	xfs_notice(mp, "Quotacheck needed: Please wait.");
@@ -1247,7 +1285,6 @@ xfs_qm_quotacheck(
 	 * their counters to zero. We need a clean slate.
 	 * We don't log our changes till later.
 	 */
-	uip = mp->m_quotainfo->qi_uquotaip;
 	if (uip) {
 		error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
 					 &buffer_list);
@@ -1256,14 +1293,14 @@ xfs_qm_quotacheck(
 		flags |= XFS_UQUOTA_CHKD;
 	}
 
-	gip = mp->m_quotainfo->qi_gquotaip;
 	if (gip) {
 		error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
 					 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
 					 &buffer_list);
 		if (error)
 			goto error_return;
-		flags |= XFS_OQUOTA_CHKD;
+		flags |= XFS_IS_GQUOTA_ON(mp) ?
+					XFS_GQUOTA_CHKD : XFS_PQUOTA_CHKD;
 	}
 
 	do {
@@ -1358,15 +1395,13 @@ STATIC int
 xfs_qm_init_quotainos(
 	xfs_mount_t	*mp)
 {
-	xfs_inode_t	*uip, *gip;
-	int		error;
-	__int64_t	sbflags;
-	uint		flags;
+	struct xfs_inode	*uip = NULL;
+	struct xfs_inode	*gip = NULL;
+	int			error;
+	__int64_t		sbflags = 0;
+	uint			flags = 0;
 
 	ASSERT(mp->m_quotainfo);
-	uip = gip = NULL;
-	sbflags = 0;
-	flags = 0;
 
 	/*
 	 * Get the uquota and gquota inodes
@@ -1375,19 +1410,18 @@ xfs_qm_init_quotainos(
 		if (XFS_IS_UQUOTA_ON(mp) &&
 		    mp->m_sb.sb_uquotino != NULLFSINO) {
 			ASSERT(mp->m_sb.sb_uquotino > 0);
-			if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
-					     0, 0, &uip)))
+			error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+					     0, 0, &uip);
+			if (error)
 				return XFS_ERROR(error);
 		}
 		if (XFS_IS_OQUOTA_ON(mp) &&
 		    mp->m_sb.sb_gquotino != NULLFSINO) {
 			ASSERT(mp->m_sb.sb_gquotino > 0);
-			if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
-					     0, 0, &gip))) {
-				if (uip)
-					IRELE(uip);
-				return XFS_ERROR(error);
-			}
+			error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+					     0, 0, &gip);
+			if (error)
+				goto error_rele;
 		}
 	} else {
 		flags |= XFS_QMOPT_SBVERSION;
@@ -1402,10 +1436,11 @@ xfs_qm_init_quotainos(
 	 * temporarily switch to read-write to do this.
 	 */
 	if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
-		if ((error = xfs_qm_qino_alloc(mp, &uip,
+		error = xfs_qm_qino_alloc(mp, &uip,
 					      sbflags | XFS_SB_UQUOTINO,
-					      flags | XFS_QMOPT_UQUOTA)))
-			return XFS_ERROR(error);
+					      flags | XFS_QMOPT_UQUOTA);
+		if (error)
+			goto error_rele;
 
 		flags &= ~XFS_QMOPT_SBVERSION;
 	}
@@ -1414,18 +1449,21 @@ xfs_qm_init_quotainos(
 				XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
 		error = xfs_qm_qino_alloc(mp, &gip,
 					  sbflags | XFS_SB_GQUOTINO, flags);
-		if (error) {
-			if (uip)
-				IRELE(uip);
-
-			return XFS_ERROR(error);
-		}
+		if (error)
+			goto error_rele;
 	}
 
 	mp->m_quotainfo->qi_uquotaip = uip;
 	mp->m_quotainfo->qi_gquotaip = gip;
 
 	return 0;
+
+error_rele:
+	if (uip)
+		IRELE(uip);
+	if (gip)
+		IRELE(gip);
+	return XFS_ERROR(error);
 }
 
 STATIC void
@@ -1436,7 +1474,7 @@ xfs_qm_dqfree_one(
 	struct xfs_quotainfo	*qi = mp->m_quotainfo;
 
 	mutex_lock(&qi->qi_tree_lock);
-	radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+	radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
 			  be32_to_cpu(dqp->q_core.d_id));
 
 	qi->qi_dquots--;
@@ -1584,10 +1622,9 @@ xfs_qm_write_sb_changes(
 	int		error;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-	if ((error = xfs_trans_reserve(tp, 0,
-				      mp->m_sb.sb_sectsize + 128, 0,
-				      0,
-				      XFS_DEFAULT_LOG_COUNT))) {
+	error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp),
+				  0, 0, XFS_DEFAULT_LOG_COUNT);
+	if (error) {
 		xfs_trans_cancel(tp, 0);
 		return error;
 	}
@@ -1623,7 +1660,8 @@ xfs_qm_vop_dqalloc(
 	struct xfs_dquot	**O_gdqpp)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_dquot	*uq, *gq;
+	struct xfs_dquot	*uq = NULL;
+	struct xfs_dquot	*gq = NULL;
 	int			error;
 	uint			lockflags;
 
@@ -1648,7 +1686,6 @@ xfs_qm_vop_dqalloc(
 		}
 	}
 
-	uq = gq = NULL;
 	if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
 		if (ip->i_d.di_uid != uid) {
 			/*
@@ -1661,11 +1698,12 @@ xfs_qm_vop_dqalloc(
 			 * holding ilock.
 			 */
 			xfs_iunlock(ip, lockflags);
-			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+			error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
 						 XFS_DQ_USER,
 						 XFS_QMOPT_DQALLOC |
 						 XFS_QMOPT_DOWARN,
-						 &uq))) {
+						 &uq);
+			if (error) {
 				ASSERT(error != ENOENT);
 				return error;
 			}
@@ -1687,15 +1725,14 @@ xfs_qm_vop_dqalloc(
 	if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
 		if (ip->i_d.di_gid != gid) {
 			xfs_iunlock(ip, lockflags);
-			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+			error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
 						 XFS_DQ_GROUP,
 						 XFS_QMOPT_DQALLOC |
 						 XFS_QMOPT_DOWARN,
-						 &gq))) {
-				if (uq)
-					xfs_qm_dqrele(uq);
+						 &gq);
+			if (error) {
 				ASSERT(error != ENOENT);
-				return error;
+				goto error_rele;
 			}
 			xfs_dqunlock(gq);
 			lockflags = XFS_ILOCK_SHARED;
@@ -1707,15 +1744,14 @@ xfs_qm_vop_dqalloc(
 	} else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
 		if (xfs_get_projid(ip) != prid) {
 			xfs_iunlock(ip, lockflags);
-			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
+			error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
 						 XFS_DQ_PROJ,
 						 XFS_QMOPT_DQALLOC |
 						 XFS_QMOPT_DOWARN,
-						 &gq))) {
-				if (uq)
-					xfs_qm_dqrele(uq);
+						 &gq);
+			if (error) {
 				ASSERT(error != ENOENT);
-				return (error);
+				goto error_rele;
 			}
 			xfs_dqunlock(gq);
 			lockflags = XFS_ILOCK_SHARED;
@@ -1738,6 +1774,11 @@ xfs_qm_vop_dqalloc(
 	else if (gq)
 		xfs_qm_dqrele(gq);
 	return 0;
+
+error_rele:
+	if (uq)
+		xfs_qm_dqrele(uq);
+	return error;
 }
 
 /*
@@ -1785,29 +1826,31 @@ xfs_qm_vop_chown(
  */
 int
 xfs_qm_vop_chown_reserve(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip,
-	xfs_dquot_t	*udqp,
-	xfs_dquot_t	*gdqp,
-	uint		flags)
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct xfs_dquot	*udqp,
+	struct xfs_dquot	*gdqp,
+	uint			flags)
 {
-	xfs_mount_t	*mp = ip->i_mount;
-	uint		delblks, blkflags, prjflags = 0;
-	xfs_dquot_t	*unresudq, *unresgdq, *delblksudq, *delblksgdq;
-	int		error;
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			delblks, blkflags, prjflags = 0;
+	struct xfs_dquot	*udq_unres = NULL;
+	struct xfs_dquot	*gdq_unres = NULL;
+	struct xfs_dquot	*udq_delblks = NULL;
+	struct xfs_dquot	*gdq_delblks = NULL;
+	int			error;
 
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
 	delblks = ip->i_delayed_blks;
-	delblksudq = delblksgdq = unresudq = unresgdq = NULL;
 	blkflags = XFS_IS_REALTIME_INODE(ip) ?
 			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
 
 	if (XFS_IS_UQUOTA_ON(mp) && udqp &&
 	    ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
-		delblksudq = udqp;
+		udq_delblks = udqp;
 		/*
 		 * If there are delayed allocation blocks, then we have to
 		 * unreserve those from the old dquot, and add them to the
@@ -1815,7 +1858,7 @@ xfs_qm_vop_chown_reserve(
 		 */
 		if (delblks) {
 			ASSERT(ip->i_udquot);
-			unresudq = ip->i_udquot;
+			udq_unres = ip->i_udquot;
 		}
 	}
 	if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
@@ -1826,18 +1869,19 @@ xfs_qm_vop_chown_reserve(
 		if (prjflags ||
 		    (XFS_IS_GQUOTA_ON(ip->i_mount) &&
 		     ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
-			delblksgdq = gdqp;
+			gdq_delblks = gdqp;
 			if (delblks) {
 				ASSERT(ip->i_gdquot);
-				unresgdq = ip->i_gdquot;
+				gdq_unres = ip->i_gdquot;
 			}
 		}
 	}
 
-	if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
-				delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
-				flags | blkflags | prjflags)))
-		return (error);
+	error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
+				udq_delblks, gdq_delblks, ip->i_d.di_nblocks, 1,
+				flags | blkflags | prjflags);
+	if (error)
+		return error;
 
 	/*
 	 * Do the delayed blks reservations/unreservations now. Since, these
@@ -1849,14 +1893,15 @@ xfs_qm_vop_chown_reserve(
 		/*
 		 * Do the reservations first. Unreservation can't fail.
 		 */
-		ASSERT(delblksudq || delblksgdq);
-		ASSERT(unresudq || unresgdq);
-		if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
-				delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
-				flags | blkflags | prjflags)))
-			return (error);
+		ASSERT(udq_delblks || gdq_delblks);
+		ASSERT(udq_unres || gdq_unres);
+		error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
+			    udq_delblks, gdq_delblks, (xfs_qcnt_t)delblks, 0,
+			    flags | blkflags | prjflags);
+		if (error)
+			return error;
 		xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
-				unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
+				udq_unres, gdq_unres, -((xfs_qcnt_t)delblks), 0,
 				blkflags);
 	}
 
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 44b858b79d7..bdb4f8b9520 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -69,28 +69,62 @@ typedef struct xfs_quotainfo {
 	struct shrinker  qi_shrinker;
 } xfs_quotainfo_t;
 
-#define XFS_DQUOT_TREE(qi, type) \
-	((type & XFS_DQ_USER) ? \
-	 &((qi)->qi_uquota_tree) : \
-	 &((qi)->qi_gquota_tree))
+static inline struct radix_tree_root *
+xfs_dquot_tree(
+	struct xfs_quotainfo	*qi,
+	int			type)
+{
+	switch (type) {
+	case XFS_DQ_USER:
+		return &qi->qi_uquota_tree;
+	case XFS_DQ_GROUP:
+	case XFS_DQ_PROJ:
+		return &qi->qi_gquota_tree;
+	default:
+		ASSERT(0);
+	}
+	return NULL;
+}
 
+static inline struct xfs_inode *
+xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
+{
+	switch (dqp->dq_flags & XFS_DQ_ALLTYPES) {
+	case XFS_DQ_USER:
+		return dqp->q_mount->m_quotainfo->qi_uquotaip;
+	case XFS_DQ_GROUP:
+	case XFS_DQ_PROJ:
+		return dqp->q_mount->m_quotainfo->qi_gquotaip;
+	default:
+		ASSERT(0);
+	}
+	return NULL;
+}
 
-extern void	xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
-extern int	xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
-			xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
-extern void	xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *);
-extern void	xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *);
+extern int	xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp,
+					     unsigned int nbblks);
+extern void	xfs_trans_mod_dquot(struct xfs_trans *,
+					struct xfs_dquot *, uint, long);
+extern int	xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+			struct xfs_mount *, struct xfs_dquot *,
+			struct xfs_dquot *, long, long, uint);
+extern void	xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
+extern void	xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
 
 /*
  * We keep the usr and grp dquots separately so that locking will be easier
  * to do at commit time. All transactions that we know of at this point
  * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
  */
+enum {
+	XFS_QM_TRANS_USR = 0,
+	XFS_QM_TRANS_GRP,
+	XFS_QM_TRANS_DQTYPES
+};
 #define XFS_QM_TRANS_MAXDQS		2
-typedef struct xfs_dquot_acct {
-	xfs_dqtrx_t	dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
-	xfs_dqtrx_t	dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
-} xfs_dquot_acct_t;
+struct xfs_dquot_acct {
+	struct xfs_dqtrx	dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
+};
 
 /*
  * Users are allowed to have a usage exceeding their softlimit for
@@ -104,22 +138,23 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_IWARNLIMIT	5
 #define XFS_QM_RTBWARNLIMIT	5
 
-extern void		xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern int		xfs_qm_quotacheck(xfs_mount_t *);
-extern int		xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
+extern void		xfs_qm_destroy_quotainfo(struct xfs_mount *);
+extern int		xfs_qm_quotacheck(struct xfs_mount *);
+extern int		xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
 
 /* dquot stuff */
-extern void		xfs_qm_dqpurge_all(xfs_mount_t *, uint);
-extern void		xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
+extern void		xfs_qm_dqpurge_all(struct xfs_mount *, uint);
+extern void		xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
 
 /* quota ops */
-extern int		xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
-extern int		xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
-					fs_disk_quota_t *);
-extern int		xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
-					fs_disk_quota_t *);
-extern int		xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
-extern int		xfs_qm_scall_quotaon(xfs_mount_t *, uint);
-extern int		xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
+extern int		xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
+extern int		xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
+					uint, struct fs_disk_quota *);
+extern int		xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
+					struct fs_disk_quota *);
+extern int		xfs_qm_scall_getqstat(struct xfs_mount *,
+					struct fs_quota_stat *);
+extern int		xfs_qm_scall_quotaon(struct xfs_mount *, uint);
+extern int		xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
 
 #endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 6b39115bf14..2d02eac1c9a 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -146,7 +146,7 @@ xfs_qm_newmount(
 			 * inode goes inactive and wants to free blocks,
 			 * or via xfs_log_mount_finish.
 			 */
-			*needquotamount = B_TRUE;
+			*needquotamount = true;
 			*quotaflags = mp->m_qflags;
 			mp->m_qflags = 0;
 		}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 8a59f854655..a08801ae24e 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -117,11 +117,11 @@ xfs_qm_scall_quotaoff(
 	}
 	if (flags & XFS_GQUOTA_ACCT) {
 		dqtype |= XFS_QMOPT_GQUOTA;
-		flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+		flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
 		inactivate_flags |= XFS_GQUOTA_ACTIVE;
 	} else if (flags & XFS_PQUOTA_ACCT) {
 		dqtype |= XFS_QMOPT_PQUOTA;
-		flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+		flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD);
 		inactivate_flags |= XFS_PQUOTA_ACTIVE;
 	}
 
@@ -335,14 +335,14 @@ xfs_qm_scall_quotaon(
 	 * quota acct on ondisk without m_qflags' knowing.
 	 */
 	if (((flags & XFS_UQUOTA_ACCT) == 0 &&
-	    (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
-	    (flags & XFS_UQUOTA_ENFD))
-	    ||
+	     (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+	     (flags & XFS_UQUOTA_ENFD)) ||
+	    ((flags & XFS_GQUOTA_ACCT) == 0 &&
+	     (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+	     (flags & XFS_GQUOTA_ENFD)) ||
 	    ((flags & XFS_PQUOTA_ACCT) == 0 &&
-	    (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
-	    (flags & XFS_GQUOTA_ACCT) == 0 &&
-	    (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
-	    (flags & XFS_OQUOTA_ENFD))) {
+	     (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
+	     (flags & XFS_PQUOTA_ENFD))) {
 		xfs_debug(mp,
 			"%s: Can't enforce without acct, flags=%x sbflags=%x\n",
 			__func__, flags, mp->m_sb.sb_qflags);
@@ -407,11 +407,11 @@ xfs_qm_scall_getqstat(
 	struct fs_quota_stat	*out)
 {
 	struct xfs_quotainfo	*q = mp->m_quotainfo;
-	struct xfs_inode	*uip, *gip;
-	boolean_t		tempuqip, tempgqip;
+	struct xfs_inode	*uip = NULL;
+	struct xfs_inode	*gip = NULL;
+	bool                    tempuqip = false;
+	bool                    tempgqip = false;
 
-	uip = gip = NULL;
-	tempuqip = tempgqip = B_FALSE;
 	memset(out, 0, sizeof(fs_quota_stat_t));
 
 	out->qs_version = FS_QSTAT_VERSION;
@@ -434,12 +434,12 @@ xfs_qm_scall_getqstat(
 	if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
 		if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
 					0, 0, &uip) == 0)
-			tempuqip = B_TRUE;
+			tempuqip = true;
 	}
 	if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
 		if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
 					0, 0, &gip) == 0)
-			tempgqip = B_TRUE;
+			tempgqip = true;
 	}
 	if (uip) {
 		out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
@@ -472,15 +472,15 @@ xfs_qm_scall_getqstat(
  */
 int
 xfs_qm_scall_setqlim(
-	xfs_mount_t		*mp,
+	struct xfs_mount	*mp,
 	xfs_dqid_t		id,
 	uint			type,
 	fs_disk_quota_t		*newlim)
 {
 	struct xfs_quotainfo	*q = mp->m_quotainfo;
-	xfs_disk_dquot_t	*ddq;
-	xfs_dquot_t		*dqp;
-	xfs_trans_t		*tp;
+	struct xfs_disk_dquot	*ddq;
+	struct xfs_dquot	*dqp;
+	struct xfs_trans	*tp;
 	int			error;
 	xfs_qcnt_t		hard, soft;
 
@@ -489,30 +489,36 @@ xfs_qm_scall_setqlim(
 	if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
 		return 0;
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
-	if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
-				      0, 0, XFS_DEFAULT_LOG_COUNT))) {
-		xfs_trans_cancel(tp, 0);
-		return (error);
-	}
-
 	/*
 	 * We don't want to race with a quotaoff so take the quotaoff lock.
-	 * (We don't hold an inode lock, so there's nothing else to stop
-	 * a quotaoff from happening). (XXXThis doesn't currently happen
-	 * because we take the vfslock before calling xfs_qm_sysent).
+	 * We don't hold an inode lock, so there's nothing else to stop
+	 * a quotaoff from happening.
 	 */
 	mutex_lock(&q->qi_quotaofflock);
 
 	/*
-	 * Get the dquot (locked), and join it to the transaction.
-	 * Allocate the dquot if this doesn't exist.
+	 * Get the dquot (locked) before we start, as we need to do a
+	 * transaction to allocate it if it doesn't exist. Once we have the
+	 * dquot, unlock it so we can start the next transaction safely. We hold
+	 * a reference to the dquot, so it's safe to do this unlock/lock without
+	 * it being reclaimed in the mean time.
 	 */
-	if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
-		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+	error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
+	if (error) {
 		ASSERT(error != ENOENT);
 		goto out_unlock;
 	}
+	xfs_dqunlock(dqp);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
+	error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp),
+				  0, 0, XFS_DEFAULT_LOG_COUNT);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		goto out_rele;
+	}
+
+	xfs_dqlock(dqp);
 	xfs_trans_dqjoin(tp, dqp);
 	ddq = &dqp->q_core;
 
@@ -528,6 +534,7 @@ xfs_qm_scall_setqlim(
 	if (hard == 0 || hard >= soft) {
 		ddq->d_blk_hardlimit = cpu_to_be64(hard);
 		ddq->d_blk_softlimit = cpu_to_be64(soft);
+		xfs_dquot_set_prealloc_limits(dqp);
 		if (id == 0) {
 			q->qi_bhardlimit = hard;
 			q->qi_bsoftlimit = soft;
@@ -619,9 +626,10 @@ xfs_qm_scall_setqlim(
 	xfs_trans_log_dquot(tp, dqp);
 
 	error = xfs_trans_commit(tp, 0);
-	xfs_qm_dqrele(dqp);
 
- out_unlock:
+out_rele:
+	xfs_qm_dqrele(dqp);
+out_unlock:
 	mutex_unlock(&q->qi_quotaofflock);
 	return error;
 }
@@ -638,8 +646,9 @@ xfs_qm_log_quotaoff_end(
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
 
-	if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2,
-				      0, 0, XFS_DEFAULT_LOG_COUNT))) {
+	error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_END_LOG_RES(mp),
+				  0, 0, XFS_DEFAULT_LOG_COUNT);
+	if (error) {
 		xfs_trans_cancel(tp, 0);
 		return (error);
 	}
@@ -671,14 +680,10 @@ xfs_qm_log_quotaoff(
 	uint			oldsbqflag=0;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
-	if ((error = xfs_trans_reserve(tp, 0,
-				      sizeof(xfs_qoff_logitem_t) * 2 +
-				      mp->m_sb.sb_sectsize + 128,
-				      0,
-				      0,
-				      XFS_DEFAULT_LOG_COUNT))) {
+	error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_LOG_RES(mp),
+				  0, 0, XFS_DEFAULT_LOG_COUNT);
+	if (error)
 		goto error0;
-	}
 
 	qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
 	xfs_trans_log_quotaoff_item(tp, qoffi);
@@ -771,9 +776,12 @@ xfs_qm_scall_getquota(
 	 * gets turned off. No need to confuse the user level code,
 	 * so return zeroes in that case.
 	 */
-	if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) ||
-	    (!XFS_IS_OQUOTA_ENFORCED(mp) &&
-			(dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
+	if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
+	     dqp->q_core.d_flags == XFS_DQ_USER) ||
+	    (!XFS_IS_GQUOTA_ENFORCED(mp) &&
+	     dqp->q_core.d_flags == XFS_DQ_GROUP) ||
+	    (!XFS_IS_PQUOTA_ENFORCED(mp) &&
+	     dqp->q_core.d_flags == XFS_DQ_PROJ)) {
 		dst->d_btimer = 0;
 		dst->d_itimer = 0;
 		dst->d_rtbtimer = 0;
@@ -781,8 +789,8 @@ xfs_qm_scall_getquota(
 
 #ifdef DEBUG
 	if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
-	     (XFS_IS_OQUOTA_ENFORCED(mp) &&
-			(dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
+	     (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) ||
+	     (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) &&
 	    dst->d_id != 0) {
 		if ((dst->d_bcount > dst->d_blk_softlimit) &&
 		    (dst->d_blk_softlimit > 0)) {
@@ -828,16 +836,16 @@ xfs_qm_export_flags(
 	uflags = 0;
 	if (flags & XFS_UQUOTA_ACCT)
 		uflags |= FS_QUOTA_UDQ_ACCT;
-	if (flags & XFS_PQUOTA_ACCT)
-		uflags |= FS_QUOTA_PDQ_ACCT;
 	if (flags & XFS_GQUOTA_ACCT)
 		uflags |= FS_QUOTA_GDQ_ACCT;
+	if (flags & XFS_PQUOTA_ACCT)
+		uflags |= FS_QUOTA_PDQ_ACCT;
 	if (flags & XFS_UQUOTA_ENFD)
 		uflags |= FS_QUOTA_UDQ_ENFD;
-	if (flags & (XFS_OQUOTA_ENFD)) {
-		uflags |= (flags & XFS_GQUOTA_ACCT) ?
-			FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
-	}
+	if (flags & XFS_GQUOTA_ENFD)
+		uflags |= FS_QUOTA_GDQ_ENFD;
+	if (flags & XFS_PQUOTA_ENFD)
+		uflags |= FS_QUOTA_PDQ_ENFD;
 	return (uflags);
 }
 
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index b50ec5b95d5..c3483bab9cd 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -77,9 +77,18 @@ typedef struct	xfs_disk_dquot {
  */
 typedef struct xfs_dqblk {
 	xfs_disk_dquot_t  dd_diskdq;	/* portion that lives incore as well */
-	char		  dd_fill[32];	/* filling for posterity */
+	char		  dd_fill[4];	/* filling for posterity */
+
+	/*
+	 * These two are only present on filesystems with the CRC bits set.
+	 */
+	__be32		  dd_crc;	/* checksum */
+	__be64		  dd_lsn;	/* last modification in log */
+	uuid_t		  dd_uuid;	/* location information */
 } xfs_dqblk_t;
 
+#define XFS_DQUOT_CRC_OFF	offsetof(struct xfs_dqblk, dd_crc)
+
 /*
  * flags for q_flags field in the dquot.
  */
@@ -152,30 +161,42 @@ typedef struct xfs_qoff_logformat {
 #define XFS_GQUOTA_ACCT	0x0040  /* group quota accounting ON */
 
 /*
+ * Conversion to and from the combined OQUOTA flag (if necessary)
+ * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
+ */
+#define XFS_GQUOTA_ENFD	0x0080  /* group quota limits enforced */
+#define XFS_GQUOTA_CHKD	0x0100  /* quotacheck run on group quotas */
+#define XFS_PQUOTA_ENFD	0x0200  /* project quota limits enforced */
+#define XFS_PQUOTA_CHKD	0x0400  /* quotacheck run on project quotas */
+
+/*
  * Quota Accounting/Enforcement flags
  */
 #define XFS_ALL_QUOTA_ACCT	\
 		(XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
-#define XFS_ALL_QUOTA_ENFD	(XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD)
-#define XFS_ALL_QUOTA_CHKD	(XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD)
+#define XFS_ALL_QUOTA_ENFD	\
+		(XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD	\
+		(XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
 
 #define XFS_IS_QUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
 #define XFS_IS_UQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_UQUOTA_ACCT)
 #define XFS_IS_PQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_PQUOTA_ACCT)
 #define XFS_IS_GQUOTA_RUNNING(mp)	((mp)->m_qflags & XFS_GQUOTA_ACCT)
 #define XFS_IS_UQUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_UQUOTA_ENFD)
-#define XFS_IS_OQUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_OQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_GQUOTA_ENFD)
+#define XFS_IS_PQUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_PQUOTA_ENFD)
 
 /*
  * Incore only flags for quotaoff - these bits get cleared when quota(s)
  * are in the process of getting turned off. These flags are in m_qflags but
  * never in sb_qflags.
  */
-#define XFS_UQUOTA_ACTIVE	0x0100  /* uquotas are being turned off */
-#define XFS_PQUOTA_ACTIVE	0x0200  /* pquotas are being turned off */
-#define XFS_GQUOTA_ACTIVE	0x0400  /* gquotas are being turned off */
+#define XFS_UQUOTA_ACTIVE	0x1000  /* uquotas are being turned off */
+#define XFS_GQUOTA_ACTIVE	0x2000  /* gquotas are being turned off */
+#define XFS_PQUOTA_ACTIVE	0x4000  /* pquotas are being turned off */
 #define XFS_ALL_QUOTA_ACTIVE	\
-	(XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
+	(XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
 
 /*
  * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
@@ -259,24 +280,23 @@ typedef struct xfs_qoff_logformat {
 	((XFS_IS_UQUOTA_ON(mp) && \
 		(mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
 	 (XFS_IS_GQUOTA_ON(mp) && \
-		((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
-		 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \
+		(mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \
 	 (XFS_IS_PQUOTA_ON(mp) && \
-		((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \
-		 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT))))
+		(mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
 
 #define XFS_MOUNT_QUOTA_SET1	(XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
-				 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
-				 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
+				 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+				 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD)
 
 #define XFS_MOUNT_QUOTA_SET2	(XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
-				 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
-				 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
+				 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
+				 XFS_PQUOTA_ENFD|XFS_PQUOTA_CHKD)
 
 #define XFS_MOUNT_QUOTA_ALL	(XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
-				 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
-				 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\
-				 XFS_GQUOTA_ACCT)
+				 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+				 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
+				 XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
+				 XFS_PQUOTA_CHKD)
 
 
 /*
@@ -380,5 +400,7 @@ extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
 				xfs_dqid_t, uint, uint, char *);
 extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
 
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+
 #endif	/* __KERNEL__ */
 #endif	/* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 71926d63052..20e30f93b0c 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -75,8 +75,10 @@ xfs_fs_set_xstate(
 		flags |= XFS_GQUOTA_ACCT;
 	if (uflags & FS_QUOTA_UDQ_ENFD)
 		flags |= XFS_UQUOTA_ENFD;
-	if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
-		flags |= XFS_OQUOTA_ENFD;
+	if (uflags & FS_QUOTA_GDQ_ENFD)
+		flags |= XFS_GQUOTA_ENFD;
+	if (uflags & FS_QUOTA_PDQ_ENFD)
+		flags |= XFS_PQUOTA_ENFD;
 
 	switch (op) {
 	case Q_XQUOTAON:
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index a05b45175fb..78f9e70b80c 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -32,6 +32,7 @@ struct xfs_mount;
 #define	XFS_SB_VERSION_2	2		/* 6.2 - attributes */
 #define	XFS_SB_VERSION_3	3		/* 6.2 - new inode version */
 #define	XFS_SB_VERSION_4	4		/* 6.2+ - bitmask version */
+#define	XFS_SB_VERSION_5	5		/* CRC enabled filesystem */
 #define	XFS_SB_VERSION_NUMBITS		0x000f
 #define	XFS_SB_VERSION_ALLFBITS		0xfff0
 #define	XFS_SB_VERSION_SASHFBITS	0xf000
@@ -161,6 +162,20 @@ typedef struct xfs_sb {
 	 */
 	__uint32_t	sb_bad_features2;
 
+	/* version 5 superblock fields start here */
+
+	/* feature masks */
+	__uint32_t	sb_features_compat;
+	__uint32_t	sb_features_ro_compat;
+	__uint32_t	sb_features_incompat;
+	__uint32_t	sb_features_log_incompat;
+
+	__uint32_t	sb_crc;		/* superblock crc */
+	__uint32_t	sb_pad;
+
+	xfs_ino_t	sb_pquotino;	/* project quota inode */
+	xfs_lsn_t	sb_lsn;		/* last write sequence */
+
 	/* must be padded to 64 bit alignment */
 } xfs_sb_t;
 
@@ -229,7 +244,21 @@ typedef struct xfs_dsb {
 	 * for features2 bits. Easiest just to mark it bad and not use
 	 * it for anything else.
 	 */
-	__be32	sb_bad_features2;
+	__be32		sb_bad_features2;
+
+	/* version 5 superblock fields start here */
+
+	/* feature masks */
+	__be32		sb_features_compat;
+	__be32		sb_features_ro_compat;
+	__be32		sb_features_incompat;
+	__be32		sb_features_log_incompat;
+
+	__le32		sb_crc;		/* superblock crc */
+	__be32		sb_pad;
+
+	__be64		sb_pquotino;	/* project quota inode */
+	__be64		sb_lsn;		/* last write sequence */
 
 	/* must be padded to 64 bit alignment */
 } xfs_dsb_t;
@@ -250,7 +279,10 @@ typedef enum {
 	XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
 	XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
 	XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
-	XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2,
+	XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
+	XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
+	XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
+	XFS_SBS_PQUOTINO, XFS_SBS_LSN,
 	XFS_SBS_FIELDCOUNT
 } xfs_sb_field_t;
 
@@ -276,6 +308,12 @@ typedef enum {
 #define XFS_SB_FDBLOCKS		XFS_SB_MVAL(FDBLOCKS)
 #define XFS_SB_FEATURES2	XFS_SB_MVAL(FEATURES2)
 #define XFS_SB_BAD_FEATURES2	XFS_SB_MVAL(BAD_FEATURES2)
+#define XFS_SB_FEATURES_COMPAT	XFS_SB_MVAL(FEATURES_COMPAT)
+#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
+#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
+#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
+#define XFS_SB_CRC		XFS_SB_MVAL(CRC)
+#define XFS_SB_PQUOTINO		XFS_SB_MVAL(PQUOTINO)
 #define	XFS_SB_NUM_BITS		((int)XFS_SBS_FIELDCOUNT)
 #define	XFS_SB_ALL_BITS		((1LL << XFS_SB_NUM_BITS) - 1)
 #define	XFS_SB_MOD_BITS		\
@@ -283,7 +321,9 @@ typedef enum {
 	 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
 	 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
 	 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
-	 XFS_SB_BAD_FEATURES2)
+	 XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
+	 XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
+	 XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
 
 
 /*
@@ -325,6 +365,8 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp)
 
 		return 1;
 	}
+	if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+		return 1;
 
 	return 0;
 }
@@ -365,7 +407,7 @@ static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp)
 {
 	return sbp->sb_versionnum == XFS_SB_VERSION_2 ||
 		sbp->sb_versionnum == XFS_SB_VERSION_3 ||
-		(XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+		(XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
 		 (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
 }
 
@@ -373,7 +415,7 @@ static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
 {
 	if (sbp->sb_versionnum == XFS_SB_VERSION_1)
 		sbp->sb_versionnum = XFS_SB_VERSION_2;
-	else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+	else if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4)
 		sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
 	else
 		sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
@@ -382,7 +424,7 @@ static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
 static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp)
 {
 	return sbp->sb_versionnum == XFS_SB_VERSION_3 ||
-		 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+		 (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
 		  (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
 }
 
@@ -396,13 +438,13 @@ static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp)
 
 static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp)
 {
-	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+	return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
 		(sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
 }
 
 static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
 {
-	if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+	if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4)
 		sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
 	else
 		sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) |
@@ -411,13 +453,14 @@ static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
 
 static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp)
 {
-	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
-		(sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT);
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+	       (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
+		(sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
 }
 
 static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp)
 {
-	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+	return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
 		(sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
 }
 
@@ -429,38 +472,42 @@ static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp)
 
 static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
 {
-	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
-		(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+	       (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+		(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT));
 }
 
 static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp)
 {
-	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
-		(sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+	       (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
+		(sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT));
 }
 
 static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
 {
-	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
-		(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+	       (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+		(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT));
 }
 
 static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
 {
-	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+	return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
 		(sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
 }
 
 static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp)
 {
-	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+	return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
 		(sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
 }
 
 static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
 {
-	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
-		(sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+	       (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+		(sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT));
 }
 
 /*
@@ -475,14 +522,16 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
 
 static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp)
 {
-	return xfs_sb_version_hasmorebits(sbp) &&
-		(sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT);
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+	       (xfs_sb_version_hasmorebits(sbp) &&
+		(sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
 }
 
 static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp)
 {
-	return xfs_sb_version_hasmorebits(sbp) &&
-		(sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT);
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+	       (xfs_sb_version_hasmorebits(sbp) &&
+		(sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
 }
 
 static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
@@ -500,14 +549,79 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
 
 static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
 {
-	return xfs_sb_version_hasmorebits(sbp) &&
-		(sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
+	       (xfs_sb_version_hasmorebits(sbp) &&
+		(sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
 }
 
 static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
 {
-	return (xfs_sb_version_hasmorebits(sbp) &&
-		(sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT));
+	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+
+/*
+ * Extended v5 superblock feature masks. These are to be used for new v5
+ * superblock features only.
+ *
+ * Compat features are new features that old kernels will not notice or affect
+ * and so can mount read-write without issues.
+ *
+ * RO-Compat (read only) are features that old kernels can read but will break
+ * if they write. Hence only read-only mounts of such filesystems are allowed on
+ * kernels that don't support the feature bit.
+ *
+ * InCompat features are features which old kernels will not understand and so
+ * must not mount.
+ *
+ * Log-InCompat features are for changes to log formats or new transactions that
+ * can't be replayed on older kernels. The fields are set when the filesystem is
+ * mounted, and a clean unmount clears the fields.
+ */
+#define XFS_SB_FEAT_COMPAT_ALL 0
+#define XFS_SB_FEAT_COMPAT_UNKNOWN	~XFS_SB_FEAT_COMPAT_ALL
+static inline bool
+xfs_sb_has_compat_feature(
+	struct xfs_sb	*sbp,
+	__uint32_t	feature)
+{
+	return (sbp->sb_features_compat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_RO_COMPAT_ALL 0
+#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN	~XFS_SB_FEAT_RO_COMPAT_ALL
+static inline bool
+xfs_sb_has_ro_compat_feature(
+	struct xfs_sb	*sbp,
+	__uint32_t	feature)
+{
+	return (sbp->sb_features_ro_compat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_INCOMPAT_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_UNKNOWN	~XFS_SB_FEAT_INCOMPAT_ALL
+static inline bool
+xfs_sb_has_incompat_feature(
+	struct xfs_sb	*sbp,
+	__uint32_t	feature)
+{
+	return (sbp->sb_features_incompat & feature) != 0;
+}
+
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN	~XFS_SB_FEAT_INCOMPAT_LOG_ALL
+static inline bool
+xfs_sb_has_incompat_log_feature(
+	struct xfs_sb	*sbp,
+	__uint32_t	feature)
+{
+	return (sbp->sb_features_log_incompat & feature) != 0;
+}
+
+static inline bool
+xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
+{
+	return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino);
 }
 
 /*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ab8839b2627..1d68ffcdeaa 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -51,6 +51,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_icache.h"
 #include "xfs_trace.h"
+#include "xfs_icreate_item.h"
 
 #include <linux/namei.h>
 #include <linux/init.h>
@@ -139,9 +140,9 @@ static const match_table_t tokens = {
 
 
 STATIC unsigned long
-suffix_strtoul(char *s, char **endp, unsigned int base)
+suffix_kstrtoint(char *s, unsigned int base, int *res)
 {
-	int	last, shift_left_factor = 0;
+	int	last, shift_left_factor = 0, _res;
 	char	*value = s;
 
 	last = strlen(value) - 1;
@@ -158,7 +159,10 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
 		value[last] = '\0';
 	}
 
-	return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
+	if (kstrtoint(s, base, &_res))
+		return -EINVAL;
+	*res = _res << shift_left_factor;
+	return 0;
 }
 
 /*
@@ -174,7 +178,7 @@ xfs_parseargs(
 	char			*options)
 {
 	struct super_block	*sb = mp->m_super;
-	char			*this_char, *value, *eov;
+	char			*this_char, *value;
 	int			dsunit = 0;
 	int			dswidth = 0;
 	int			iosize = 0;
@@ -230,14 +234,16 @@ xfs_parseargs(
 					this_char);
 				return EINVAL;
 			}
-			mp->m_logbufs = simple_strtoul(value, &eov, 10);
+			if (kstrtoint(value, 10, &mp->m_logbufs))
+				return EINVAL;
 		} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
 			if (!value || !*value) {
 				xfs_warn(mp, "%s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
-			mp->m_logbsize = suffix_strtoul(value, &eov, 10);
+			if (suffix_kstrtoint(value, 10, &mp->m_logbsize))
+				return EINVAL;
 		} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
 			if (!value || !*value) {
 				xfs_warn(mp, "%s option requires an argument",
@@ -266,7 +272,8 @@ xfs_parseargs(
 					this_char);
 				return EINVAL;
 			}
-			iosize = simple_strtoul(value, &eov, 10);
+			if (kstrtoint(value, 10, &iosize))
+				return EINVAL;
 			iosizelog = ffs(iosize) - 1;
 		} else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
 			if (!value || !*value) {
@@ -274,7 +281,8 @@ xfs_parseargs(
 					this_char);
 				return EINVAL;
 			}
-			iosize = suffix_strtoul(value, &eov, 10);
+			if (suffix_kstrtoint(value, 10, &iosize))
+				return EINVAL;
 			iosizelog = ffs(iosize) - 1;
 		} else if (!strcmp(this_char, MNTOPT_GRPID) ||
 			   !strcmp(this_char, MNTOPT_BSDGROUPS)) {
@@ -296,14 +304,16 @@ xfs_parseargs(
 					this_char);
 				return EINVAL;
 			}
-			dsunit = simple_strtoul(value, &eov, 10);
+			if (kstrtoint(value, 10, &dsunit))
+				return EINVAL;
 		} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
 			if (!value || !*value) {
 				xfs_warn(mp, "%s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
-			dswidth = simple_strtoul(value, &eov, 10);
+			if (kstrtoint(value, 10, &dswidth))
+				return EINVAL;
 		} else if (!strcmp(this_char, MNTOPT_32BITINODE)) {
 			mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
 		} else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
@@ -350,17 +360,17 @@ xfs_parseargs(
 		} else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
 			   !strcmp(this_char, MNTOPT_PRJQUOTA)) {
 			mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
-					 XFS_OQUOTA_ENFD);
+					 XFS_PQUOTA_ENFD);
 		} else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
 			mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
-			mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+			mp->m_qflags &= ~XFS_PQUOTA_ENFD;
 		} else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
 			   !strcmp(this_char, MNTOPT_GRPQUOTA)) {
 			mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
-					 XFS_OQUOTA_ENFD);
+					 XFS_GQUOTA_ENFD);
 		} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
 			mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
-			mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+			mp->m_qflags &= ~XFS_GQUOTA_ENFD;
 		} else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
 			xfs_warn(mp,
 	"delaylog is the default now, option is deprecated.");
@@ -430,20 +440,15 @@ xfs_parseargs(
 	}
 
 done:
-	if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
+	if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) {
 		/*
 		 * At this point the superblock has not been read
 		 * in, therefore we do not know the block size.
 		 * Before the mount call ends we will convert
 		 * these to FSBs.
 		 */
-		if (dsunit) {
-			mp->m_dalign = dsunit;
-			mp->m_flags |= XFS_MOUNT_RETERR;
-		}
-
-		if (dswidth)
-			mp->m_swidth = dswidth;
+		mp->m_dalign = dsunit;
+		mp->m_swidth = dswidth;
 	}
 
 	if (mp->m_logbufs != -1 &&
@@ -554,12 +559,12 @@ xfs_showargs(
 	/* Either project or group quotas can be active, not both */
 
 	if (mp->m_qflags & XFS_PQUOTA_ACCT) {
-		if (mp->m_qflags & XFS_OQUOTA_ENFD)
+		if (mp->m_qflags & XFS_PQUOTA_ENFD)
 			seq_puts(m, "," MNTOPT_PRJQUOTA);
 		else
 			seq_puts(m, "," MNTOPT_PQUOTANOENF);
 	} else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
-		if (mp->m_qflags & XFS_OQUOTA_ENFD)
+		if (mp->m_qflags & XFS_GQUOTA_ENFD)
 			seq_puts(m, "," MNTOPT_GRPQUOTA);
 		else
 			seq_puts(m, "," MNTOPT_GQUOTANOENF);
@@ -1127,8 +1132,8 @@ xfs_fs_statfs(
 	spin_unlock(&mp->m_sb_lock);
 
 	if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-	    ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
-			      (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
+	    ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
+			      (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
 		xfs_qm_statvfs(ip, statp);
 	return 0;
 }
@@ -1364,6 +1369,17 @@ xfs_finish_flags(
 	}
 
 	/*
+	 * V5 filesystems always use attr2 format for attributes.
+	 */
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    (mp->m_flags & XFS_MOUNT_NOATTR2)) {
+		xfs_warn(mp,
+"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
+			MNTOPT_NOATTR2, MNTOPT_ATTR2);
+		return XFS_ERROR(EINVAL);
+	}
+
+	/*
 	 * mkfs'ed attr2 will turn on attr2 mount unless explicitly
 	 * told by noattr2 to turn it off
 	 */
@@ -1461,6 +1477,10 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
+	/* version 5 superblocks support inode version counters. */
+	if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
+		sb->s_flags |= MS_I_VERSION;
+
 	error = xfs_mountfs(mp);
 	if (error)
 		goto out_filestream_unmount;
@@ -1552,6 +1572,7 @@ static struct file_system_type xfs_fs_type = {
 	.kill_sb		= kill_block_super,
 	.fs_flags		= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS_FS("xfs");
 
 STATIC int __init
 xfs_init_zones(void)
@@ -1634,9 +1655,15 @@ xfs_init_zones(void)
 					KM_ZONE_SPREAD, NULL);
 	if (!xfs_ili_zone)
 		goto out_destroy_inode_zone;
+	xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
+					"xfs_icr");
+	if (!xfs_icreate_zone)
+		goto out_destroy_ili_zone;
 
 	return 0;
 
+ out_destroy_ili_zone:
+	kmem_zone_destroy(xfs_ili_zone);
  out_destroy_inode_zone:
 	kmem_zone_destroy(xfs_inode_zone);
  out_destroy_efi_zone:
@@ -1675,6 +1702,7 @@ xfs_destroy_zones(void)
 	 * destroy caches.
 	 */
 	rcu_barrier();
+	kmem_zone_destroy(xfs_icreate_zone);
 	kmem_zone_destroy(xfs_ili_zone);
 	kmem_zone_destroy(xfs_inode_zone);
 	kmem_zone_destroy(xfs_efi_zone);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
new file mode 100644
index 00000000000..e830fb56e27
--- /dev/null
+++ b/fs/xfs/xfs_symlink.c
@@ -0,0 +1,767 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012-2013 Red Hat, Inc.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_itable.h"
+#include "xfs_ialloc.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_utils.h"
+#include "xfs_trans_space.h"
+#include "xfs_log_priv.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_cksum.h"
+#include "xfs_buf_item.h"
+
+
+/*
+ * Each contiguous block has a header, so it is not just a simple pathlen
+ * to FSB conversion.
+ */
+int
+xfs_symlink_blocks(
+	struct xfs_mount *mp,
+	int		pathlen)
+{
+	int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
+
+	return (pathlen + buflen - 1) / buflen;
+}
+
+static int
+xfs_symlink_hdr_set(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dsymlink_hdr	*dsl = bp->b_addr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return 0;
+
+	dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
+	dsl->sl_offset = cpu_to_be32(offset);
+	dsl->sl_bytes = cpu_to_be32(size);
+	uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
+	dsl->sl_owner = cpu_to_be64(ino);
+	dsl->sl_blkno = cpu_to_be64(bp->b_bn);
+	bp->b_ops = &xfs_symlink_buf_ops;
+
+	return sizeof(struct xfs_dsymlink_hdr);
+}
+
+/*
+ * Checking of the symlink header is split into two parts. the verifier does
+ * CRC, location and bounds checking, the unpacking function checks the path
+ * parameters and owner.
+ */
+bool
+xfs_symlink_hdr_ok(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino,
+	uint32_t		offset,
+	uint32_t		size,
+	struct xfs_buf		*bp)
+{
+	struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+
+	if (offset != be32_to_cpu(dsl->sl_offset))
+		return false;
+	if (size != be32_to_cpu(dsl->sl_bytes))
+		return false;
+	if (ino != be64_to_cpu(dsl->sl_owner))
+		return false;
+
+	/* ok */
+	return true;
+}
+
+static bool
+xfs_symlink_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dsymlink_hdr	*dsl = bp->b_addr;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return false;
+	if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+		return false;
+	if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
+		return false;
+	if (be32_to_cpu(dsl->sl_offset) +
+				be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
+		return false;
+	if (dsl->sl_owner == 0)
+		return false;
+
+	return true;
+}
+
+static void
+xfs_symlink_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+				  offsetof(struct xfs_dsymlink_hdr, sl_crc)) ||
+	    !xfs_symlink_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+}
+
+static void
+xfs_symlink_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	/* no verification of non-crc buffers */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_symlink_verify(bp)) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		return;
+	}
+
+	if (bip) {
+		struct xfs_dsymlink_hdr *dsl = bp->b_addr;
+		dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+	}
+	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+			 offsetof(struct xfs_dsymlink_hdr, sl_crc));
+}
+
+const struct xfs_buf_ops xfs_symlink_buf_ops = {
+	.verify_read = xfs_symlink_read_verify,
+	.verify_write = xfs_symlink_write_verify,
+};
+
+void
+xfs_symlink_local_to_remote(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	struct xfs_inode	*ip,
+	struct xfs_ifork	*ifp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	char			*buf;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+		bp->b_ops = NULL;
+		memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+		return;
+	}
+
+	/*
+	 * As this symlink fits in an inode literal area, it must also fit in
+	 * the smallest buffer the filesystem supports.
+	 */
+	ASSERT(BBTOB(bp->b_length) >=
+			ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
+
+	bp->b_ops = &xfs_symlink_buf_ops;
+
+	buf = bp->b_addr;
+	buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
+	memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+}
+
+/* ----- Kernel only functions below ----- */
+STATIC int
+xfs_readlink_bmap(
+	struct xfs_inode	*ip,
+	char			*link)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	mval[XFS_SYMLINK_MAPS];
+	struct xfs_buf		*bp;
+	xfs_daddr_t		d;
+	char			*cur_chunk;
+	int			pathlen = ip->i_d.di_size;
+	int			nmaps = XFS_SYMLINK_MAPS;
+	int			byte_cnt;
+	int			n;
+	int			error = 0;
+	int			fsblocks = 0;
+	int			offset;
+
+	fsblocks = xfs_symlink_blocks(mp, pathlen);
+	error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
+	if (error)
+		goto out;
+
+	offset = 0;
+	for (n = 0; n < nmaps; n++) {
+		d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+		byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+
+		bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
+				  &xfs_symlink_buf_ops);
+		if (!bp)
+			return XFS_ERROR(ENOMEM);
+		error = bp->b_error;
+		if (error) {
+			xfs_buf_ioerror_alert(bp, __func__);
+			xfs_buf_relse(bp);
+			goto out;
+		}
+		byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+		if (pathlen < byte_cnt)
+			byte_cnt = pathlen;
+
+		cur_chunk = bp->b_addr;
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			if (!xfs_symlink_hdr_ok(mp, ip->i_ino, offset,
+							byte_cnt, bp)) {
+				error = EFSCORRUPTED;
+				xfs_alert(mp,
+"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
+					offset, byte_cnt, ip->i_ino);
+				xfs_buf_relse(bp);
+				goto out;
+
+			}
+
+			cur_chunk += sizeof(struct xfs_dsymlink_hdr);
+		}
+
+		memcpy(link + offset, bp->b_addr, byte_cnt);
+
+		pathlen -= byte_cnt;
+		offset += byte_cnt;
+
+		xfs_buf_relse(bp);
+	}
+	ASSERT(pathlen == 0);
+
+	link[ip->i_d.di_size] = '\0';
+	error = 0;
+
+ out:
+	return error;
+}
+
+int
+xfs_readlink(
+	struct xfs_inode *ip,
+	char		*link)
+{
+	struct xfs_mount *mp = ip->i_mount;
+	xfs_fsize_t	pathlen;
+	int		error = 0;
+
+	trace_xfs_readlink(ip);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+	pathlen = ip->i_d.di_size;
+	if (!pathlen)
+		goto out;
+
+	if (pathlen < 0 || pathlen > MAXPATHLEN) {
+		xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
+			 __func__, (unsigned long long) ip->i_ino,
+			 (long long) pathlen);
+		ASSERT(0);
+		error = XFS_ERROR(EFSCORRUPTED);
+		goto out;
+	}
+
+
+	if (ip->i_df.if_flags & XFS_IFINLINE) {
+		memcpy(link, ip->i_df.if_u1.if_data, pathlen);
+		link[pathlen] = '\0';
+	} else {
+		error = xfs_readlink_bmap(ip, link);
+	}
+
+ out:
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return error;
+}
+
+int
+xfs_symlink(
+	struct xfs_inode	*dp,
+	struct xfs_name		*link_name,
+	const char		*target_path,
+	umode_t			mode,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_trans	*tp = NULL;
+	struct xfs_inode	*ip = NULL;
+	int			error = 0;
+	int			pathlen;
+	struct xfs_bmap_free	free_list;
+	xfs_fsblock_t		first_block;
+	bool			unlock_dp_on_error = false;
+	uint			cancel_flags;
+	int			committed;
+	xfs_fileoff_t		first_fsb;
+	xfs_filblks_t		fs_blocks;
+	int			nmaps;
+	struct xfs_bmbt_irec	mval[XFS_SYMLINK_MAPS];
+	xfs_daddr_t		d;
+	const char		*cur_chunk;
+	int			byte_cnt;
+	int			n;
+	xfs_buf_t		*bp;
+	prid_t			prid;
+	struct xfs_dquot	*udqp = NULL;
+	struct xfs_dquot	*gdqp = NULL;
+	uint			resblks;
+
+	*ipp = NULL;
+
+	trace_xfs_symlink(dp, link_name);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	/*
+	 * Check component lengths of the target path name.
+	 */
+	pathlen = strlen(target_path);
+	if (pathlen >= MAXPATHLEN)      /* total string too long */
+		return XFS_ERROR(ENAMETOOLONG);
+
+	udqp = gdqp = NULL;
+	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+		prid = xfs_get_projid(dp);
+	else
+		prid = XFS_PROJID_DEFAULT;
+
+	/*
+	 * Make sure that we have allocated dquot(s) on disk.
+	 */
+	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
+			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
+	if (error)
+		goto std_return;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	/*
+	 * The symlink will fit into the inode data fork?
+	 * There can't be any attributes so we get the whole variable part.
+	 */
+	if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version))
+		fs_blocks = 0;
+	else
+		fs_blocks = xfs_symlink_blocks(mp, pathlen);
+	resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
+	error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+	if (error == ENOSPC && fs_blocks == 0) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+	}
+	if (error) {
+		cancel_flags = 0;
+		goto error_return;
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+	unlock_dp_on_error = true;
+
+	/*
+	 * Check whether the directory allows new symlinks or not.
+	 */
+	if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
+		error = XFS_ERROR(EPERM);
+		goto error_return;
+	}
+
+	/*
+	 * Reserve disk quota : blocks and inode.
+	 */
+	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
+	if (error)
+		goto error_return;
+
+	/*
+	 * Check for ability to enter directory entry, if no space reserved.
+	 */
+	error = xfs_dir_canenter(tp, dp, link_name, resblks);
+	if (error)
+		goto error_return;
+	/*
+	 * Initialize the bmap freelist prior to calling either
+	 * bmapi or the directory create code.
+	 */
+	xfs_bmap_init(&free_list, &first_block);
+
+	/*
+	 * Allocate an inode for the symlink.
+	 */
+	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
+			       prid, resblks > 0, &ip, NULL);
+	if (error) {
+		if (error == ENOSPC)
+			goto error_return;
+		goto error1;
+	}
+
+	/*
+	 * An error after we've joined dp to the transaction will result in the
+	 * transaction cancel unlocking dp so don't do it explicitly in the
+	 * error path.
+	 */
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	unlock_dp_on_error = false;
+
+	/*
+	 * Also attach the dquot(s) to it, if applicable.
+	 */
+	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
+
+	if (resblks)
+		resblks -= XFS_IALLOC_SPACE_RES(mp);
+	/*
+	 * If the symlink will fit into the inode, write it inline.
+	 */
+	if (pathlen <= XFS_IFORK_DSIZE(ip)) {
+		xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
+		memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
+		ip->i_d.di_size = pathlen;
+
+		/*
+		 * The inode was initially created in extent format.
+		 */
+		ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
+		ip->i_df.if_flags |= XFS_IFINLINE;
+
+		ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+
+	} else {
+		int	offset;
+
+		first_fsb = 0;
+		nmaps = XFS_SYMLINK_MAPS;
+
+		error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
+				  XFS_BMAPI_METADATA, &first_block, resblks,
+				  mval, &nmaps, &free_list);
+		if (error)
+			goto error2;
+
+		if (resblks)
+			resblks -= fs_blocks;
+		ip->i_d.di_size = pathlen;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+		cur_chunk = target_path;
+		offset = 0;
+		for (n = 0; n < nmaps; n++) {
+			char	*buf;
+
+			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+					       BTOBB(byte_cnt), 0);
+			if (!bp) {
+				error = ENOMEM;
+				goto error2;
+			}
+			bp->b_ops = &xfs_symlink_buf_ops;
+
+			byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+			byte_cnt = min(byte_cnt, pathlen);
+
+			buf = bp->b_addr;
+			buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset,
+						   byte_cnt, bp);
+
+			memcpy(buf, cur_chunk, byte_cnt);
+
+			cur_chunk += byte_cnt;
+			pathlen -= byte_cnt;
+			offset += byte_cnt;
+
+			xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
+							(char *)bp->b_addr);
+		}
+		ASSERT(pathlen == 0);
+	}
+
+	/*
+	 * Create the directory entry for the symlink.
+	 */
+	error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
+					&first_block, &free_list, resblks);
+	if (error)
+		goto error2;
+	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * symlink transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+		xfs_trans_set_sync(tp);
+	}
+
+	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	if (error) {
+		goto error2;
+	}
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+
+	*ipp = ip;
+	return 0;
+
+ error2:
+	IRELE(ip);
+ error1:
+	xfs_bmap_cancel(&free_list);
+	cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+	xfs_trans_cancel(tp, cancel_flags);
+	xfs_qm_dqrele(udqp);
+	xfs_qm_dqrele(gdqp);
+
+	if (unlock_dp_on_error)
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ std_return:
+	return error;
+}
+
+/*
+ * Free a symlink that has blocks associated with it.
+ */
+STATIC int
+xfs_inactive_symlink_rmt(
+	xfs_inode_t	*ip,
+	xfs_trans_t	**tpp)
+{
+	xfs_buf_t	*bp;
+	int		committed;
+	int		done;
+	int		error;
+	xfs_fsblock_t	first_block;
+	xfs_bmap_free_t	free_list;
+	int		i;
+	xfs_mount_t	*mp;
+	xfs_bmbt_irec_t	mval[XFS_SYMLINK_MAPS];
+	int		nmaps;
+	xfs_trans_t	*ntp;
+	int		size;
+	xfs_trans_t	*tp;
+
+	tp = *tpp;
+	mp = ip->i_mount;
+	ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS);
+	/*
+	 * We're freeing a symlink that has some
+	 * blocks allocated to it.  Free the
+	 * blocks here.  We know that we've got
+	 * either 1 or 2 extents and that we can
+	 * free them all in one bunmapi call.
+	 */
+	ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
+
+	/*
+	 * Lock the inode, fix the size, and join it to the transaction.
+	 * Hold it so in the normal path, we still have it locked for
+	 * the second transaction.  In the error paths we need it
+	 * held so the cancel won't rele it, see below.
+	 */
+	size = (int)ip->i_d.di_size;
+	ip->i_d.di_size = 0;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	/*
+	 * Find the block(s) so we can inval and unmap them.
+	 */
+	done = 0;
+	xfs_bmap_init(&free_list, &first_block);
+	nmaps = ARRAY_SIZE(mval);
+	error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size),
+				mval, &nmaps, 0);
+	if (error)
+		goto error0;
+	/*
+	 * Invalidate the block(s). No validation is done.
+	 */
+	for (i = 0; i < nmaps; i++) {
+		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+			XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
+			XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
+		if (!bp) {
+			error = ENOMEM;
+			goto error1;
+		}
+		xfs_trans_binval(tp, bp);
+	}
+	/*
+	 * Unmap the dead block(s) to the free_list.
+	 */
+	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
+			&first_block, &free_list, &done)))
+		goto error1;
+	ASSERT(done);
+	/*
+	 * Commit the first transaction.  This logs the EFI and the inode.
+	 */
+	if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
+		goto error1;
+	/*
+	 * The transaction must have been committed, since there were
+	 * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
+	 * The new tp has the extent freeing and EFDs.
+	 */
+	ASSERT(committed);
+	/*
+	 * The first xact was committed, so add the inode to the new one.
+	 * Mark it dirty so it will be logged and moved forward in the log as
+	 * part of every commit.
+	 */
+	xfs_trans_ijoin(tp, ip, 0);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	/*
+	 * Get a new, empty transaction to return to our caller.
+	 */
+	ntp = xfs_trans_dup(tp);
+	/*
+	 * Commit the transaction containing extent freeing and EFDs.
+	 * If we get an error on the commit here or on the reserve below,
+	 * we need to unlock the inode since the new transaction doesn't
+	 * have the inode attached.
+	 */
+	error = xfs_trans_commit(tp, 0);
+	tp = ntp;
+	if (error) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		goto error0;
+	}
+	/*
+	 * transaction commit worked ok so we can drop the extra ticket
+	 * reference that we gained in xfs_trans_dup()
+	 */
+	xfs_log_ticket_put(tp->t_ticket);
+
+	/*
+	 * Remove the memory for extent descriptions (just bookkeeping).
+	 */
+	if (ip->i_df.if_bytes)
+		xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
+	ASSERT(ip->i_df.if_bytes == 0);
+	/*
+	 * Put an itruncate log reservation in the new transaction
+	 * for our caller.
+	 */
+	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		goto error0;
+	}
+
+	xfs_trans_ijoin(tp, ip, 0);
+	*tpp = tp;
+	return 0;
+
+ error1:
+	xfs_bmap_cancel(&free_list);
+ error0:
+	return error;
+}
+
+/*
+ * xfs_inactive_symlink - free a symlink
+ */
+int
+xfs_inactive_symlink(
+	struct xfs_inode	*ip,
+	struct xfs_trans	**tp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	int			pathlen;
+
+	trace_xfs_inactive_symlink(ip);
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	/*
+	 * Zero length symlinks _can_ exist.
+	 */
+	pathlen = (int)ip->i_d.di_size;
+	if (!pathlen)
+		return 0;
+
+	if (pathlen < 0 || pathlen > MAXPATHLEN) {
+		xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
+			 __func__, (unsigned long long)ip->i_ino, pathlen);
+		ASSERT(0);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	if (ip->i_df.if_flags & XFS_IFINLINE) {
+		if (ip->i_df.if_bytes > 0)
+			xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
+					  XFS_DATA_FORK);
+		ASSERT(ip->i_df.if_bytes == 0);
+		return 0;
+	}
+
+	/* remove the remote symlink */
+	return xfs_inactive_symlink_rmt(ip, tp);
+}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
new file mode 100644
index 00000000000..374394880c0
--- /dev/null
+++ b/fs/xfs/xfs_symlink.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2012 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SYMLINK_H
+#define __XFS_SYMLINK_H 1
+
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+struct xfs_buf;
+struct xfs_ifork;
+struct xfs_name;
+
+#define XFS_SYMLINK_MAGIC	0x58534c4d	/* XSLM */
+
+struct xfs_dsymlink_hdr {
+	__be32	sl_magic;
+	__be32	sl_offset;
+	__be32	sl_bytes;
+	__be32	sl_crc;
+	uuid_t	sl_uuid;
+	__be64	sl_owner;
+	__be64	sl_blkno;
+	__be64	sl_lsn;
+};
+
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 3 extents back from
+ * bmapi when crc headers are taken into account.
+ */
+#define XFS_SYMLINK_MAPS 3
+
+#define XFS_SYMLINK_BUF_SPACE(mp, bufsize)	\
+	((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+			sizeof(struct xfs_dsymlink_hdr) : 0))
+
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+				 struct xfs_inode *ip, struct xfs_ifork *ifp);
+
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+
+#ifdef __KERNEL__
+
+int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
+		const char *target_path, umode_t mode, struct xfs_inode **ipp);
+int xfs_readlink(struct xfs_inode *ip, char *link);
+int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
+
+#endif /* __KERNEL__ */
+#endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 2801b5ce6cd..1743b9f8e23 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -25,11 +25,11 @@ static struct ctl_table_header *xfs_table_header;
 #ifdef CONFIG_PROC_FS
 STATIC int
 xfs_stats_clear_proc_handler(
-	ctl_table	*ctl,
-	int		write,
-	void		__user *buffer,
-	size_t		*lenp,
-	loff_t		*ppos)
+	struct ctl_table	*ctl,
+	int			write,
+	void			__user *buffer,
+	size_t			*lenp,
+	loff_t			*ppos)
 {
 	int		c, ret, *valp = ctl->data;
 	__uint32_t	vn_active;
@@ -55,11 +55,11 @@ xfs_stats_clear_proc_handler(
 
 STATIC int
 xfs_panic_mask_proc_handler(
-	ctl_table	*ctl,
-	int		write,
-	void		__user *buffer,
-	size_t		*lenp,
-	loff_t		*ppos)
+	struct ctl_table	*ctl,
+	int			write,
+	void			__user *buffer,
+	size_t			*lenp,
+	loff_t			*ppos)
 {
 	int		ret, *valp = ctl->data;
 
@@ -74,7 +74,7 @@ xfs_panic_mask_proc_handler(
 }
 #endif /* CONFIG_PROC_FS */
 
-static ctl_table xfs_table[] = {
+static struct ctl_table xfs_table[] = {
 	{
 		.procname	= "irix_sgid_inherit",
 		.data		= &xfs_params.sgid_inherit.val,
@@ -227,7 +227,7 @@ static ctl_table xfs_table[] = {
 	{}
 };
 
-static ctl_table xfs_dir_table[] = {
+static struct ctl_table xfs_dir_table[] = {
 	{
 		.procname	= "xfs",
 		.mode		= 0555,
@@ -236,7 +236,7 @@ static ctl_table xfs_dir_table[] = {
 	{}
 };
 
-static ctl_table xfs_root_table[] = {
+static struct ctl_table xfs_root_table[] = {
 	{
 		.procname	= "fs",
 		.mode		= 0555,
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 624bedd8135..b6e3897c1d9 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -22,7 +22,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
@@ -30,6 +29,7 @@
 #include "xfs_inode.h"
 #include "xfs_btree.h"
 #include "xfs_mount.h"
+#include "xfs_da_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_alloc.h"
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 16a812977ea..47910e638c1 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -486,9 +486,12 @@ DEFINE_EVENT(xfs_buf_item_class, name, \
 	TP_PROTO(struct xfs_buf_log_item *bip), \
 	TP_ARGS(bip))
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
@@ -508,6 +511,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
 
 DECLARE_EVENT_CLASS(xfs_lock_class,
 	TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
@@ -571,6 +575,7 @@ DEFINE_INODE_EVENT(xfs_iget_miss);
 DEFINE_INODE_EVENT(xfs_getattr);
 DEFINE_INODE_EVENT(xfs_setattr);
 DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_inactive_symlink);
 DEFINE_INODE_EVENT(xfs_alloc_file_space);
 DEFINE_INODE_EVENT(xfs_free_file_space);
 DEFINE_INODE_EVENT(xfs_readdir);
@@ -619,6 +624,30 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
 		  (char *)__entry->caller_ip)
 )
 
+TRACE_EVENT(xfs_iomap_prealloc_size,
+	TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t blocks, int shift,
+		 unsigned int writeio_blocks),
+	TP_ARGS(ip, blocks, shift, writeio_blocks),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsblock_t, blocks)
+		__field(int, shift)
+		__field(unsigned int, writeio_blocks)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->blocks = blocks;
+		__entry->shift = shift;
+		__entry->writeio_blocks = writeio_blocks;
+	),
+	TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d "
+		  "m_writeio_blocks %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino,
+		  __entry->blocks, __entry->shift, __entry->writeio_blocks)
+)
+
 #define DEFINE_IREF_EVENT(name) \
 DEFINE_EVENT(xfs_iref_class, name, \
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
@@ -950,14 +979,16 @@ DEFINE_RW_EVENT(xfs_file_splice_read);
 DEFINE_RW_EVENT(xfs_file_splice_write);
 
 DECLARE_EVENT_CLASS(xfs_page_class,
-	TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
-	TP_ARGS(inode, page, off),
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
+		 unsigned int len),
+	TP_ARGS(inode, page, off, len),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
 		__field(pgoff_t, pgoff)
 		__field(loff_t, size)
 		__field(unsigned long, offset)
+		__field(unsigned int, length)
 		__field(int, delalloc)
 		__field(int, unwritten)
 	),
@@ -971,24 +1002,27 @@ DECLARE_EVENT_CLASS(xfs_page_class,
 		__entry->pgoff = page_offset(page);
 		__entry->size = i_size_read(inode);
 		__entry->offset = off;
+		__entry->length = len;
 		__entry->delalloc = delalloc;
 		__entry->unwritten = unwritten;
 	),
 	TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
-		  "delalloc %d unwritten %d",
+		  "length %x delalloc %d unwritten %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->pgoff,
 		  __entry->size,
 		  __entry->offset,
+		  __entry->length,
 		  __entry->delalloc,
 		  __entry->unwritten)
 )
 
 #define DEFINE_PAGE_EVENT(name)		\
 DEFINE_EVENT(xfs_page_class, name,	\
-	TP_PROTO(struct inode *inode, struct page *page, unsigned long off),	\
-	TP_ARGS(inode, page, off))
+	TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
+		 unsigned int len),	\
+	TP_ARGS(inode, page, off, len))
 DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 06ed520a767..35a22998135 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -37,14 +37,45 @@
 #include "xfs_extent_busy.h"
 #include "xfs_bmap.h"
 #include "xfs_quota.h"
+#include "xfs_qm.h"
 #include "xfs_trans_priv.h"
 #include "xfs_trans_space.h"
 #include "xfs_inode_item.h"
+#include "xfs_log_priv.h"
+#include "xfs_buf_item.h"
 #include "xfs_trace.h"
 
 kmem_zone_t	*xfs_trans_zone;
 kmem_zone_t	*xfs_log_item_desc_zone;
 
+/*
+ * A buffer has a format structure overhead in the log in addition
+ * to the data, so we need to take this into account when reserving
+ * space in a transaction for a buffer.  Round the space required up
+ * to a multiple of 128 bytes so that we don't change the historical
+ * reservation that has been used for this overhead.
+ */
+STATIC uint
+xfs_buf_log_overhead(void)
+{
+	return round_up(sizeof(struct xlog_op_header) +
+			sizeof(struct xfs_buf_log_format), 128);
+}
+
+/*
+ * Calculate out transaction log reservation per item in bytes.
+ *
+ * The nbufs argument is used to indicate the number of items that
+ * will be changed in a transaction.  size is used to tell how many
+ * bytes should be reserved per item.
+ */
+STATIC uint
+xfs_calc_buf_res(
+	uint		nbufs,
+	uint		size)
+{
+	return nbufs * (size + xfs_buf_log_overhead());
+}
 
 /*
  * Various log reservation values.
@@ -85,18 +116,15 @@ xfs_calc_write_reservation(
 	struct xfs_mount	*mp)
 {
 	return XFS_DQUOT_LOGRES(mp) +
-		MAX((mp->m_sb.sb_inodesize +
-		     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
-		     2 * mp->m_sb.sb_sectsize +
-		     mp->m_sb.sb_sectsize +
-		     XFS_ALLOCFREE_LOG_RES(mp, 2) +
-		     128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
-			    XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
-		    (2 * mp->m_sb.sb_sectsize +
-		     2 * mp->m_sb.sb_sectsize +
-		     mp->m_sb.sb_sectsize +
-		     XFS_ALLOCFREE_LOG_RES(mp, 2) +
-		     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
+		MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+				      XFS_FSB_TO_B(mp, 1)) +
+		     xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))));
 }
 
 /*
@@ -117,18 +145,17 @@ xfs_calc_itruncate_reservation(
 	struct xfs_mount	*mp)
 {
 	return XFS_DQUOT_LOGRES(mp) +
-		MAX((mp->m_sb.sb_inodesize +
-		     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
-		     128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
-		    (4 * mp->m_sb.sb_sectsize +
-		     4 * mp->m_sb.sb_sectsize +
-		     mp->m_sb.sb_sectsize +
-		     XFS_ALLOCFREE_LOG_RES(mp, 4) +
-		     128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
-		     128 * 5 +
-		     XFS_ALLOCFREE_LOG_RES(mp, 1) +
-		     128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
-			    XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
+		MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+				      XFS_FSB_TO_B(mp, 1)) +
+		    xfs_calc_buf_res(5, 0) +
+		    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				     XFS_FSB_TO_B(mp, 1)) +
+		    xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+				     mp->m_in_maxlevels, 0)));
 }
 
 /*
@@ -148,14 +175,12 @@ xfs_calc_rename_reservation(
 	struct xfs_mount	*mp)
 {
 	return XFS_DQUOT_LOGRES(mp) +
-		MAX((4 * mp->m_sb.sb_inodesize +
-		     2 * XFS_DIROP_LOG_RES(mp) +
-		     128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
-		    (3 * mp->m_sb.sb_sectsize +
-		     3 * mp->m_sb.sb_sectsize +
-		     mp->m_sb.sb_sectsize +
-		     XFS_ALLOCFREE_LOG_RES(mp, 3) +
-		     128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
+		MAX((xfs_calc_buf_res(4, mp->m_sb.sb_inodesize) +
+		     xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
+				      XFS_FSB_TO_B(mp, 1))));
 }
 
 /*
@@ -175,15 +200,12 @@ xfs_calc_link_reservation(
 	struct xfs_mount	*mp)
 {
 	return XFS_DQUOT_LOGRES(mp) +
-		MAX((mp->m_sb.sb_inodesize +
-		     mp->m_sb.sb_inodesize +
-		     XFS_DIROP_LOG_RES(mp) +
-		     128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
-		    (mp->m_sb.sb_sectsize +
-		     mp->m_sb.sb_sectsize +
-		     mp->m_sb.sb_sectsize +
-		     XFS_ALLOCFREE_LOG_RES(mp, 1) +
-		     128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
+		MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
+		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				      XFS_FSB_TO_B(mp, 1))));
 }
 
 /*
@@ -203,82 +225,102 @@ xfs_calc_remove_reservation(
 	struct xfs_mount	*mp)
 {
 	return XFS_DQUOT_LOGRES(mp) +
-		MAX((mp->m_sb.sb_inodesize +
-		     mp->m_sb.sb_inodesize +
-		     XFS_DIROP_LOG_RES(mp) +
-		     128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
-		    (2 * mp->m_sb.sb_sectsize +
-		     2 * mp->m_sb.sb_sectsize +
-		     mp->m_sb.sb_sectsize +
-		     XFS_ALLOCFREE_LOG_RES(mp, 2) +
-		     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
+		MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
+		     xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
+				      XFS_FSB_TO_B(mp, 1))),
+		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))));
 }
 
 /*
- * For symlink we can modify:
+ * For create, break it in to the two cases that the transaction
+ * covers. We start with the modify case - allocation done by modification
+ * of the state of existing inodes - and the allocation case.
+ */
+
+/*
+ * For create we can modify:
  *    the parent directory inode: inode size
  *    the new inode: inode size
- *    the inode btree entry: 1 block
+ *    the inode btree entry: block size
+ *    the superblock for the nlink flag: sector size
  *    the directory btree: (max depth + v2) * dir block size
  *    the directory inode's bmap btree: (max depth + v2) * block size
- *    the blocks for the symlink: 1 kB
- * Or in the first xact we allocate some inodes giving:
+ */
+STATIC uint
+xfs_calc_create_resv_modify(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		(uint)XFS_FSB_TO_B(mp, 1) +
+		xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * For create we can allocate some inodes giving:
  *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
  *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
  *    the inode btree: max depth * blocksize
- *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
  */
 STATIC uint
-xfs_calc_symlink_reservation(
+xfs_calc_create_resv_alloc(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		mp->m_sb.sb_sectsize +
+		xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+__xfs_calc_create_reservation(
 	struct xfs_mount	*mp)
 {
 	return XFS_DQUOT_LOGRES(mp) +
-		MAX((mp->m_sb.sb_inodesize +
-		     mp->m_sb.sb_inodesize +
-		     XFS_FSB_TO_B(mp, 1) +
-		     XFS_DIROP_LOG_RES(mp) +
-		     1024 +
-		     128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
-		    (2 * mp->m_sb.sb_sectsize +
-		     XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
-		     XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
-		     XFS_ALLOCFREE_LOG_RES(mp, 1) +
-		     128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
-			    XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
+		MAX(xfs_calc_create_resv_alloc(mp),
+		    xfs_calc_create_resv_modify(mp));
 }
 
 /*
- * For create we can modify:
- *    the parent directory inode: inode size
- *    the new inode: inode size
- *    the inode btree entry: block size
- *    the superblock for the nlink flag: sector size
- *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode's bmap btree: (max depth + v2) * block size
- * Or in the first xact we allocate some inodes giving:
+ * For icreate we can allocate some inodes giving:
  *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
  *    the superblock for the nlink flag: sector size
- *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
  *    the inode btree: max depth * blocksize
  *    the allocation btrees: 2 trees * (max depth - 1) * block size
  */
 STATIC uint
-xfs_calc_create_reservation(
+xfs_calc_icreate_resv_alloc(
 	struct xfs_mount	*mp)
 {
+	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		mp->m_sb.sb_sectsize +
+		xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
+}
+
+STATIC uint
+xfs_calc_icreate_reservation(xfs_mount_t *mp)
+{
 	return XFS_DQUOT_LOGRES(mp) +
-		MAX((mp->m_sb.sb_inodesize +
-		     mp->m_sb.sb_inodesize +
-		     mp->m_sb.sb_sectsize +
-		     XFS_FSB_TO_B(mp, 1) +
-		     XFS_DIROP_LOG_RES(mp) +
-		     128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
-		    (3 * mp->m_sb.sb_sectsize +
-		     XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
-		     XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
-		     XFS_ALLOCFREE_LOG_RES(mp, 1) +
-		     128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
-			    XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
+		MAX(xfs_calc_icreate_resv_alloc(mp),
+		    xfs_calc_create_resv_modify(mp));
+}
+
+STATIC uint
+xfs_calc_create_reservation(
+	struct xfs_mount	*mp)
+{
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		return xfs_calc_icreate_reservation(mp);
+	return __xfs_calc_create_reservation(mp);
+
 }
 
 /*
@@ -291,6 +333,20 @@ xfs_calc_mkdir_reservation(
 	return xfs_calc_create_reservation(mp);
 }
 
+
+/*
+ * Making a new symplink is the same as creating a new file, but
+ * with the added blocks for remote symlink data which can be up to 1kB in
+ * length (MAXPATHLEN).
+ */
+STATIC uint
+xfs_calc_symlink_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_create_reservation(mp) +
+	       xfs_calc_buf_res(1, MAXPATHLEN);
+}
+
 /*
  * In freeing an inode we can modify:
  *    the inode being freed: inode size
@@ -306,16 +362,16 @@ xfs_calc_ifree_reservation(
 	struct xfs_mount	*mp)
 {
 	return XFS_DQUOT_LOGRES(mp) +
-		mp->m_sb.sb_inodesize +
-		mp->m_sb.sb_sectsize +
-		mp->m_sb.sb_sectsize +
-		XFS_FSB_TO_B(mp, 1) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
 		MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
 		    XFS_INODE_CLUSTER_SIZE(mp)) +
-		128 * 5 +
-		XFS_ALLOCFREE_LOG_RES(mp, 1) +
-		128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
-		       XFS_ALLOCFREE_LOG_COUNT(mp, 1));
+		xfs_calc_buf_res(1, 0) +
+		xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+				 mp->m_in_maxlevels, 0) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
 }
 
 /*
@@ -343,9 +399,9 @@ STATIC uint
 xfs_calc_growdata_reservation(
 	struct xfs_mount	*mp)
 {
-	return mp->m_sb.sb_sectsize * 3 +
-		XFS_ALLOCFREE_LOG_RES(mp, 1) +
-		128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
+	return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
 }
 
 /*
@@ -362,12 +418,12 @@ STATIC uint
 xfs_calc_growrtalloc_reservation(
 	struct xfs_mount	*mp)
 {
-	return 2 * mp->m_sb.sb_sectsize +
-		XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
-		mp->m_sb.sb_inodesize +
-		XFS_ALLOCFREE_LOG_RES(mp, 1) +
-		128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
-		       XFS_ALLOCFREE_LOG_COUNT(mp, 1));
+	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
+				 XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
 }
 
 /*
@@ -379,7 +435,7 @@ STATIC uint
 xfs_calc_growrtzero_reservation(
 	struct xfs_mount	*mp)
 {
-	return mp->m_sb.sb_blocksize + 128;
+	return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
 }
 
 /*
@@ -396,11 +452,10 @@ STATIC uint
 xfs_calc_growrtfree_reservation(
 	struct xfs_mount	*mp)
 {
-	return mp->m_sb.sb_sectsize +
-		2 * mp->m_sb.sb_inodesize +
-		mp->m_sb.sb_blocksize +
-		mp->m_rsumsize +
-		128 * 5;
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
+		xfs_calc_buf_res(1, mp->m_rsumsize);
 }
 
 /*
@@ -411,7 +466,7 @@ STATIC uint
 xfs_calc_swrite_reservation(
 	struct xfs_mount	*mp)
 {
-	return mp->m_sb.sb_inodesize + 128;
+	return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize);
 }
 
 /*
@@ -421,7 +476,7 @@ xfs_calc_swrite_reservation(
 STATIC uint
 xfs_calc_writeid_reservation(xfs_mount_t *mp)
 {
-	return mp->m_sb.sb_inodesize + 128;
+	return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize);
 }
 
 /*
@@ -437,13 +492,13 @@ xfs_calc_addafork_reservation(
 	struct xfs_mount	*mp)
 {
 	return XFS_DQUOT_LOGRES(mp) +
-		mp->m_sb.sb_inodesize +
-		mp->m_sb.sb_sectsize * 2 +
-		mp->m_dirblksize +
-		XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
-		XFS_ALLOCFREE_LOG_RES(mp, 1) +
-		128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
-		       XFS_ALLOCFREE_LOG_COUNT(mp, 1));
+		xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(1, mp->m_dirblksize) +
+		xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
+				 XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+				 XFS_FSB_TO_B(mp, 1));
 }
 
 /*
@@ -461,35 +516,51 @@ STATIC uint
 xfs_calc_attrinval_reservation(
 	struct xfs_mount	*mp)
 {
-	return MAX((mp->m_sb.sb_inodesize +
-		    XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
-		    128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
-		   (4 * mp->m_sb.sb_sectsize +
-		    4 * mp->m_sb.sb_sectsize +
-		    mp->m_sb.sb_sectsize +
-		    XFS_ALLOCFREE_LOG_RES(mp, 4) +
-		    128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
+	return MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+		    xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+				     XFS_FSB_TO_B(mp, 1))),
+		   (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+		    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
+				     XFS_FSB_TO_B(mp, 1))));
 }
 
 /*
- * Setting an attribute.
+ * Setting an attribute at mount time.
  *	the inode getting the attribute
  *	the superblock for allocations
  *	the agfs extents are allocated from
  *	the attribute btree * max depth
  *	the inode allocation btree
  * Since attribute transaction space is dependent on the size of the attribute,
- * the calculation is done partially at mount time and partially at runtime.
+ * the calculation is done partially at mount time and partially at runtime(see
+ * below).
  */
 STATIC uint
-xfs_calc_attrset_reservation(
+xfs_calc_attrsetm_reservation(
 	struct xfs_mount	*mp)
 {
 	return XFS_DQUOT_LOGRES(mp) +
-		mp->m_sb.sb_inodesize +
-		mp->m_sb.sb_sectsize +
-		XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
-		128 * (2 + XFS_DA_NODE_MAXDEPTH);
+		xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
+}
+
+/*
+ * Setting an attribute at runtime, transaction space unit per block.
+ * 	the superblock for allocations: sector size
+ *	the inode bmap btree could join or split: max depth * block size
+ * Since the runtime attribute transaction space is dependent on the total
+ * blocks needed for the 1st bmap, here we calculate out the space unit for
+ * one block so that the caller could figure out the total space according
+ * to the attibute extent length in blocks by: ext * XFS_ATTRSETRT_LOG_RES(mp).
+ */
+STATIC uint
+xfs_calc_attrsetrt_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
+				 XFS_FSB_TO_B(mp, 1));
 }
 
 /*
@@ -508,16 +579,15 @@ xfs_calc_attrrm_reservation(
 	struct xfs_mount	*mp)
 {
 	return XFS_DQUOT_LOGRES(mp) +
-		MAX((mp->m_sb.sb_inodesize +
-		     XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
-		     XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
-		     128 * (1 + XFS_DA_NODE_MAXDEPTH +
-			    XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
-		    (2 * mp->m_sb.sb_sectsize +
-		     2 * mp->m_sb.sb_sectsize +
-		     mp->m_sb.sb_sectsize +
-		     XFS_ALLOCFREE_LOG_RES(mp, 2) +
-		     128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
+		MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
+		     xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
+				      XFS_FSB_TO_B(mp, 1)) +
+		     (uint)XFS_FSB_TO_B(mp,
+					XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
+		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
+		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
+				      XFS_FSB_TO_B(mp, 1))));
 }
 
 /*
@@ -527,7 +597,78 @@ STATIC uint
 xfs_calc_clear_agi_bucket_reservation(
 	struct xfs_mount	*mp)
 {
-	return mp->m_sb.sb_sectsize + 128;
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Clearing the quotaflags in the superblock.
+ *	the super block for changing quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_sbchange_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * Adjusting quota limits.
+ *    the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
+ */
+STATIC uint
+xfs_calc_qm_setqlim_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
+}
+
+/*
+ * Allocating quota on disk if needed.
+ *	the write transaction log space: XFS_WRITE_LOG_RES(mp)
+ *	the unit of quota allocation: one system block size
+ */
+STATIC uint
+xfs_calc_qm_dqalloc_reservation(
+	struct xfs_mount	*mp)
+{
+	return XFS_WRITE_LOG_RES(mp) +
+		xfs_calc_buf_res(1,
+			XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
+}
+
+/*
+ * Turning off quotas.
+ *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ *    the superblock for the quota flags: sector size
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_reservation(
+	struct xfs_mount	*mp)
+{
+	return sizeof(struct xfs_qoff_logitem) * 2 +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
+}
+
+/*
+ * End of turning off quotas.
+ *    the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ */
+STATIC uint
+xfs_calc_qm_quotaoff_end_reservation(
+	struct xfs_mount	*mp)
+{
+	return sizeof(struct xfs_qoff_logitem) * 2;
+}
+
+/*
+ * Syncing the incore super block changes to disk.
+ *     the super block to reflect the changes: sector size
+ */
+STATIC uint
+xfs_calc_sb_reservation(
+	struct xfs_mount	*mp)
+{
+	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
 }
 
 /*
@@ -555,12 +696,19 @@ xfs_trans_init(
 	resp->tr_writeid = xfs_calc_writeid_reservation(mp);
 	resp->tr_addafork = xfs_calc_addafork_reservation(mp);
 	resp->tr_attrinval = xfs_calc_attrinval_reservation(mp);
-	resp->tr_attrset = xfs_calc_attrset_reservation(mp);
+	resp->tr_attrsetm = xfs_calc_attrsetm_reservation(mp);
+	resp->tr_attrsetrt = xfs_calc_attrsetrt_reservation(mp);
 	resp->tr_attrrm = xfs_calc_attrrm_reservation(mp);
 	resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp);
 	resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp);
 	resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp);
 	resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp);
+	resp->tr_qm_sbchange = xfs_calc_qm_sbchange_reservation(mp);
+	resp->tr_qm_setqlim = xfs_calc_qm_setqlim_reservation(mp);
+	resp->tr_qm_dqalloc = xfs_calc_qm_dqalloc_reservation(mp);
+	resp->tr_qm_quotaoff = xfs_calc_qm_quotaoff_reservation(mp);
+	resp->tr_qm_equotaoff = xfs_calc_qm_quotaoff_end_reservation(mp);
+	resp->tr_sb = xfs_calc_sb_reservation(mp);
 }
 
 /*
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c6c0601abd7..2b4946393e3 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -48,6 +48,7 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_BUF		0x123c	/* v2 bufs, variable sized inode bufs */
 #define	XFS_LI_DQUOT		0x123d
 #define	XFS_LI_QUOTAOFF		0x123e
+#define	XFS_LI_ICREATE		0x123f
 
 #define XFS_LI_TYPE_DESC \
 	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -107,7 +108,8 @@ typedef struct xfs_trans_header {
 #define	XFS_TRANS_SWAPEXT		40
 #define	XFS_TRANS_SB_COUNT		41
 #define	XFS_TRANS_CHECKPOINT		42
-#define	XFS_TRANS_TYPE_MAX		42
+#define	XFS_TRANS_ICREATE		43
+#define	XFS_TRANS_TYPE_MAX		43
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
 #define XFS_TRANS_TYPES \
@@ -210,23 +212,18 @@ struct xfs_log_item_desc {
 /*
  * Per-extent log reservation for the allocation btree changes
  * involved in freeing or allocating an extent.
- * 2 trees * (2 blocks/level * max depth - 1) * block size
+ * 2 trees * (2 blocks/level * max depth - 1)
  */
-#define	XFS_ALLOCFREE_LOG_RES(mp,nx) \
-	((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
 #define	XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
 	((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
 
 /*
  * Per-directory log reservation for any directory change.
- * dir blocks: (1 btree block per level + data block + free block) * dblock size
- * bmap btree: (levels + 2) * max depth * block size
+ * dir blocks: (1 btree block per level + data block + free block)
+ * bmap btree: (levels + 2) * max depth
  * v2 directory blocks can be fragmented below the dirblksize down to the fsb
  * size, so account for that in the DAENTER macros.
  */
-#define	XFS_DIROP_LOG_RES(mp)	\
-	(XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
-	 (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
 #define	XFS_DIROP_LOG_COUNT(mp)	\
 	(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
 	 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
@@ -252,17 +249,19 @@ struct xfs_log_item_desc {
  * as long as SWRITE logs the entire inode core
  */
 #define XFS_FSYNC_TS_LOG_RES(mp)        ((mp)->m_reservations.tr_swrite)
-#define	XFS_WRITEID_LOG_RES(mp)	((mp)->m_reservations.tr_swrite)
+#define	XFS_WRITEID_LOG_RES(mp)		((mp)->m_reservations.tr_swrite)
 #define	XFS_ADDAFORK_LOG_RES(mp)	((mp)->m_reservations.tr_addafork)
 #define	XFS_ATTRINVAL_LOG_RES(mp)	((mp)->m_reservations.tr_attrinval)
-#define	XFS_ATTRSET_LOG_RES(mp, ext)	\
-	((mp)->m_reservations.tr_attrset + \
-	 (ext * (mp)->m_sb.sb_sectsize) + \
-	 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
-	 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
-#define	XFS_ATTRRM_LOG_RES(mp)	((mp)->m_reservations.tr_attrrm)
+#define	XFS_ATTRSETM_LOG_RES(mp)	((mp)->m_reservations.tr_attrsetm)
+#define XFS_ATTRSETRT_LOG_RES(mp)	((mp)->m_reservations.tr_attrsetrt)
+#define	XFS_ATTRRM_LOG_RES(mp)		((mp)->m_reservations.tr_attrrm)
 #define	XFS_CLEAR_AGI_BUCKET_LOG_RES(mp)  ((mp)->m_reservations.tr_clearagi)
-
+#define XFS_QM_SBCHANGE_LOG_RES(mp)	((mp)->m_reservations.tr_qm_sbchange)
+#define XFS_QM_SETQLIM_LOG_RES(mp)	((mp)->m_reservations.tr_qm_setqlim)
+#define XFS_QM_DQALLOC_LOG_RES(mp)	((mp)->m_reservations.tr_qm_dqalloc)
+#define XFS_QM_QUOTAOFF_LOG_RES(mp)	((mp)->m_reservations.tr_qm_quotaoff)
+#define XFS_QM_QUOTAOFF_END_LOG_RES(mp)	((mp)->m_reservations.tr_qm_equotaoff)
+#define XFS_SB_LOG_RES(mp)		((mp)->m_reservations.tr_sb)
 
 /*
  * Various log count values.
@@ -403,7 +402,7 @@ typedef struct xfs_trans {
 	int64_t			t_res_fdblocks_delta; /* on-disk only chg */
 	int64_t			t_frextents_delta;/* superblock freextents chg*/
 	int64_t			t_res_frextents_delta; /* on-disk only chg */
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 	int64_t			t_ag_freeblks_delta; /* debugging counter */
 	int64_t			t_ag_flist_delta; /* debugging counter */
 	int64_t			t_ag_btree_delta; /* debugging counter */
@@ -431,7 +430,7 @@ typedef struct xfs_trans {
 #define	xfs_trans_get_block_res(tp)	((tp)->t_blk_res)
 #define	xfs_trans_set_sync(tp)		((tp)->t_flags |= XFS_TRANS_SYNC)
 
-#ifdef DEBUG
+#if defined(DEBUG) || defined(XFS_WARN)
 #define	xfs_trans_agblocks_delta(tp, d)	((tp)->t_ag_freeblks_delta += (int64_t)d)
 #define	xfs_trans_agflist_delta(tp, d)	((tp)->t_ag_flist_delta += (int64_t)d)
 #define	xfs_trans_agbtree_delta(tp, d)	((tp)->t_ag_btree_delta += (int64_t)d)
@@ -501,6 +500,7 @@ void		xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
 void		xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 6011ee66133..0eda7254305 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -55,20 +55,6 @@ xfs_ail_check(
 		ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
 
 
-#ifdef XFS_TRANS_DEBUG
-	/*
-	 * Walk the list checking lsn ordering, and that every entry has the
-	 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
-	 * when specifically debugging the transaction subsystem.
-	 */
-	prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
-	list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
-		if (&prev_lip->li_ail != &ailp->xa_ail)
-			ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
-		ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
-		prev_lip = lip;
-	}
-#endif /* XFS_TRANS_DEBUG */
 }
 #else /* !DEBUG */
 #define	xfs_ail_check(a,l)
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 3edf5dbee00..aa5a04b844d 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -397,7 +397,6 @@ shutdown_abort:
 	return XFS_ERROR(EIO);
 }
 
-
 /*
  * Release the buffer bp which was previously acquired with one of the
  * xfs_trans_... buffer allocation routines if the buffer has not
@@ -603,8 +602,14 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
-	bip->bli_flags |= XFS_BLI_LOGGED;
-	xfs_buf_item_log(bip, first, last);
+
+	/*
+	 * If we have an ordered buffer we are not logging any dirty range but
+	 * it still needs to be marked dirty and that it has been logged.
+	 */
+	bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
+	if (!(bip->bli_flags & XFS_BLI_ORDERED))
+		xfs_buf_item_log(bip, first, last);
 }
 
 
@@ -659,6 +664,7 @@ xfs_trans_binval(
 		ASSERT(XFS_BUF_ISSTALE(bp));
 		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
 		ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
+		ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK));
 		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
 		ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
 		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
@@ -671,6 +677,7 @@ xfs_trans_binval(
 	bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
 	bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
 	bip->__bli_format.blf_flags |= XFS_BLF_CANCEL;
+	bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK;
 	for (i = 0; i < bip->bli_format_count; i++) {
 		memset(bip->bli_formats[i].blf_data_map, 0,
 		       (bip->bli_formats[i].blf_map_size * sizeof(uint)));
@@ -702,12 +709,13 @@ xfs_trans_inode_buf(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	bip->bli_flags |= XFS_BLI_INODE_BUF;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
 }
 
 /*
  * This call is used to indicate that the buffer is going to
  * be staled and was an inode buffer. This means it gets
- * special processing during unpin - where any inodes 
+ * special processing during unpin - where any inodes
  * associated with the buffer should be removed from ail.
  * There is also special processing during recovery,
  * any replay of the inodes in the buffer needs to be
@@ -726,6 +734,7 @@ xfs_trans_stale_inode_buf(
 
 	bip->bli_flags |= XFS_BLI_STALE_INODE;
 	bip->bli_item.li_cb = xfs_buf_iodone;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
 }
 
 /*
@@ -749,8 +758,66 @@ xfs_trans_inode_alloc_buf(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+}
+
+/*
+ * Mark the buffer as ordered for this transaction. This means
+ * that the contents of the buffer are not recorded in the transaction
+ * but it is tracked in the AIL as though it was. This allows us
+ * to record logical changes in transactions rather than the physical
+ * changes we make to the buffer without changing writeback ordering
+ * constraints of metadata buffers.
+ */
+void
+xfs_trans_ordered_buf(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip != NULL);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	bip->bli_flags |= XFS_BLI_ORDERED;
+	trace_xfs_buf_item_ordered(bip);
+}
+
+/*
+ * Set the type of the buffer for log recovery so that it can correctly identify
+ * and hence attach the correct buffer ops to the buffer after replay.
+ */
+void
+xfs_trans_buf_set_type(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	enum xfs_blft		type)
+{
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!tp)
+		return;
+
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip != NULL);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	xfs_blft_to_flags(&bip->__bli_format, type);
 }
 
+void
+xfs_trans_buf_copy_type(
+	struct xfs_buf		*dst_bp,
+	struct xfs_buf		*src_bp)
+{
+	struct xfs_buf_log_item	*sbip = src_bp->b_fspriv;
+	struct xfs_buf_log_item	*dbip = dst_bp->b_fspriv;
+	enum xfs_blft		type;
+
+	type = xfs_blft_from_flags(&sbip->__bli_format);
+	xfs_blft_to_flags(&dbip->__bli_format, type);
+}
 
 /*
  * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of
@@ -769,14 +836,28 @@ xfs_trans_dquot_buf(
 	xfs_buf_t	*bp,
 	uint		type)
 {
-	xfs_buf_log_item_t	*bip = bp->b_fspriv;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
 
-	ASSERT(bp->b_transp == tp);
-	ASSERT(bip != NULL);
 	ASSERT(type == XFS_BLF_UDQUOT_BUF ||
 	       type == XFS_BLF_PDQUOT_BUF ||
 	       type == XFS_BLF_GDQUOT_BUF);
-	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	bip->__bli_format.blf_flags |= type;
+
+	switch (type) {
+	case XFS_BLF_UDQUOT_BUF:
+		type = XFS_BLFT_UDQUOT_BUF;
+		break;
+	case XFS_BLF_PDQUOT_BUF:
+		type = XFS_BLFT_PDQUOT_BUF;
+		break;
+	case XFS_BLF_GDQUOT_BUF:
+		type = XFS_BLFT_GDQUOT_BUF;
+		break;
+	default:
+		type = XFS_BLFT_UNKNOWN_BUF;
+		break;
+	}
+
+	xfs_trans_buf_set_type(tp, bp, type);
 }
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 0c7fa54f309..3ba64d54016 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -103,8 +103,6 @@ xfs_trans_dup_dqinfo(
 		return;
 
 	xfs_trans_alloc_dqinfo(ntp);
-	oqa = otp->t_dqinfo->dqa_usrdquots;
-	nqa = ntp->t_dqinfo->dqa_usrdquots;
 
 	/*
 	 * Because the quota blk reservation is carried forward,
@@ -113,7 +111,9 @@ xfs_trans_dup_dqinfo(
 	if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
 		ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
 
-	for (j = 0; j < 2; j++) {
+	for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+		oqa = otp->t_dqinfo->dqs[j];
+		nqa = ntp->t_dqinfo->dqs[j];
 		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
 			if (oqa[i].qt_dquot == NULL)
 				break;
@@ -138,8 +138,6 @@ xfs_trans_dup_dqinfo(
 			oq->qt_ino_res = oq->qt_ino_res_used;
 
 		}
-		oqa = otp->t_dqinfo->dqa_grpdquots;
-		nqa = ntp->t_dqinfo->dqa_grpdquots;
 	}
 }
 
@@ -157,8 +155,7 @@ xfs_trans_mod_dquot_byino(
 
 	if (!XFS_IS_QUOTA_RUNNING(mp) ||
 	    !XFS_IS_QUOTA_ON(mp) ||
-	    ip->i_ino == mp->m_sb.sb_uquotino ||
-	    ip->i_ino == mp->m_sb.sb_gquotino)
+	    xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
 		return;
 
 	if (tp->t_dqinfo == NULL)
@@ -170,16 +167,18 @@ xfs_trans_mod_dquot_byino(
 		(void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
 }
 
-STATIC xfs_dqtrx_t *
+STATIC struct xfs_dqtrx *
 xfs_trans_get_dqtrx(
-	xfs_trans_t	*tp,
-	xfs_dquot_t	*dqp)
+	struct xfs_trans	*tp,
+	struct xfs_dquot	*dqp)
 {
-	int		i;
-	xfs_dqtrx_t	*qa;
+	int			i;
+	struct xfs_dqtrx	*qa;
 
-	qa = XFS_QM_ISUDQ(dqp) ?
-		tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
+	if (XFS_QM_ISUDQ(dqp))
+		qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
+	else
+		qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
 
 	for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
 		if (qa[i].qt_dquot == NULL ||
@@ -326,12 +325,12 @@ xfs_trans_dqlockedjoin(
  */
 void
 xfs_trans_apply_dquot_deltas(
-	xfs_trans_t		*tp)
+	struct xfs_trans	*tp)
 {
 	int			i, j;
-	xfs_dquot_t		*dqp;
-	xfs_dqtrx_t		*qtrx, *qa;
-	xfs_disk_dquot_t	*d;
+	struct xfs_dquot	*dqp;
+	struct xfs_dqtrx	*qtrx, *qa;
+	struct xfs_disk_dquot	*d;
 	long			totalbdelta;
 	long			totalrtbdelta;
 
@@ -339,12 +338,10 @@ xfs_trans_apply_dquot_deltas(
 		return;
 
 	ASSERT(tp->t_dqinfo);
-	qa = tp->t_dqinfo->dqa_usrdquots;
-	for (j = 0; j < 2; j++) {
-		if (qa[0].qt_dquot == NULL) {
-			qa = tp->t_dqinfo->dqa_grpdquots;
+	for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+		qa = tp->t_dqinfo->dqs[j];
+		if (qa[0].qt_dquot == NULL)
 			continue;
-		}
 
 		/*
 		 * Lock all of the dquots and join them to the transaction.
@@ -412,7 +409,7 @@ xfs_trans_apply_dquot_deltas(
 			 * Start/reset the timer(s) if needed.
 			 */
 			if (d->d_id) {
-				xfs_qm_adjust_dqlimits(tp->t_mountp, d);
+				xfs_qm_adjust_dqlimits(tp->t_mountp, dqp);
 				xfs_qm_adjust_dqtimers(tp->t_mountp, d);
 			}
 
@@ -495,10 +492,6 @@ xfs_trans_apply_dquot_deltas(
 			ASSERT(dqp->q_res_rtbcount >=
 				be64_to_cpu(dqp->q_core.d_rtbcount));
 		}
-		/*
-		 * Do the group quotas next
-		 */
-		qa = tp->t_dqinfo->dqa_grpdquots;
 	}
 }
 
@@ -516,14 +509,14 @@ xfs_trans_unreserve_and_mod_dquots(
 	int			i, j;
 	xfs_dquot_t		*dqp;
 	xfs_dqtrx_t		*qtrx, *qa;
-	boolean_t		locked;
+	bool                    locked;
 
 	if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
 		return;
 
-	qa = tp->t_dqinfo->dqa_usrdquots;
+	for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+		qa = tp->t_dqinfo->dqs[j];
 
-	for (j = 0; j < 2; j++) {
 		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
 			qtrx = &qa[i];
 			/*
@@ -537,17 +530,17 @@ xfs_trans_unreserve_and_mod_dquots(
 			 * about the number of blocks used field, or deltas.
 			 * Also we don't bother to zero the fields.
 			 */
-			locked = B_FALSE;
+			locked = false;
 			if (qtrx->qt_blk_res) {
 				xfs_dqlock(dqp);
-				locked = B_TRUE;
+				locked = true;
 				dqp->q_res_bcount -=
 					(xfs_qcnt_t)qtrx->qt_blk_res;
 			}
 			if (qtrx->qt_ino_res) {
 				if (!locked) {
 					xfs_dqlock(dqp);
-					locked = B_TRUE;
+					locked = true;
 				}
 				dqp->q_res_icount -=
 					(xfs_qcnt_t)qtrx->qt_ino_res;
@@ -556,7 +549,7 @@ xfs_trans_unreserve_and_mod_dquots(
 			if (qtrx->qt_rtblk_res) {
 				if (!locked) {
 					xfs_dqlock(dqp);
-					locked = B_TRUE;
+					locked = true;
 				}
 				dqp->q_res_rtbcount -=
 					(xfs_qcnt_t)qtrx->qt_rtblk_res;
@@ -565,7 +558,6 @@ xfs_trans_unreserve_and_mod_dquots(
 				xfs_dqunlock(dqp);
 
 		}
-		qa = tp->t_dqinfo->dqa_grpdquots;
 	}
 }
 
@@ -640,8 +632,8 @@ xfs_trans_dqresv(
 	if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
 	    dqp->q_core.d_id &&
 	    ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
-	     (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
-	      (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
+	     (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
+	     (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
 		if (nblks > 0) {
 			/*
 			 * dquot is locked already. See if we'd go over the
@@ -748,15 +740,15 @@ error_return:
  */
 int
 xfs_trans_reserve_quota_bydquots(
-	xfs_trans_t	*tp,
-	xfs_mount_t	*mp,
-	xfs_dquot_t	*udqp,
-	xfs_dquot_t	*gdqp,
-	long		nblks,
-	long		ninos,
-	uint		flags)
+	struct xfs_trans	*tp,
+	struct xfs_mount	*mp,
+	struct xfs_dquot	*udqp,
+	struct xfs_dquot	*gdqp,
+	long			nblks,
+	long			ninos,
+	uint			flags)
 {
-	int		resvd = 0, error;
+	int		error;
 
 	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return 0;
@@ -771,28 +763,24 @@ xfs_trans_reserve_quota_bydquots(
 					(flags & ~XFS_QMOPT_ENOSPC));
 		if (error)
 			return error;
-		resvd = 1;
 	}
 
 	if (gdqp) {
 		error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
-		if (error) {
-			/*
-			 * can't do it, so backout previous reservation
-			 */
-			if (resvd) {
-				flags |= XFS_QMOPT_FORCE_RES;
-				xfs_trans_dqresv(tp, mp, udqp,
-						 -nblks, -ninos, flags);
-			}
-			return error;
-		}
+		if (error)
+			goto unwind_usr;
 	}
 
 	/*
 	 * Didn't change anything critical, so, no need to log
 	 */
 	return 0;
+
+unwind_usr:
+	flags |= XFS_QMOPT_FORCE_RES;
+	if (udqp)
+		xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags);
+	return error;
 }
 
 
@@ -816,8 +804,7 @@ xfs_trans_reserve_quota_nblks(
 	if (XFS_IS_PQUOTA_ON(mp))
 		flags |= XFS_QMOPT_ENOSPC;
 
-	ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
-	ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
+	ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index d2eee20d5f5..53dfe46f368 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -33,14 +33,6 @@
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
 
-#ifdef XFS_TRANS_DEBUG
-STATIC void
-xfs_trans_inode_broot_debug(
-	xfs_inode_t	*ip);
-#else
-#define	xfs_trans_inode_broot_debug(ip)
-#endif
-
 /*
  * Add a locked inode to the transaction.
  *
@@ -67,8 +59,6 @@ xfs_trans_ijoin(
 	 * Get a log_item_desc to point at the new item.
 	 */
 	xfs_trans_add_item(tp, &iip->ili_item);
-
-	xfs_trans_inode_broot_debug(ip);
 }
 
 /*
@@ -122,6 +112,17 @@ xfs_trans_log_inode(
 	ASSERT(ip->i_itemp != NULL);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
+	/*
+	 * First time we log the inode in a transaction, bump the inode change
+	 * counter if it is configured for this to occur.
+	 */
+	if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
+	    IS_I_VERSION(VFS_I(ip))) {
+		inode_inc_iversion(VFS_I(ip));
+		ip->i_d.di_changecount = VFS_I(ip)->i_version;
+		flags |= XFS_ILOG_CORE;
+	}
+
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 
@@ -135,34 +136,3 @@ xfs_trans_log_inode(
 	flags |= ip->i_itemp->ili_last_fields;
 	ip->i_itemp->ili_fields |= flags;
 }
-
-#ifdef XFS_TRANS_DEBUG
-/*
- * Keep track of the state of the inode btree root to make sure we
- * log it properly.
- */
-STATIC void
-xfs_trans_inode_broot_debug(
-	xfs_inode_t	*ip)
-{
-	xfs_inode_log_item_t	*iip;
-
-	ASSERT(ip->i_itemp != NULL);
-	iip = ip->i_itemp;
-	if (iip->ili_root_size != 0) {
-		ASSERT(iip->ili_orig_root != NULL);
-		kmem_free(iip->ili_orig_root);
-		iip->ili_root_size = 0;
-		iip->ili_orig_root = NULL;
-	}
-	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
-		ASSERT((ip->i_df.if_broot != NULL) &&
-		       (ip->i_df.if_broot_bytes > 0));
-		iip->ili_root_size = ip->i_df.if_broot_bytes;
-		iip->ili_orig_root =
-			(char*)kmem_alloc(iip->ili_root_size, KM_SLEEP);
-		memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot),
-		      iip->ili_root_size);
-	}
-}
-#endif
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 7a41874f4c2..61ba1cfa974 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -32,7 +32,6 @@ typedef unsigned int		__uint32_t;
 typedef signed long long int	__int64_t;
 typedef unsigned long long int	__uint64_t;
 
-typedef enum { B_FALSE,B_TRUE }	boolean_t;
 typedef __uint32_t		prid_t;		/* project ID */
 typedef __uint32_t		inst_t;		/* an instruction */
 
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index d95f565a390..42c0ef288ae 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012 Red Hat, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -48,103 +49,8 @@
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_symlink.h"
 
-/*
- * The maximum pathlen is 1024 bytes. Since the minimum file system
- * blocksize is 512 bytes, we can get a max of 2 extents back from
- * bmapi.
- */
-#define SYMLINK_MAPS 2
-
-STATIC int
-xfs_readlink_bmap(
-	xfs_inode_t	*ip,
-	char		*link)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	int		pathlen = ip->i_d.di_size;
-	int             nmaps = SYMLINK_MAPS;
-	xfs_bmbt_irec_t mval[SYMLINK_MAPS];
-	xfs_daddr_t	d;
-	int		byte_cnt;
-	int		n;
-	xfs_buf_t	*bp;
-	int		error = 0;
-
-	error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, pathlen), mval, &nmaps,
-			       0);
-	if (error)
-		goto out;
-
-	for (n = 0; n < nmaps; n++) {
-		d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
-		byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
-
-		bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL);
-		if (!bp)
-			return XFS_ERROR(ENOMEM);
-		error = bp->b_error;
-		if (error) {
-			xfs_buf_ioerror_alert(bp, __func__);
-			xfs_buf_relse(bp);
-			goto out;
-		}
-		if (pathlen < byte_cnt)
-			byte_cnt = pathlen;
-		pathlen -= byte_cnt;
-
-		memcpy(link, bp->b_addr, byte_cnt);
-		xfs_buf_relse(bp);
-	}
-
-	link[ip->i_d.di_size] = '\0';
-	error = 0;
-
- out:
-	return error;
-}
-
-int
-xfs_readlink(
-	xfs_inode_t     *ip,
-	char		*link)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_fsize_t	pathlen;
-	int		error = 0;
-
-	trace_xfs_readlink(ip);
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	xfs_ilock(ip, XFS_ILOCK_SHARED);
-
-	pathlen = ip->i_d.di_size;
-	if (!pathlen)
-		goto out;
-
-	if (pathlen < 0 || pathlen > MAXPATHLEN) {
-		xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
-			 __func__, (unsigned long long) ip->i_ino,
-			 (long long) pathlen);
-		ASSERT(0);
-		error = XFS_ERROR(EFSCORRUPTED);
-		goto out;
-	}
-
-
-	if (ip->i_df.if_flags & XFS_IFINLINE) {
-		memcpy(link, ip->i_df.if_u1.if_data, pathlen);
-		link[pathlen] = '\0';
-	} else {
-		error = xfs_readlink_bmap(ip, link);
-	}
-
- out:
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-	return error;
-}
 
 /*
  * This is called by xfs_inactive to free any blocks beyond eof
@@ -249,145 +155,6 @@ xfs_free_eofblocks(
 	return error;
 }
 
-/*
- * Free a symlink that has blocks associated with it.
- */
-STATIC int
-xfs_inactive_symlink_rmt(
-	xfs_inode_t	*ip,
-	xfs_trans_t	**tpp)
-{
-	xfs_buf_t	*bp;
-	int		committed;
-	int		done;
-	int		error;
-	xfs_fsblock_t	first_block;
-	xfs_bmap_free_t	free_list;
-	int		i;
-	xfs_mount_t	*mp;
-	xfs_bmbt_irec_t	mval[SYMLINK_MAPS];
-	int		nmaps;
-	xfs_trans_t	*ntp;
-	int		size;
-	xfs_trans_t	*tp;
-
-	tp = *tpp;
-	mp = ip->i_mount;
-	ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
-	/*
-	 * We're freeing a symlink that has some
-	 * blocks allocated to it.  Free the
-	 * blocks here.  We know that we've got
-	 * either 1 or 2 extents and that we can
-	 * free them all in one bunmapi call.
-	 */
-	ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
-
-	/*
-	 * Lock the inode, fix the size, and join it to the transaction.
-	 * Hold it so in the normal path, we still have it locked for
-	 * the second transaction.  In the error paths we need it
-	 * held so the cancel won't rele it, see below.
-	 */
-	size = (int)ip->i_d.di_size;
-	ip->i_d.di_size = 0;
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	/*
-	 * Find the block(s) so we can inval and unmap them.
-	 */
-	done = 0;
-	xfs_bmap_init(&free_list, &first_block);
-	nmaps = ARRAY_SIZE(mval);
-	error = xfs_bmapi_read(ip, 0, XFS_B_TO_FSB(mp, size),
-				mval, &nmaps, 0);
-	if (error)
-		goto error0;
-	/*
-	 * Invalidate the block(s).
-	 */
-	for (i = 0; i < nmaps; i++) {
-		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
-			XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
-			XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
-		if (!bp) {
-			error = ENOMEM;
-			goto error1;
-		}
-		xfs_trans_binval(tp, bp);
-	}
-	/*
-	 * Unmap the dead block(s) to the free_list.
-	 */
-	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
-			&first_block, &free_list, &done)))
-		goto error1;
-	ASSERT(done);
-	/*
-	 * Commit the first transaction.  This logs the EFI and the inode.
-	 */
-	if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
-		goto error1;
-	/*
-	 * The transaction must have been committed, since there were
-	 * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
-	 * The new tp has the extent freeing and EFDs.
-	 */
-	ASSERT(committed);
-	/*
-	 * The first xact was committed, so add the inode to the new one.
-	 * Mark it dirty so it will be logged and moved forward in the log as
-	 * part of every commit.
-	 */
-	xfs_trans_ijoin(tp, ip, 0);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	/*
-	 * Get a new, empty transaction to return to our caller.
-	 */
-	ntp = xfs_trans_dup(tp);
-	/*
-	 * Commit the transaction containing extent freeing and EFDs.
-	 * If we get an error on the commit here or on the reserve below,
-	 * we need to unlock the inode since the new transaction doesn't
-	 * have the inode attached.
-	 */
-	error = xfs_trans_commit(tp, 0);
-	tp = ntp;
-	if (error) {
-		ASSERT(XFS_FORCED_SHUTDOWN(mp));
-		goto error0;
-	}
-	/*
-	 * transaction commit worked ok so we can drop the extra ticket
-	 * reference that we gained in xfs_trans_dup()
-	 */
-	xfs_log_ticket_put(tp->t_ticket);
-
-	/*
-	 * Remove the memory for extent descriptions (just bookkeeping).
-	 */
-	if (ip->i_df.if_bytes)
-		xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
-	ASSERT(ip->i_df.if_bytes == 0);
-	/*
-	 * Put an itruncate log reservation in the new transaction
-	 * for our caller.
-	 */
-	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
-		ASSERT(XFS_FORCED_SHUTDOWN(mp));
-		goto error0;
-	}
-
-	xfs_trans_ijoin(tp, ip, 0);
-	*tpp = tp;
-	return 0;
-
- error1:
-	xfs_bmap_cancel(&free_list);
- error0:
-	return error;
-}
-
 int
 xfs_release(
 	xfs_inode_t	*ip)
@@ -555,18 +322,9 @@ xfs_inactive(
 	xfs_trans_ijoin(tp, ip, 0);
 
 	if (S_ISLNK(ip->i_d.di_mode)) {
-		/*
-		 * Zero length symlinks _can_ exist.
-		 */
-		if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) {
-			error = xfs_inactive_symlink_rmt(ip, &tp);
-			if (error)
-				goto out_cancel;
-		} else if (ip->i_df.if_bytes > 0) {
-			xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
-					  XFS_DATA_FORK);
-			ASSERT(ip->i_df.if_bytes == 0);
-		}
+		error = xfs_inactive_symlink(ip, &tp);
+		if (error)
+			goto out_cancel;
 	} else if (truncate) {
 		ip->i_d.di_size = 0;
 		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -725,7 +483,7 @@ xfs_create(
 	int			error;
 	xfs_bmap_free_t		free_list;
 	xfs_fsblock_t		first_block;
-	boolean_t		unlock_dp_on_error = B_FALSE;
+	bool                    unlock_dp_on_error = false;
 	uint			cancel_flags;
 	int			committed;
 	prid_t			prid;
@@ -794,7 +552,7 @@ xfs_create(
 	}
 
 	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
-	unlock_dp_on_error = B_TRUE;
+	unlock_dp_on_error = true;
 
 	xfs_bmap_init(&free_list, &first_block);
 
@@ -830,7 +588,7 @@ xfs_create(
 	 * error path.
 	 */
 	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-	unlock_dp_on_error = B_FALSE;
+	unlock_dp_on_error = false;
 
 	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
 					&first_block, &free_list, resblks ?
@@ -1353,247 +1111,6 @@ xfs_link(
 }
 
 int
-xfs_symlink(
-	xfs_inode_t		*dp,
-	struct xfs_name		*link_name,
-	const char		*target_path,
-	umode_t			mode,
-	xfs_inode_t		**ipp)
-{
-	xfs_mount_t		*mp = dp->i_mount;
-	xfs_trans_t		*tp;
-	xfs_inode_t		*ip;
-	int			error;
-	int			pathlen;
-	xfs_bmap_free_t		free_list;
-	xfs_fsblock_t		first_block;
-	boolean_t		unlock_dp_on_error = B_FALSE;
-	uint			cancel_flags;
-	int			committed;
-	xfs_fileoff_t		first_fsb;
-	xfs_filblks_t		fs_blocks;
-	int			nmaps;
-	xfs_bmbt_irec_t		mval[SYMLINK_MAPS];
-	xfs_daddr_t		d;
-	const char		*cur_chunk;
-	int			byte_cnt;
-	int			n;
-	xfs_buf_t		*bp;
-	prid_t			prid;
-	struct xfs_dquot	*udqp, *gdqp;
-	uint			resblks;
-
-	*ipp = NULL;
-	error = 0;
-	ip = NULL;
-	tp = NULL;
-
-	trace_xfs_symlink(dp, link_name);
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	/*
-	 * Check component lengths of the target path name.
-	 */
-	pathlen = strlen(target_path);
-	if (pathlen >= MAXPATHLEN)      /* total string too long */
-		return XFS_ERROR(ENAMETOOLONG);
-
-	udqp = gdqp = NULL;
-	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-		prid = xfs_get_projid(dp);
-	else
-		prid = XFS_PROJID_DEFAULT;
-
-	/*
-	 * Make sure that we have allocated dquot(s) on disk.
-	 */
-	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
-	if (error)
-		goto std_return;
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	/*
-	 * The symlink will fit into the inode data fork?
-	 * There can't be any attributes so we get the whole variable part.
-	 */
-	if (pathlen <= XFS_LITINO(mp))
-		fs_blocks = 0;
-	else
-		fs_blocks = XFS_B_TO_FSB(mp, pathlen);
-	resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
-	error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
-	if (error == ENOSPC && fs_blocks == 0) {
-		resblks = 0;
-		error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
-				XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
-	}
-	if (error) {
-		cancel_flags = 0;
-		goto error_return;
-	}
-
-	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
-	unlock_dp_on_error = B_TRUE;
-
-	/*
-	 * Check whether the directory allows new symlinks or not.
-	 */
-	if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
-		error = XFS_ERROR(EPERM);
-		goto error_return;
-	}
-
-	/*
-	 * Reserve disk quota : blocks and inode.
-	 */
-	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
-	if (error)
-		goto error_return;
-
-	/*
-	 * Check for ability to enter directory entry, if no space reserved.
-	 */
-	error = xfs_dir_canenter(tp, dp, link_name, resblks);
-	if (error)
-		goto error_return;
-	/*
-	 * Initialize the bmap freelist prior to calling either
-	 * bmapi or the directory create code.
-	 */
-	xfs_bmap_init(&free_list, &first_block);
-
-	/*
-	 * Allocate an inode for the symlink.
-	 */
-	error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
-			       prid, resblks > 0, &ip, NULL);
-	if (error) {
-		if (error == ENOSPC)
-			goto error_return;
-		goto error1;
-	}
-
-	/*
-	 * An error after we've joined dp to the transaction will result in the
-	 * transaction cancel unlocking dp so don't do it explicitly in the
-	 * error path.
-	 */
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-	unlock_dp_on_error = B_FALSE;
-
-	/*
-	 * Also attach the dquot(s) to it, if applicable.
-	 */
-	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
-
-	if (resblks)
-		resblks -= XFS_IALLOC_SPACE_RES(mp);
-	/*
-	 * If the symlink will fit into the inode, write it inline.
-	 */
-	if (pathlen <= XFS_IFORK_DSIZE(ip)) {
-		xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
-		memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
-		ip->i_d.di_size = pathlen;
-
-		/*
-		 * The inode was initially created in extent format.
-		 */
-		ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
-		ip->i_df.if_flags |= XFS_IFINLINE;
-
-		ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
-		xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
-
-	} else {
-		first_fsb = 0;
-		nmaps = SYMLINK_MAPS;
-
-		error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
-				  XFS_BMAPI_METADATA, &first_block, resblks,
-				  mval, &nmaps, &free_list);
-		if (error)
-			goto error2;
-
-		if (resblks)
-			resblks -= fs_blocks;
-		ip->i_d.di_size = pathlen;
-		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-		cur_chunk = target_path;
-		for (n = 0; n < nmaps; n++) {
-			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
-			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
-			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
-					       BTOBB(byte_cnt), 0);
-			if (!bp) {
-				error = ENOMEM;
-				goto error2;
-			}
-			if (pathlen < byte_cnt) {
-				byte_cnt = pathlen;
-			}
-			pathlen -= byte_cnt;
-
-			memcpy(bp->b_addr, cur_chunk, byte_cnt);
-			cur_chunk += byte_cnt;
-
-			xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
-		}
-	}
-
-	/*
-	 * Create the directory entry for the symlink.
-	 */
-	error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
-					&first_block, &free_list, resblks);
-	if (error)
-		goto error2;
-	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-
-	/*
-	 * If this is a synchronous mount, make sure that the
-	 * symlink transaction goes to disk before returning to
-	 * the user.
-	 */
-	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
-		xfs_trans_set_sync(tp);
-	}
-
-	error = xfs_bmap_finish(&tp, &free_list, &committed);
-	if (error) {
-		goto error2;
-	}
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	xfs_qm_dqrele(udqp);
-	xfs_qm_dqrele(gdqp);
-
-	*ipp = ip;
-	return 0;
-
- error2:
-	IRELE(ip);
- error1:
-	xfs_bmap_cancel(&free_list);
-	cancel_flags |= XFS_TRANS_ABORT;
- error_return:
-	xfs_trans_cancel(tp, cancel_flags);
-	xfs_qm_dqrele(udqp);
-	xfs_qm_dqrele(gdqp);
-
-	if (unlock_dp_on_error)
-		xfs_iunlock(dp, XFS_ILOCK_EXCL);
- std_return:
-	return error;
-}
-
-int
 xfs_set_dmattrs(
 	xfs_inode_t     *ip,
 	u_int		evmask,
@@ -1927,7 +1444,7 @@ xfs_free_file_space(
 	xfs_mount_t		*mp;
 	int			nimap;
 	uint			resblks;
-	uint			rounding;
+	xfs_off_t		rounding;
 	int			rt;
 	xfs_fileoff_t		startoffset_fsb;
 	xfs_trans_t		*tp;
@@ -1956,7 +1473,7 @@ xfs_free_file_space(
 		inode_dio_wait(VFS_I(ip));
 	}
 
-	rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
 	ioffset = offset & ~(rounding - 1);
 	error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
 					      ioffset, -1);
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 5163022d980..38c67c34d73 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -31,8 +31,7 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 		struct xfs_name *target_name);
-int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
-		       xfs_off_t *offset, filldir_t filldir);
+int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
 		const char *target_path, umode_t mode, struct xfs_inode **ipp);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);