1 files changed, 220 insertions, 39 deletions
diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 718394e2c01..904777c1cd2 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -1,4 +1,4 @@
-/* 
+/*
    BlueZ - Bluetooth protocol stack for Linux
    Copyright (C) 2000-2001 Qualcomm Incorporated
 
@@ -12,30 +12,33 @@
    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
    IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
-   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES 
-   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 
-   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
-   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, 
-   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS 
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
    SOFTWARE IS DISCLAIMED.
 */
 
 #ifndef __BLUETOOTH_H
 #define __BLUETOOTH_H
 
-#include <asm/types.h>
-#include <asm/byteorder.h>
-#include <linux/list.h>
 #include <linux/poll.h>
 #include <net/sock.h>
+#include <linux/seq_file.h>
 
 #ifndef AF_BLUETOOTH
 #define AF_BLUETOOTH	31
 #define PF_BLUETOOTH	AF_BLUETOOTH
 #endif
 
+/* Bluetooth versions */
+#define BLUETOOTH_VER_1_1	1
+#define BLUETOOTH_VER_1_2	2
+#define BLUETOOTH_VER_2_0	3
+
 /* Reserv for core and drivers use */
 #define BT_SKB_RESERVE	8
 
@@ -56,17 +59,74 @@
 #define BT_SECURITY	4
 struct bt_security {
 	__u8 level;
+	__u8 key_size;
 };
 #define BT_SECURITY_SDP		0
 #define BT_SECURITY_LOW		1
 #define BT_SECURITY_MEDIUM	2
 #define BT_SECURITY_HIGH	3
+#define BT_SECURITY_FIPS	4
 
 #define BT_DEFER_SETUP	7
 
-#define BT_INFO(fmt, arg...) printk(KERN_INFO "Bluetooth: " fmt "\n" , ## arg)
-#define BT_ERR(fmt, arg...)  printk(KERN_ERR "%s: " fmt "\n" , __func__ , ## arg)
-#define BT_DBG(fmt, arg...)  pr_debug("%s: " fmt "\n" , __func__ , ## arg)
+#define BT_FLUSHABLE	8
+
+#define BT_FLUSHABLE_OFF	0
+#define BT_FLUSHABLE_ON		1
+
+#define BT_POWER	9
+struct bt_power {
+	__u8 force_active;
+};
+#define BT_POWER_FORCE_ACTIVE_OFF 0
+#define BT_POWER_FORCE_ACTIVE_ON  1
+
+#define BT_CHANNEL_POLICY	10
+
+/* BR/EDR only (default policy)
+ *   AMP controllers cannot be used.
+ *   Channel move requests from the remote device are denied.
+ *   If the L2CAP channel is currently using AMP, move the channel to BR/EDR.
+ */
+#define BT_CHANNEL_POLICY_BREDR_ONLY		0
+
+/* BR/EDR Preferred
+ *   Allow use of AMP controllers.
+ *   If the L2CAP channel is currently on AMP, move it to BR/EDR.
+ *   Channel move requests from the remote device are allowed.
+ */
+#define BT_CHANNEL_POLICY_BREDR_PREFERRED	1
+
+/* AMP Preferred
+ *   Allow use of AMP controllers
+ *   If the L2CAP channel is currently on BR/EDR and AMP controller
+ *     resources are available, initiate a channel move to AMP.
+ *   Channel move requests from the remote device are allowed.
+ *   If the L2CAP socket has not been connected yet, try to create
+ *     and configure the channel directly on an AMP controller rather
+ *     than BR/EDR.
+ */
+#define BT_CHANNEL_POLICY_AMP_PREFERRED		2
+
+#define BT_VOICE		11
+struct bt_voice {
+	__u16 setting;
+};
+
+#define BT_VOICE_TRANSPARENT			0x0003
+#define BT_VOICE_CVSD_16BIT			0x0060
+
+#define BT_SNDMTU		12
+#define BT_RCVMTU		13
+
+__printf(1, 2)
+int bt_info(const char *fmt, ...);
+__printf(1, 2)
+int bt_err(const char *fmt, ...);
+
+#define BT_INFO(fmt, ...)	bt_info(fmt "\n", ##__VA_ARGS__)
+#define BT_ERR(fmt, ...)	bt_err(fmt "\n", ##__VA_ARGS__)
+#define BT_DBG(fmt, ...)	pr_debug(fmt "\n", ##__VA_ARGS__)
 
 /* Connection and socket states */
 enum {
@@ -81,27 +141,80 @@ enum {
 	BT_CLOSED
 };
 
+/* If unused will be removed by compiler */
+static inline const char *state_to_string(int state)
+{
+	switch (state) {
+	case BT_CONNECTED:
+		return "BT_CONNECTED";
+	case BT_OPEN:
+		return "BT_OPEN";
+	case BT_BOUND:
+		return "BT_BOUND";
+	case BT_LISTEN:
+		return "BT_LISTEN";
+	case BT_CONNECT:
+		return "BT_CONNECT";
+	case BT_CONNECT2:
+		return "BT_CONNECT2";
+	case BT_CONFIG:
+		return "BT_CONFIG";
+	case BT_DISCONN:
+		return "BT_DISCONN";
+	case BT_CLOSED:
+		return "BT_CLOSED";
+	}
+
+	return "invalid state";
+}
+
 /* BD Address */
 typedef struct {
 	__u8 b[6];
-} __attribute__((packed)) bdaddr_t;
+} __packed bdaddr_t;
+
+/* BD Address type */
+#define BDADDR_BREDR		0x00
+#define BDADDR_LE_PUBLIC	0x01
+#define BDADDR_LE_RANDOM	0x02
+
+static inline bool bdaddr_type_is_valid(__u8 type)
+{
+	switch (type) {
+	case BDADDR_BREDR:
+	case BDADDR_LE_PUBLIC:
+	case BDADDR_LE_RANDOM:
+		return true;
+	}
+
+	return false;
+}
+
+static inline bool bdaddr_type_is_le(__u8 type)
+{
+	switch (type) {
+	case BDADDR_LE_PUBLIC:
+	case BDADDR_LE_RANDOM:
+		return true;
+	}
+
+	return false;
+}
 
-#define BDADDR_ANY   (&(bdaddr_t) {{0, 0, 0, 0, 0, 0}})
-#define BDADDR_LOCAL (&(bdaddr_t) {{0, 0, 0, 0xff, 0xff, 0xff}})
+#define BDADDR_ANY  (&(bdaddr_t) {{0, 0, 0, 0, 0, 0}})
+#define BDADDR_NONE (&(bdaddr_t) {{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}})
 
 /* Copy, swap, convert BD Address */
-static inline int bacmp(bdaddr_t *ba1, bdaddr_t *ba2)
+static inline int bacmp(const bdaddr_t *ba1, const bdaddr_t *ba2)
 {
 	return memcmp(ba1, ba2, sizeof(bdaddr_t));
 }
-static inline void bacpy(bdaddr_t *dst, bdaddr_t *src)
+static inline void bacpy(bdaddr_t *dst, const bdaddr_t *src)
 {
 	memcpy(dst, src, sizeof(bdaddr_t));
 }
 
 void baswap(bdaddr_t *dst, bdaddr_t *src);
-char *batostr(bdaddr_t *ba);
-bdaddr_t *strtoba(char *str);
 
 /* Common socket structures and functions */
 
@@ -109,38 +222,75 @@ bdaddr_t *strtoba(char *str);
 
 struct bt_sock {
 	struct sock sk;
-	bdaddr_t    src;
-	bdaddr_t    dst;
 	struct list_head accept_q;
 	struct sock *parent;
-	u32 defer_setup;
+	unsigned long flags;
+	void (*skb_msg_name)(struct sk_buff *, void *, int *);
+};
+
+enum {
+	BT_SK_DEFER_SETUP,
+	BT_SK_SUSPEND,
 };
 
 struct bt_sock_list {
 	struct hlist_head head;
 	rwlock_t          lock;
+#ifdef CONFIG_PROC_FS
+        int (* custom_seq_show)(struct seq_file *, void *);
+#endif
 };
 
-int  bt_sock_register(int proto, struct net_proto_family *ops);
-int  bt_sock_unregister(int proto);
+int  bt_sock_register(int proto, const struct net_proto_family *ops);
+void bt_sock_unregister(int proto);
 void bt_sock_link(struct bt_sock_list *l, struct sock *s);
 void bt_sock_unlink(struct bt_sock_list *l, struct sock *s);
-int  bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len, int flags);
-uint bt_sock_poll(struct file * file, struct socket *sock, poll_table *wait);
+int  bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
+				struct msghdr *msg, size_t len, int flags);
+int  bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
+			struct msghdr *msg, size_t len, int flags);
+uint bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait);
 int  bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
 int  bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo);
+int  bt_sock_wait_ready(struct sock *sk, unsigned long flags);
 
 void bt_accept_enqueue(struct sock *parent, struct sock *sk);
 void bt_accept_unlink(struct sock *sk);
 struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock);
 
 /* Skb helpers */
+struct l2cap_ctrl {
+	unsigned int	sframe:1,
+			poll:1,
+			final:1,
+			fcs:1,
+			sar:2,
+			super:2;
+	__u16		reqseq;
+	__u16		txseq;
+	__u8		retries;
+};
+
+struct hci_dev;
+
+typedef void (*hci_req_complete_t)(struct hci_dev *hdev, u8 status);
+
+struct hci_req_ctrl {
+	bool			start;
+	u8			event;
+	hci_req_complete_t	complete;
+};
+
 struct bt_skb_cb {
 	__u8 pkt_type;
 	__u8 incoming;
-	__u8 tx_seq;
-	__u8 retries;
-	__u8 sar;
+	__u16 expect;
+	__u8 force_active;
+	struct l2cap_chan *chan;
+	struct l2cap_ctrl control;
+	struct hci_req_ctrl req;
+	bdaddr_t bdaddr;
+	__le16 psm;
 };
 #define bt_cb(skb) ((struct bt_skb_cb *)((skb)->cb))
 
@@ -148,34 +298,65 @@ static inline struct sk_buff *bt_skb_alloc(unsigned int len, gfp_t how)
 {
 	struct sk_buff *skb;
 
-	if ((skb = alloc_skb(len + BT_SKB_RESERVE, how))) {
+	skb = alloc_skb(len + BT_SKB_RESERVE, how);
+	if (skb) {
 		skb_reserve(skb, BT_SKB_RESERVE);
 		bt_cb(skb)->incoming  = 0;
 	}
 	return skb;
 }
 
-static inline struct sk_buff *bt_skb_send_alloc(struct sock *sk, unsigned long len, 
-							int nb, int *err)
+static inline struct sk_buff *bt_skb_send_alloc(struct sock *sk,
+					unsigned long len, int nb, int *err)
 {
 	struct sk_buff *skb;
 
-	if ((skb = sock_alloc_send_skb(sk, len + BT_SKB_RESERVE, nb, err))) {
+	skb = sock_alloc_send_skb(sk, len + BT_SKB_RESERVE, nb, err);
+	if (skb) {
 		skb_reserve(skb, BT_SKB_RESERVE);
 		bt_cb(skb)->incoming  = 0;
 	}
 
+	if (!skb && *err)
+		return NULL;
+
+	*err = sock_error(sk);
+	if (*err)
+		goto out;
+
+	if (sk->sk_shutdown) {
+		*err = -ECONNRESET;
+		goto out;
+	}
+
 	return skb;
+
+out:
+	kfree_skb(skb);
+	return NULL;
 }
 
-int bt_err(__u16 code);
+int bt_to_errno(__u16 code);
+
+int hci_sock_init(void);
+void hci_sock_cleanup(void);
+
+int bt_sysfs_init(void);
+void bt_sysfs_cleanup(void);
+
+int bt_procfs_init(struct net *net, const char *name,
+		   struct bt_sock_list *sk_list,
+		   int (*seq_show)(struct seq_file *, void *));
+void bt_procfs_cleanup(struct net *net, const char *name);
+
+extern struct dentry *bt_debugfs;
 
-extern int hci_sock_init(void);
-extern void hci_sock_cleanup(void);
+int l2cap_init(void);
+void l2cap_exit(void);
 
-extern int bt_sysfs_init(void);
-extern void bt_sysfs_cleanup(void);
+int sco_init(void);
+void sco_exit(void);
 
-extern struct class *bt_class;
+void bt_sock_reclassify_lock(struct sock *sk, int proto);
 
 #endif /* __BLUETOOTH_H */
diff --git a/fs/xfs/Kbuild b/fs/xfs/Kbuild
deleted file mode 100644
index 2566e96706f..00000000000
--- a/fs/xfs/Kbuild
+++ /dev/null
@@ -1,6 +0,0 @@
-#
-# The xfs people like to share Makefile with 2.6 and 2.4.
-# Utilise file named Kbuild file which has precedence over Makefile.
-#
-
-include $(srctree)/$(obj)/Makefile-linux-2.6
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index bac27d66151..399e8cec6e6 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,6 +1,8 @@
 config XFS_FS
 	tristate "XFS filesystem support"
-	select EXPORTFS if NFSD!=n
+	depends on BLOCK
+	select EXPORTFS
+	select LIBCRC32C
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform.  It is completely multi-threaded, can
@@ -18,14 +20,10 @@ config XFS_FS
 	  system of your root partition is compiled as a module, you'll need
 	  to use an initial ramdisk (initrd) to boot.
 
-config XFS_EXPORT
-	bool
-	depends on XFS_FS && EXPORTFS
-	default y
-
 config XFS_QUOTA
 	bool "XFS Quota support"
 	depends on XFS_FS
+	select QUOTACTL
 	help
 	  If you say Y here, you will be able to set limits for disk usage on
 	  a per user and/or a per group basis under XFS.  XFS considers quota
@@ -40,21 +38,10 @@ config XFS_QUOTA
 	  with or without the generic quota support enabled (CONFIG_QUOTA) -
 	  they are completely independent subsystems.
 
-config XFS_SECURITY
-	bool "XFS Security Label support"
-	depends on XFS_FS
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute namespace for inode security
-	  labels in the XFS filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for inode security labels, say N.
-
 config XFS_POSIX_ACL
 	bool "XFS POSIX ACL support"
 	depends on XFS_FS
+	select FS_POSIX_ACL
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
 	  groups beyond the owner/group/world scheme.
@@ -65,18 +52,45 @@ config XFS_POSIX_ACL
 	  If you don't know what Access Control Lists are, say N.
 
 config XFS_RT
-	bool "XFS Realtime support (EXPERIMENTAL)"
-	depends on XFS_FS && EXPERIMENTAL
+	bool "XFS Realtime subvolume support"
+	depends on XFS_FS
 	help
 	  If you say Y here you will be able to mount and use XFS filesystems
-	  which contain a realtime subvolume. The realtime subvolume is a
-	  separate area of disk space where only file data is stored. The
-	  realtime subvolume is designed to provide very deterministic
-	  data rates suitable for media streaming applications.
+	  which contain a realtime subvolume.  The realtime subvolume is a
+	  separate area of disk space where only file data is stored.  It was
+	  originally designed to provide deterministic data rates suitable
+	  for media streaming applications, but is also useful as a generic
+	  mechanism for ensuring data and metadata/log I/Os are completely
+	  separated.  Regular file I/Os are isolated to a separate device
+	  from all other requests, and this can be done quite transparently
+	  to applications via the inherit-realtime directory inode flag.
 
-	  See the xfs man page in section 5 for a bit more information.
-
-	  This feature is unsupported at this time, is not yet fully
-	  functional, and may cause serious problems.
+	  See the xfs man page in section 5 for additional information.
 
 	  If unsure, say N.
+
+config XFS_WARN
+	bool "XFS Verbose Warnings"
+	depends on XFS_FS && !XFS_DEBUG
+	help
+	  Say Y here to get an XFS build with many additional warnings.
+	  It converts ASSERT checks to WARN, so will log any out-of-bounds
+	  conditions that occur that would otherwise be missed. It is much
+	  lighter weight than XFS_DEBUG and does not modify algorithms and will
+	  not cause the kernel to panic on non-fatal errors.
+
+	  However, similar to XFS_DEBUG, it is only advisable to use this if you
+	  are debugging a particular problem.
+
+config XFS_DEBUG
+	bool "XFS Debugging support"
+	depends on XFS_FS
+	help
+	  Say Y here to get an XFS build with many debugging features,
+	  including ASSERT checks, function wrappers around macros,
+	  and extra sanity-checking functions in various code paths.
+
+	  Note that the resulting code will be HUGE and SLOW, and probably
+	  not useful unless you are debugging a particular problem.
+
+	  Say N unless you are an XFS developer, or you play one on TV.
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 49e3e7e5e3d..c21f4350666 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -1 +1,116 @@
-include $(TOPDIR)/fs/xfs/Makefile-linux-$(VERSION).$(PATCHLEVEL)
+#
+# Copyright (c) 2000-2005 Silicon Graphics, Inc.
+# All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+
+ccflags-y += -I$(src)			# needed for trace events
+
+ccflags-$(CONFIG_XFS_DEBUG) += -g
+
+obj-$(CONFIG_XFS_FS)		+= xfs.o
+
+# this one should be compiled first, as the tracing macros can easily blow up
+xfs-y				+= xfs_trace.o
+
+# highlevel code
+xfs-y				+= xfs_aops.o \
+				   xfs_attr_inactive.o \
+				   xfs_attr_list.o \
+				   xfs_bit.o \
+				   xfs_bmap_util.o \
+				   xfs_buf.o \
+				   xfs_dir2_readdir.o \
+				   xfs_discard.o \
+				   xfs_error.o \
+				   xfs_export.o \
+				   xfs_extent_busy.o \
+				   xfs_file.o \
+				   xfs_filestream.o \
+				   xfs_fsops.o \
+				   xfs_globals.o \
+				   xfs_icache.o \
+				   xfs_ioctl.o \
+				   xfs_iomap.o \
+				   xfs_iops.o \
+				   xfs_itable.o \
+				   xfs_message.o \
+				   xfs_mount.o \
+				   xfs_mru_cache.o \
+				   xfs_super.o \
+				   xfs_symlink.o \
+				   xfs_trans.o \
+				   xfs_xattr.o \
+				   kmem.o \
+				   uuid.o
+
+# code shared with libxfs
+xfs-y				+= xfs_alloc.o \
+				   xfs_alloc_btree.o \
+				   xfs_attr.o \
+				   xfs_attr_leaf.o \
+				   xfs_attr_remote.o \
+				   xfs_bmap.o \
+				   xfs_bmap_btree.o \
+				   xfs_btree.o \
+				   xfs_da_btree.o \
+				   xfs_da_format.o \
+				   xfs_dir2.o \
+				   xfs_dir2_block.o \
+				   xfs_dir2_data.o \
+				   xfs_dir2_leaf.o \
+				   xfs_dir2_node.o \
+				   xfs_dir2_sf.o \
+				   xfs_dquot_buf.o \
+				   xfs_ialloc.o \
+				   xfs_ialloc_btree.o \
+				   xfs_icreate_item.o \
+				   xfs_inode.o \
+				   xfs_inode_fork.o \
+				   xfs_inode_buf.o \
+				   xfs_log_recover.o \
+				   xfs_log_rlimit.o \
+				   xfs_sb.o \
+				   xfs_symlink_remote.o \
+				   xfs_trans_resv.o
+
+# low-level transaction/log code
+xfs-y				+= xfs_log.o \
+				   xfs_log_cil.o \
+				   xfs_buf_item.o \
+				   xfs_extfree_item.o \
+				   xfs_inode_item.o \
+				   xfs_trans_ail.o \
+				   xfs_trans_buf.o \
+				   xfs_trans_extfree.o \
+				   xfs_trans_inode.o \
+
+# optional features
+xfs-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \
+				   xfs_dquot_item.o \
+				   xfs_trans_dquot.o \
+				   xfs_qm_syscalls.o \
+				   xfs_qm_bhv.o \
+				   xfs_qm.o \
+				   xfs_quotaops.o
+
+# xfs_rtbitmap is shared with libxfs
+xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o \
+				   xfs_rtbitmap.o
+
+xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
+xfs-$(CONFIG_PROC_FS)		+= xfs_stats.o
+xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
+xfs-$(CONFIG_COMPAT)		+= xfs_ioctl32.o
diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
deleted file mode 100644
index 97bd4743b46..00000000000
--- a/fs/xfs/Makefile-linux-2.6
+++ /dev/null
@@ -1,151 +0,0 @@
-#
-# Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms of version 2 of the GNU General Public License as
-# published by the Free Software Foundation.
-#
-# This program is distributed in the hope that it would be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# Further, this software is distributed without any warranty that it is
-# free of the rightful claim of any third person regarding infringement
-# or the like.  Any license provided herein, whether implied or
-# otherwise, applies only to this software file.  Patent licenses, if
-# any, provided herein do not apply to combinations of this program with
-# other software, or any other product whatsoever.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program; if not, write the Free Software Foundation, Inc., 59
-# Temple Place - Suite 330, Boston MA 02111-1307, USA.
-#
-# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
-# Mountain View, CA  94043, or:
-#
-# http://www.sgi.com
-#
-# For further information regarding this notice, see:
-#
-# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
-#
-
-EXTRA_CFLAGS +=	 -Ifs/xfs -Ifs/xfs/linux-2.6 -funsigned-char
-
-XFS_LINUX := linux-2.6
-
-ifeq ($(CONFIG_XFS_DEBUG),y)
-	EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG
-	EXTRA_CFLAGS += -DPAGEBUF_LOCK_TRACKING
-endif
-ifeq ($(CONFIG_XFS_TRACE),y)
-	EXTRA_CFLAGS += -DXFS_ALLOC_TRACE
-	EXTRA_CFLAGS += -DXFS_ATTR_TRACE
-	EXTRA_CFLAGS += -DXFS_BLI_TRACE
-	EXTRA_CFLAGS += -DXFS_BMAP_TRACE
-	EXTRA_CFLAGS += -DXFS_BMBT_TRACE
-	EXTRA_CFLAGS += -DXFS_DIR_TRACE
-	EXTRA_CFLAGS += -DXFS_DIR2_TRACE
-	EXTRA_CFLAGS += -DXFS_DQUOT_TRACE
-	EXTRA_CFLAGS += -DXFS_ILOCK_TRACE
-	EXTRA_CFLAGS += -DXFS_LOG_TRACE
-	EXTRA_CFLAGS += -DXFS_RW_TRACE
-	EXTRA_CFLAGS += -DPAGEBUF_TRACE
-	EXTRA_CFLAGS += -DXFS_VNODE_TRACE
-endif
-
-obj-$(CONFIG_XFS_FS)		+= xfs.o
-
-xfs-$(CONFIG_XFS_QUOTA)		+= $(addprefix quota/, \
-				   xfs_dquot.o \
-				   xfs_dquot_item.o \
-				   xfs_trans_dquot.o \
-				   xfs_qm_syscalls.o \
-				   xfs_qm_bhv.o \
-				   xfs_qm.o)
-
-ifeq ($(CONFIG_XFS_QUOTA),y)
-xfs-$(CONFIG_PROC_FS)		+= quota/xfs_qm_stats.o
-endif
-
-xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
-xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
-xfs-$(CONFIG_PROC_FS)		+= $(XFS_LINUX)/xfs_stats.o
-xfs-$(CONFIG_SYSCTL)		+= $(XFS_LINUX)/xfs_sysctl.o
-xfs-$(CONFIG_COMPAT)		+= $(XFS_LINUX)/xfs_ioctl32.o
-xfs-$(CONFIG_XFS_EXPORT)	+= $(XFS_LINUX)/xfs_export.o
-
-
-xfs-y				+= xfs_alloc.o \
-				   xfs_alloc_btree.o \
-				   xfs_attr.o \
-				   xfs_attr_leaf.o \
-				   xfs_behavior.o \
-				   xfs_bit.o \
-				   xfs_bmap.o \
-				   xfs_bmap_btree.o \
-				   xfs_btree.o \
-				   xfs_buf_item.o \
-				   xfs_da_btree.o \
-				   xfs_dir.o \
-				   xfs_dir2.o \
-				   xfs_dir2_block.o \
-				   xfs_dir2_data.o \
-				   xfs_dir2_leaf.o \
-				   xfs_dir2_node.o \
-				   xfs_dir2_sf.o \
-				   xfs_dir_leaf.o \
-				   xfs_error.o \
-				   xfs_extfree_item.o \
-				   xfs_fsops.o \
-				   xfs_ialloc.o \
-				   xfs_ialloc_btree.o \
-				   xfs_iget.o \
-				   xfs_inode.o \
-				   xfs_inode_item.o \
-				   xfs_iocore.o \
-				   xfs_iomap.o \
-				   xfs_itable.o \
-				   xfs_dfrag.o \
-				   xfs_log.o \
-				   xfs_log_recover.o \
-				   xfs_mount.o \
-				   xfs_rename.o \
-				   xfs_trans.o \
-				   xfs_trans_ail.o \
-				   xfs_trans_buf.o \
-				   xfs_trans_extfree.o \
-				   xfs_trans_inode.o \
-				   xfs_trans_item.o \
-				   xfs_utils.o \
-				   xfs_vfsops.o \
-				   xfs_vnodeops.o \
-				   xfs_rw.o \
-				   xfs_dmops.o \
-				   xfs_qmops.o
-
-xfs-$(CONFIG_XFS_TRACE)		+= xfs_dir2_trace.o
-
-# Objects in linux/
-xfs-y				+= $(addprefix $(XFS_LINUX)/, \
-				   kmem.o \
-				   xfs_aops.o \
-				   xfs_buf.o \
-				   xfs_file.o \
-				   xfs_fs_subr.o \
-				   xfs_globals.o \
-				   xfs_ioctl.o \
-				   xfs_iops.o \
-				   xfs_lrw.o \
-				   xfs_super.o \
-				   xfs_vfs.o \
-				   xfs_vnode.o)
-
-# Objects in support/
-xfs-y				+= $(addprefix support/, \
-				   debug.o \
-				   move.o \
-				   uuid.o)
-
-xfs-$(CONFIG_XFS_TRACE)		+= support/ktrace.o
-
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
new file mode 100644
index 00000000000..844e288b957
--- /dev/null
+++ b/fs/xfs/kmem.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include "time.h"
+#include "kmem.h"
+#include "xfs_message.h"
+
+/*
+ * Greedy allocation.  May fail and may return vmalloced memory.
+ */
+void *
+kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
+{
+	void		*ptr;
+	size_t		kmsize = maxsize;
+
+	while (!(ptr = vzalloc(kmsize))) {
+		if ((kmsize >>= 1) <= minsize)
+			kmsize = minsize;
+	}
+	if (ptr)
+		*size = kmsize;
+	return ptr;
+}
+
+void *
+kmem_alloc(size_t size, xfs_km_flags_t flags)
+{
+	int	retries = 0;
+	gfp_t	lflags = kmem_flags_convert(flags);
+	void	*ptr;
+
+	do {
+		ptr = kmalloc(size, lflags);
+		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+			return ptr;
+		if (!(++retries % 100))
+			xfs_err(NULL,
+		"possible memory allocation deadlock in %s (mode:0x%x)",
+					__func__, lflags);
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
+	} while (1);
+}
+
+void *
+kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
+{
+	unsigned noio_flag = 0;
+	void	*ptr;
+	gfp_t	lflags;
+
+	ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
+	if (ptr)
+		return ptr;
+
+	/*
+	 * __vmalloc() will allocate data pages and auxillary structures (e.g.
+	 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
+	 * here. Hence we need to tell memory reclaim that we are in such a
+	 * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+	 * the filesystem here and potentially deadlocking.
+	 */
+	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		noio_flag = memalloc_noio_save();
+
+	lflags = kmem_flags_convert(flags);
+	ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+
+	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		memalloc_noio_restore(noio_flag);
+
+	return ptr;
+}
+
+void
+kmem_free(const void *ptr)
+{
+	if (!is_vmalloc_addr(ptr)) {
+		kfree(ptr);
+	} else {
+		vfree(ptr);
+	}
+}
+
+void *
+kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
+	     xfs_km_flags_t flags)
+{
+	void	*new;
+
+	new = kmem_alloc(newsize, flags);
+	if (ptr) {
+		if (new)
+			memcpy(new, ptr,
+				((oldsize < newsize) ? oldsize : newsize));
+		kmem_free(ptr);
+	}
+	return new;
+}
+
+void *
+kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
+{
+	int	retries = 0;
+	gfp_t	lflags = kmem_flags_convert(flags);
+	void	*ptr;
+
+	do {
+		ptr = kmem_cache_alloc(zone, lflags);
+		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+			return ptr;
+		if (!(++retries % 100))
+			xfs_err(NULL,
+		"possible memory allocation deadlock in %s (mode:0x%x)",
+					__func__, lflags);
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
+	} while (1);
+}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
new file mode 100644
index 00000000000..64db0e53ede
--- /dev/null
+++ b/fs/xfs/kmem.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SUPPORT_KMEM_H__
+#define __XFS_SUPPORT_KMEM_H__
+
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+/*
+ * General memory allocation interfaces
+ */
+
+typedef unsigned __bitwise xfs_km_flags_t;
+#define KM_SLEEP	((__force xfs_km_flags_t)0x0001u)
+#define KM_NOSLEEP	((__force xfs_km_flags_t)0x0002u)
+#define KM_NOFS		((__force xfs_km_flags_t)0x0004u)
+#define KM_MAYFAIL	((__force xfs_km_flags_t)0x0008u)
+#define KM_ZERO		((__force xfs_km_flags_t)0x0010u)
+
+/*
+ * We use a special process flag to avoid recursive callbacks into
+ * the filesystem during transactions.  We will also issue our own
+ * warnings, so we explicitly skip any generic ones (silly of us).
+ */
+static inline gfp_t
+kmem_flags_convert(xfs_km_flags_t flags)
+{
+	gfp_t	lflags;
+
+	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO));
+
+	if (flags & KM_NOSLEEP) {
+		lflags = GFP_ATOMIC | __GFP_NOWARN;
+	} else {
+		lflags = GFP_KERNEL | __GFP_NOWARN;
+		if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+			lflags &= ~__GFP_FS;
+	}
+
+	if (flags & KM_ZERO)
+		lflags |= __GFP_ZERO;
+
+	return lflags;
+}
+
+extern void *kmem_alloc(size_t, xfs_km_flags_t);
+extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
+extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
+extern void  kmem_free(const void *);
+
+
+extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
+
+static inline void *
+kmem_zalloc(size_t size, xfs_km_flags_t flags)
+{
+	return kmem_alloc(size, flags | KM_ZERO);
+}
+
+/*
+ * Zone interfaces
+ */
+
+#define KM_ZONE_HWALIGN	SLAB_HWCACHE_ALIGN
+#define KM_ZONE_RECLAIM	SLAB_RECLAIM_ACCOUNT
+#define KM_ZONE_SPREAD	SLAB_MEM_SPREAD
+
+#define kmem_zone	kmem_cache
+#define kmem_zone_t	struct kmem_cache
+
+static inline kmem_zone_t *
+kmem_zone_init(int size, char *zone_name)
+{
+	return kmem_cache_create(zone_name, size, 0, 0, NULL);
+}
+
+static inline kmem_zone_t *
+kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
+		     void (*construct)(void *))
+{
+	return kmem_cache_create(zone_name, size, 0, flags, construct);
+}
+
+static inline void
+kmem_zone_free(kmem_zone_t *zone, void *ptr)
+{
+	kmem_cache_free(zone, ptr);
+}
+
+static inline void
+kmem_zone_destroy(kmem_zone_t *zone)
+{
+	if (zone)
+		kmem_cache_destroy(zone);
+}
+
+extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
+
+static inline void *
+kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
+{
+	return kmem_zone_alloc(zone, flags | KM_ZERO);
+}
+
+#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
deleted file mode 100644
index aba7fcf881a..00000000000
--- a/fs/xfs/linux-2.6/kmem.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <linux/swap.h>
-#include <linux/blkdev.h>
-#include "time.h"
-#include "kmem.h"
-
-#define MAX_VMALLOCS	6
-#define MAX_SLAB_SIZE	0x20000
-
-void *
-kmem_alloc(size_t size, unsigned int __nocast flags)
-{
-	int	retries = 0;
-	gfp_t	lflags = kmem_flags_convert(flags);
-	void	*ptr;
-
-	do {
-		if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS)
-			ptr = kmalloc(size, lflags);
-		else
-			ptr = __vmalloc(size, lflags, PAGE_KERNEL);
-		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
-			return ptr;
-		if (!(++retries % 100))
-			printk(KERN_ERR "XFS: possible memory allocation "
-					"deadlock in %s (mode:0x%x)\n",
-					__FUNCTION__, lflags);
-		blk_congestion_wait(WRITE, HZ/50);
-	} while (1);
-}
-
-void *
-kmem_zalloc(size_t size, unsigned int __nocast flags)
-{
-	void	*ptr;
-
-	ptr = kmem_alloc(size, flags);
-	if (ptr)
-		memset((char *)ptr, 0, (int)size);
-	return ptr;
-}
-
-void
-kmem_free(void *ptr, size_t size)
-{
-	if (((unsigned long)ptr < VMALLOC_START) ||
-	    ((unsigned long)ptr >= VMALLOC_END)) {
-		kfree(ptr);
-	} else {
-		vfree(ptr);
-	}
-}
-
-void *
-kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
-	     unsigned int __nocast flags)
-{
-	void	*new;
-
-	new = kmem_alloc(newsize, flags);
-	if (ptr) {
-		if (new)
-			memcpy(new, ptr,
-				((oldsize < newsize) ? oldsize : newsize));
-		kmem_free(ptr, oldsize);
-	}
-	return new;
-}
-
-void *
-kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
-{
-	int	retries = 0;
-	gfp_t	lflags = kmem_flags_convert(flags);
-	void	*ptr;
-
-	do {
-		ptr = kmem_cache_alloc(zone, lflags);
-		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
-			return ptr;
-		if (!(++retries % 100))
-			printk(KERN_ERR "XFS: possible memory allocation "
-					"deadlock in %s (mode:0x%x)\n",
-					__FUNCTION__, lflags);
-		blk_congestion_wait(WRITE, HZ/50);
-	} while (1);
-}
-
-void *
-kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
-{
-	void	*ptr;
-
-	ptr = kmem_zone_alloc(zone, flags);
-	if (ptr)
-		memset((char *)ptr, 0, kmem_cache_size(zone));
-	return ptr;
-}
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
deleted file mode 100644
index c64a29cdfff..00000000000
--- a/fs/xfs/linux-2.6/kmem.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_KMEM_H__
-#define __XFS_SUPPORT_KMEM_H__
-
-#include <linux/slab.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-
-/*
- * memory management routines
- */
-#define KM_SLEEP	0x0001u
-#define KM_NOSLEEP	0x0002u
-#define KM_NOFS		0x0004u
-#define KM_MAYFAIL	0x0008u
-
-#define	kmem_zone	kmem_cache
-#define kmem_zone_t	struct kmem_cache
-
-typedef unsigned long xfs_pflags_t;
-
-#define PFLAGS_TEST_NOIO()              (current->flags & PF_NOIO)
-#define PFLAGS_TEST_FSTRANS()           (current->flags & PF_FSTRANS)
-
-#define PFLAGS_SET_NOIO() do {		\
-	current->flags |= PF_NOIO;	\
-} while (0)
-
-#define PFLAGS_CLEAR_NOIO() do {	\
-	current->flags &= ~PF_NOIO;	\
-} while (0)
-
-/* these could be nested, so we save state */
-#define PFLAGS_SET_FSTRANS(STATEP) do {	\
-	*(STATEP) = current->flags;	\
-	current->flags |= PF_FSTRANS;	\
-} while (0)
-
-#define PFLAGS_CLEAR_FSTRANS(STATEP) do { \
-	*(STATEP) = current->flags;	\
-	current->flags &= ~PF_FSTRANS;	\
-} while (0)
-
-/* Restore the PF_FSTRANS state to what was saved in STATEP */
-#define PFLAGS_RESTORE_FSTRANS(STATEP) do {     		\
-	current->flags = ((current->flags & ~PF_FSTRANS) |	\
-			  (*(STATEP) & PF_FSTRANS));		\
-} while (0)
-
-#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \
-	*(NSTATEP) = *(OSTATEP);	\
-} while (0)
-
-static __inline gfp_t kmem_flags_convert(unsigned int __nocast flags)
-{
-	gfp_t	lflags = __GFP_NOWARN;	/* we'll report problems, if need be */
-
-#ifdef DEBUG
-	if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL))) {
-		printk(KERN_WARNING
-		    "XFS: memory allocation with wrong flags (%x)\n", flags);
-		BUG();
-	}
-#endif
-
-	if (flags & KM_NOSLEEP) {
-		lflags |= GFP_ATOMIC;
-	} else {
-		lflags |= GFP_KERNEL;
-
-		/* avoid recusive callbacks to filesystem during transactions */
-		if (PFLAGS_TEST_FSTRANS() || (flags & KM_NOFS))
-			lflags &= ~__GFP_FS;
-	}
-
-        return lflags;
-}
-
-static __inline kmem_zone_t *
-kmem_zone_init(int size, char *zone_name)
-{
-	return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL);
-}
-
-static __inline void
-kmem_zone_free(kmem_zone_t *zone, void *ptr)
-{
-	kmem_cache_free(zone, ptr);
-}
-
-static __inline void
-kmem_zone_destroy(kmem_zone_t *zone)
-{
-	if (zone && kmem_cache_destroy(zone))
-		BUG();
-}
-
-extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
-extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
-
-extern void *kmem_alloc(size_t, unsigned int __nocast);
-extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast);
-extern void *kmem_zalloc(size_t, unsigned int __nocast);
-extern void  kmem_free(void *, size_t);
-
-typedef struct shrinker *kmem_shaker_t;
-typedef int (*kmem_shake_func_t)(int, gfp_t);
-
-static __inline kmem_shaker_t
-kmem_shake_register(kmem_shake_func_t sfunc)
-{
-	return set_shrinker(DEFAULT_SEEKS, sfunc);
-}
-
-static __inline void
-kmem_shake_deregister(kmem_shaker_t shrinker)
-{
-	remove_shrinker(shrinker);
-}
-
-static __inline int
-kmem_shake_allow(gfp_t gfp_mask)
-{
-	return (gfp_mask & __GFP_WAIT);
-}
-
-#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
deleted file mode 100644
index 194a84490bd..00000000000
--- a/fs/xfs/linux-2.6/sema.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_SEMA_H__
-#define __XFS_SUPPORT_SEMA_H__
-
-#include <linux/time.h>
-#include <linux/wait.h>
-#include <asm/atomic.h>
-#include <asm/semaphore.h>
-
-/*
- * sema_t structure just maps to struct semaphore in Linux kernel.
- */
-
-typedef struct semaphore sema_t;
-
-#define init_sema(sp, val, c, d)	sema_init(sp, val)
-#define initsema(sp, val)		sema_init(sp, val)
-#define initnsema(sp, val, name)	sema_init(sp, val)
-#define psema(sp, b)			down(sp)
-#define vsema(sp)			up(sp)
-#define valusema(sp)			(atomic_read(&(sp)->count))
-#define freesema(sema)
-
-/*
- * Map cpsema (try to get the sema) to down_trylock. We need to switch
- * the return values since cpsema returns 1 (acquired) 0 (failed) and
- * down_trylock returns the reverse 0 (acquired) 1 (failed).
- */
-
-#define cpsema(sp)			(down_trylock(sp) ? 0 : 1)
-
-/*
- * Didn't do cvsema(sp). Not sure how to map this to up/down/...
- * It does a vsema if the values is < 0 other wise nothing.
- */
-
-#endif /* __XFS_SUPPORT_SEMA_H__ */
diff --git a/fs/xfs/linux-2.6/spin.h b/fs/xfs/linux-2.6/spin.h
deleted file mode 100644
index 50a6191178f..00000000000
--- a/fs/xfs/linux-2.6/spin.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_SPIN_H__
-#define __XFS_SUPPORT_SPIN_H__
-
-#include <linux/sched.h>	/* preempt needs this */
-#include <linux/spinlock.h>
-
-/*
- * Map lock_t from IRIX to Linux spinlocks.
- *
- * We do not make use of lock_t from interrupt context, so we do not
- * have to worry about disabling interrupts at all (unlike IRIX).
- */
-
-typedef spinlock_t lock_t;
-
-#define SPLDECL(s)			unsigned long s
-#ifndef DEFINE_SPINLOCK
-#define DEFINE_SPINLOCK(s)		spinlock_t s = SPIN_LOCK_UNLOCKED
-#endif
-
-#define spinlock_init(lock, name)	spin_lock_init(lock)
-#define	spinlock_destroy(lock)
-#define mutex_spinlock(lock)		({ spin_lock(lock); 0; })
-#define mutex_spinunlock(lock, s)	do { spin_unlock(lock); (void)s; } while (0)
-#define nested_spinlock(lock)		spin_lock(lock)
-#define nested_spinunlock(lock)		spin_unlock(lock)
-
-#endif /* __XFS_SUPPORT_SPIN_H__ */
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
deleted file mode 100644
index 9a8ad481b00..00000000000
--- a/fs/xfs/linux-2.6/sv.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_SV_H__
-#define __XFS_SUPPORT_SV_H__
-
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-
-/*
- * Synchronisation variables.
- *
- * (Parameters "pri", "svf" and "rts" are not implemented)
- */
-
-typedef struct sv_s {
-	wait_queue_head_t waiters;
-} sv_t;
-
-#define SV_FIFO		0x0		/* sv_t is FIFO type */
-#define SV_LIFO		0x2		/* sv_t is LIFO type */
-#define SV_PRIO		0x4		/* sv_t is PRIO type */
-#define SV_KEYED	0x6		/* sv_t is KEYED type */
-#define SV_DEFAULT      SV_FIFO
-
-
-static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
-			     unsigned long timeout)
-{
-	DECLARE_WAITQUEUE(wait, current);
-
-	add_wait_queue_exclusive(&sv->waiters, &wait);
-	__set_current_state(state);
-	spin_unlock(lock);
-
-	schedule_timeout(timeout);
-
-	remove_wait_queue(&sv->waiters, &wait);
-}
-
-#define init_sv(sv,type,name,flag) \
-	init_waitqueue_head(&(sv)->waiters)
-#define sv_init(sv,flag,name) \
-	init_waitqueue_head(&(sv)->waiters)
-#define sv_destroy(sv) \
-	/*NOTHING*/
-#define sv_wait(sv, pri, lock, s) \
-	_sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
-#define sv_wait_sig(sv, pri, lock, s)   \
-	_sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
-#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
-	_sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
-#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
-	_sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
-#define sv_signal(sv) \
-	wake_up(&(sv)->waiters)
-#define sv_broadcast(sv) \
-	wake_up_all(&(sv)->waiters)
-
-#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
deleted file mode 100644
index 9892268e300..00000000000
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ /dev/null
@@ -1,1466 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_trans.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-#include "xfs_iomap.h"
-#include <linux/mpage.h>
-#include <linux/pagevec.h>
-#include <linux/writeback.h>
-
-STATIC void xfs_count_page_state(struct page *, int *, int *, int *);
-
-#if defined(XFS_RW_TRACE)
-void
-xfs_page_trace(
-	int		tag,
-	struct inode	*inode,
-	struct page	*page,
-	int		mask)
-{
-	xfs_inode_t	*ip;
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-	loff_t		isize = i_size_read(inode);
-	loff_t		offset = page_offset(page);
-	int		delalloc = -1, unmapped = -1, unwritten = -1;
-
-	if (page_has_buffers(page))
-		xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
-
-	ip = xfs_vtoi(vp);
-	if (!ip->i_rwtrace)
-		return;
-
-	ktrace_enter(ip->i_rwtrace,
-		(void *)((unsigned long)tag),
-		(void *)ip,
-		(void *)inode,
-		(void *)page,
-		(void *)((unsigned long)mask),
-		(void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
-		(void *)((unsigned long)((isize >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(isize & 0xffffffff)),
-		(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(offset & 0xffffffff)),
-		(void *)((unsigned long)delalloc),
-		(void *)((unsigned long)unmapped),
-		(void *)((unsigned long)unwritten),
-		(void *)NULL,
-		(void *)NULL);
-}
-#else
-#define xfs_page_trace(tag, inode, page, mask)
-#endif
-
-/*
- * Schedule IO completion handling on a xfsdatad if this was
- * the final hold on this ioend.
- */
-STATIC void
-xfs_finish_ioend(
-	xfs_ioend_t		*ioend)
-{
-	if (atomic_dec_and_test(&ioend->io_remaining))
-		queue_work(xfsdatad_workqueue, &ioend->io_work);
-}
-
-/*
- * We're now finished for good with this ioend structure.
- * Update the page state via the associated buffer_heads,
- * release holds on the inode and bio, and finally free
- * up memory.  Do not use the ioend after this.
- */
-STATIC void
-xfs_destroy_ioend(
-	xfs_ioend_t		*ioend)
-{
-	struct buffer_head	*bh, *next;
-
-	for (bh = ioend->io_buffer_head; bh; bh = next) {
-		next = bh->b_private;
-		bh->b_end_io(bh, ioend->io_uptodate);
-	}
-
-	vn_iowake(ioend->io_vnode);
-	mempool_free(ioend, xfs_ioend_pool);
-}
-
-/*
- * Buffered IO write completion for delayed allocate extents.
- * TODO: Update ondisk isize now that we know the file data
- * has been flushed (i.e. the notorious "NULL file" problem).
- */
-STATIC void
-xfs_end_bio_delalloc(
-	void			*data)
-{
-	xfs_ioend_t		*ioend = data;
-
-	xfs_destroy_ioend(ioend);
-}
-
-/*
- * Buffered IO write completion for regular, written extents.
- */
-STATIC void
-xfs_end_bio_written(
-	void			*data)
-{
-	xfs_ioend_t		*ioend = data;
-
-	xfs_destroy_ioend(ioend);
-}
-
-/*
- * IO write completion for unwritten extents.
- *
- * Issue transactions to convert a buffer range from unwritten
- * to written extents.
- */
-STATIC void
-xfs_end_bio_unwritten(
-	void			*data)
-{
-	xfs_ioend_t		*ioend = data;
-	vnode_t			*vp = ioend->io_vnode;
-	xfs_off_t		offset = ioend->io_offset;
-	size_t			size = ioend->io_size;
-	int			error;
-
-	if (ioend->io_uptodate)
-		VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
-	xfs_destroy_ioend(ioend);
-}
-
-/*
- * Allocate and initialise an IO completion structure.
- * We need to track unwritten extent write completion here initially.
- * We'll need to extend this for updating the ondisk inode size later
- * (vs. incore size).
- */
-STATIC xfs_ioend_t *
-xfs_alloc_ioend(
-	struct inode		*inode,
-	unsigned int		type)
-{
-	xfs_ioend_t		*ioend;
-
-	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
-
-	/*
-	 * Set the count to 1 initially, which will prevent an I/O
-	 * completion callback from happening before we have started
-	 * all the I/O from calling the completion routine too early.
-	 */
-	atomic_set(&ioend->io_remaining, 1);
-	ioend->io_uptodate = 1; /* cleared if any I/O fails */
-	ioend->io_list = NULL;
-	ioend->io_type = type;
-	ioend->io_vnode = LINVFS_GET_VP(inode);
-	ioend->io_buffer_head = NULL;
-	ioend->io_buffer_tail = NULL;
-	atomic_inc(&ioend->io_vnode->v_iocount);
-	ioend->io_offset = 0;
-	ioend->io_size = 0;
-
-	if (type == IOMAP_UNWRITTEN)
-		INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend);
-	else if (type == IOMAP_DELAY)
-		INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc, ioend);
-	else
-		INIT_WORK(&ioend->io_work, xfs_end_bio_written, ioend);
-
-	return ioend;
-}
-
-STATIC int
-xfs_map_blocks(
-	struct inode		*inode,
-	loff_t			offset,
-	ssize_t			count,
-	xfs_iomap_t		*mapp,
-	int			flags)
-{
-	vnode_t			*vp = LINVFS_GET_VP(inode);
-	int			error, nmaps = 1;
-
-	VOP_BMAP(vp, offset, count, flags, mapp, &nmaps, error);
-	if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
-		VMODIFY(vp);
-	return -error;
-}
-
-STATIC inline int
-xfs_iomap_valid(
-	xfs_iomap_t		*iomapp,
-	loff_t			offset)
-{
-	return offset >= iomapp->iomap_offset &&
-		offset < iomapp->iomap_offset + iomapp->iomap_bsize;
-}
-
-/*
- * BIO completion handler for buffered IO.
- */
-STATIC int
-xfs_end_bio(
-	struct bio		*bio,
-	unsigned int		bytes_done,
-	int			error)
-{
-	xfs_ioend_t		*ioend = bio->bi_private;
-
-	if (bio->bi_size)
-		return 1;
-
-	ASSERT(ioend);
-	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
-
-	/* Toss bio and pass work off to an xfsdatad thread */
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-		ioend->io_uptodate = 0;
-	bio->bi_private = NULL;
-	bio->bi_end_io = NULL;
-
-	bio_put(bio);
-	xfs_finish_ioend(ioend);
-	return 0;
-}
-
-STATIC void
-xfs_submit_ioend_bio(
-	xfs_ioend_t	*ioend,
-	struct bio	*bio)
-{
-	atomic_inc(&ioend->io_remaining);
-
-	bio->bi_private = ioend;
-	bio->bi_end_io = xfs_end_bio;
-
-	submit_bio(WRITE, bio);
-	ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
-	bio_put(bio);
-}
-
-STATIC struct bio *
-xfs_alloc_ioend_bio(
-	struct buffer_head	*bh)
-{
-	struct bio		*bio;
-	int			nvecs = bio_get_nr_vecs(bh->b_bdev);
-
-	do {
-		bio = bio_alloc(GFP_NOIO, nvecs);
-		nvecs >>= 1;
-	} while (!bio);
-
-	ASSERT(bio->bi_private == NULL);
-	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-	bio->bi_bdev = bh->b_bdev;
-	bio_get(bio);
-	return bio;
-}
-
-STATIC void
-xfs_start_buffer_writeback(
-	struct buffer_head	*bh)
-{
-	ASSERT(buffer_mapped(bh));
-	ASSERT(buffer_locked(bh));
-	ASSERT(!buffer_delay(bh));
-	ASSERT(!buffer_unwritten(bh));
-
-	mark_buffer_async_write(bh);
-	set_buffer_uptodate(bh);
-	clear_buffer_dirty(bh);
-}
-
-STATIC void
-xfs_start_page_writeback(
-	struct page		*page,
-	struct writeback_control *wbc,
-	int			clear_dirty,
-	int			buffers)
-{
-	ASSERT(PageLocked(page));
-	ASSERT(!PageWriteback(page));
-	set_page_writeback(page);
-	if (clear_dirty)
-		clear_page_dirty(page);
-	unlock_page(page);
-	if (!buffers) {
-		end_page_writeback(page);
-		wbc->pages_skipped++;	/* We didn't write this page */
-	}
-}
-
-static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
-{
-	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
-}
-
-/*
- * Submit all of the bios for all of the ioends we have saved up, covering the
- * initial writepage page and also any probed pages.
- *
- * Because we may have multiple ioends spanning a page, we need to start
- * writeback on all the buffers before we submit them for I/O. If we mark the
- * buffers as we got, then we can end up with a page that only has buffers
- * marked async write and I/O complete on can occur before we mark the other
- * buffers async write.
- *
- * The end result of this is that we trip a bug in end_page_writeback() because
- * we call it twice for the one page as the code in end_buffer_async_write()
- * assumes that all buffers on the page are started at the same time.
- *
- * The fix is two passes across the ioend list - one to start writeback on the
- * bufferheads, and then the second one submit them for I/O.
- */
-STATIC void
-xfs_submit_ioend(
-	xfs_ioend_t		*ioend)
-{
-	xfs_ioend_t		*head = ioend;
-	xfs_ioend_t		*next;
-	struct buffer_head	*bh;
-	struct bio		*bio;
-	sector_t		lastblock = 0;
-
-	/* Pass 1 - start writeback */
-	do {
-		next = ioend->io_list;
-		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
-			xfs_start_buffer_writeback(bh);
-		}
-	} while ((ioend = next) != NULL);
-
-	/* Pass 2 - submit I/O */
-	ioend = head;
-	do {
-		next = ioend->io_list;
-		bio = NULL;
-
-		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
-
-			if (!bio) {
- retry:
-				bio = xfs_alloc_ioend_bio(bh);
-			} else if (bh->b_blocknr != lastblock + 1) {
-				xfs_submit_ioend_bio(ioend, bio);
-				goto retry;
-			}
-
-			if (bio_add_buffer(bio, bh) != bh->b_size) {
-				xfs_submit_ioend_bio(ioend, bio);
-				goto retry;
-			}
-
-			lastblock = bh->b_blocknr;
-		}
-		if (bio)
-			xfs_submit_ioend_bio(ioend, bio);
-		xfs_finish_ioend(ioend);
-	} while ((ioend = next) != NULL);
-}
-
-/*
- * Cancel submission of all buffer_heads so far in this endio.
- * Toss the endio too.  Only ever called for the initial page
- * in a writepage request, so only ever one page.
- */
-STATIC void
-xfs_cancel_ioend(
-	xfs_ioend_t		*ioend)
-{
-	xfs_ioend_t		*next;
-	struct buffer_head	*bh, *next_bh;
-
-	do {
-		next = ioend->io_list;
-		bh = ioend->io_buffer_head;
-		do {
-			next_bh = bh->b_private;
-			clear_buffer_async_write(bh);
-			unlock_buffer(bh);
-		} while ((bh = next_bh) != NULL);
-
-		vn_iowake(ioend->io_vnode);
-		mempool_free(ioend, xfs_ioend_pool);
-	} while ((ioend = next) != NULL);
-}
-
-/*
- * Test to see if we've been building up a completion structure for
- * earlier buffers -- if so, we try to append to this ioend if we
- * can, otherwise we finish off any current ioend and start another.
- * Return true if we've finished the given ioend.
- */
-STATIC void
-xfs_add_to_ioend(
-	struct inode		*inode,
-	struct buffer_head	*bh,
-	xfs_off_t		offset,
-	unsigned int		type,
-	xfs_ioend_t		**result,
-	int			need_ioend)
-{
-	xfs_ioend_t		*ioend = *result;
-
-	if (!ioend || need_ioend || type != ioend->io_type) {
-		xfs_ioend_t	*previous = *result;
-
-		ioend = xfs_alloc_ioend(inode, type);
-		ioend->io_offset = offset;
-		ioend->io_buffer_head = bh;
-		ioend->io_buffer_tail = bh;
-		if (previous)
-			previous->io_list = ioend;
-		*result = ioend;
-	} else {
-		ioend->io_buffer_tail->b_private = bh;
-		ioend->io_buffer_tail = bh;
-	}
-
-	bh->b_private = NULL;
-	ioend->io_size += bh->b_size;
-}
-
-STATIC void
-xfs_map_at_offset(
-	struct buffer_head	*bh,
-	loff_t			offset,
-	int			block_bits,
-	xfs_iomap_t		*iomapp)
-{
-	xfs_daddr_t		bn;
-	int			sector_shift;
-
-	ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
-	ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY));
-	ASSERT(iomapp->iomap_bn != IOMAP_DADDR_NULL);
-
-	sector_shift = block_bits - BBSHIFT;
-	bn = (iomapp->iomap_bn >> sector_shift) +
-	      ((offset - iomapp->iomap_offset) >> block_bits);
-
-	ASSERT(bn || (iomapp->iomap_flags & IOMAP_REALTIME));
-	ASSERT((bn << sector_shift) >= iomapp->iomap_bn);
-
-	lock_buffer(bh);
-	bh->b_blocknr = bn;
-	bh->b_bdev = iomapp->iomap_target->bt_bdev;
-	set_buffer_mapped(bh);
-	clear_buffer_delay(bh);
-	clear_buffer_unwritten(bh);
-}
-
-/*
- * Look for a page at index that is suitable for clustering.
- */
-STATIC unsigned int
-xfs_probe_page(
-	struct page		*page,
-	unsigned int		pg_offset,
-	int			mapped)
-{
-	int			ret = 0;
-
-	if (PageWriteback(page))
-		return 0;
-
-	if (page->mapping && PageDirty(page)) {
-		if (page_has_buffers(page)) {
-			struct buffer_head	*bh, *head;
-
-			bh = head = page_buffers(page);
-			do {
-				if (!buffer_uptodate(bh))
-					break;
-				if (mapped != buffer_mapped(bh))
-					break;
-				ret += bh->b_size;
-				if (ret >= pg_offset)
-					break;
-			} while ((bh = bh->b_this_page) != head);
-		} else
-			ret = mapped ? 0 : PAGE_CACHE_SIZE;
-	}
-
-	return ret;
-}
-
-STATIC size_t
-xfs_probe_cluster(
-	struct inode		*inode,
-	struct page		*startpage,
-	struct buffer_head	*bh,
-	struct buffer_head	*head,
-	int			mapped)
-{
-	struct pagevec		pvec;
-	pgoff_t			tindex, tlast, tloff;
-	size_t			total = 0;
-	int			done = 0, i;
-
-	/* First sum forwards in this page */
-	do {
-		if (mapped != buffer_mapped(bh))
-			return total;
-		total += bh->b_size;
-	} while ((bh = bh->b_this_page) != head);
-
-	/* if we reached the end of the page, sum forwards in following pages */
-	tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
-	tindex = startpage->index + 1;
-
-	/* Prune this back to avoid pathological behavior */
-	tloff = min(tlast, startpage->index + 64);
-
-	pagevec_init(&pvec, 0);
-	while (!done && tindex <= tloff) {
-		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
-		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
-			break;
-
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
-			size_t pg_offset, len = 0;
-
-			if (tindex == tlast) {
-				pg_offset =
-				    i_size_read(inode) & (PAGE_CACHE_SIZE - 1);
-				if (!pg_offset) {
-					done = 1;
-					break;
-				}
-			} else
-				pg_offset = PAGE_CACHE_SIZE;
-
-			if (page->index == tindex && !TestSetPageLocked(page)) {
-				len = xfs_probe_page(page, pg_offset, mapped);
-				unlock_page(page);
-			}
-
-			if (!len) {
-				done = 1;
-				break;
-			}
-
-			total += len;
-			tindex++;
-		}
-
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-
-	return total;
-}
-
-/*
- * Test if a given page is suitable for writing as part of an unwritten
- * or delayed allocate extent.
- */
-STATIC int
-xfs_is_delayed_page(
-	struct page		*page,
-	unsigned int		type)
-{
-	if (PageWriteback(page))
-		return 0;
-
-	if (page->mapping && page_has_buffers(page)) {
-		struct buffer_head	*bh, *head;
-		int			acceptable = 0;
-
-		bh = head = page_buffers(page);
-		do {
-			if (buffer_unwritten(bh))
-				acceptable = (type == IOMAP_UNWRITTEN);
-			else if (buffer_delay(bh))
-				acceptable = (type == IOMAP_DELAY);
-			else if (buffer_mapped(bh))
-				acceptable = (type == 0);
-			else
-				break;
-		} while ((bh = bh->b_this_page) != head);
-
-		if (acceptable)
-			return 1;
-	}
-
-	return 0;
-}
-
-/*
- * Allocate & map buffers for page given the extent map. Write it out.
- * except for the original page of a writepage, this is called on
- * delalloc/unwritten pages only, for the original page it is possible
- * that the page has no mapping at all.
- */
-STATIC int
-xfs_convert_page(
-	struct inode		*inode,
-	struct page		*page,
-	loff_t			tindex,
-	xfs_iomap_t		*mp,
-	xfs_ioend_t		**ioendp,
-	struct writeback_control *wbc,
-	int			startio,
-	int			all_bh)
-{
-	struct buffer_head	*bh, *head;
-	xfs_off_t		end_offset;
-	unsigned long		p_offset;
-	unsigned int		type;
-	int			bbits = inode->i_blkbits;
-	int			len, page_dirty;
-	int			count = 0, done = 0, uptodate = 1;
- 	xfs_off_t		offset = page_offset(page);
-
-	if (page->index != tindex)
-		goto fail;
-	if (TestSetPageLocked(page))
-		goto fail;
-	if (PageWriteback(page))
-		goto fail_unlock_page;
-	if (page->mapping != inode->i_mapping)
-		goto fail_unlock_page;
-	if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
-		goto fail_unlock_page;
-
-	/*
-	 * page_dirty is initially a count of buffers on the page before
-	 * EOF and is decrememted as we move each into a cleanable state.
-	 *
-	 * Derivation:
-	 *
-	 * End offset is the highest offset that this page should represent.
-	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
-	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
-	 * hence give us the correct page_dirty count. On any other page,
-	 * it will be zero and in that case we need page_dirty to be the
-	 * count of buffers on the page.
-	 */
-	end_offset = min_t(unsigned long long,
-			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
-			i_size_read(inode));
-
-	len = 1 << inode->i_blkbits;
-	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
-					PAGE_CACHE_SIZE);
-	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
-	page_dirty = p_offset / len;
-
-	bh = head = page_buffers(page);
-	do {
-		if (offset >= end_offset)
-			break;
-		if (!buffer_uptodate(bh))
-			uptodate = 0;
-		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
-			done = 1;
-			continue;
-		}
-
-		if (buffer_unwritten(bh) || buffer_delay(bh)) {
-			if (buffer_unwritten(bh))
-				type = IOMAP_UNWRITTEN;
-			else
-				type = IOMAP_DELAY;
-
-			if (!xfs_iomap_valid(mp, offset)) {
-				done = 1;
-				continue;
-			}
-
-			ASSERT(!(mp->iomap_flags & IOMAP_HOLE));
-			ASSERT(!(mp->iomap_flags & IOMAP_DELAY));
-
-			xfs_map_at_offset(bh, offset, bbits, mp);
-			if (startio) {
-				xfs_add_to_ioend(inode, bh, offset,
-						type, ioendp, done);
-			} else {
-				set_buffer_dirty(bh);
-				unlock_buffer(bh);
-				mark_buffer_dirty(bh);
-			}
-			page_dirty--;
-			count++;
-		} else {
-			type = 0;
-			if (buffer_mapped(bh) && all_bh && startio) {
-				lock_buffer(bh);
-				xfs_add_to_ioend(inode, bh, offset,
-						type, ioendp, done);
-				count++;
-				page_dirty--;
-			} else {
-				done = 1;
-			}
-		}
-	} while (offset += len, (bh = bh->b_this_page) != head);
-
-	if (uptodate && bh == head)
-		SetPageUptodate(page);
-
-	if (startio) {
-		if (count) {
-			struct backing_dev_info *bdi;
-
-			bdi = inode->i_mapping->backing_dev_info;
-			if (bdi_write_congested(bdi)) {
-				wbc->encountered_congestion = 1;
-				done = 1;
-			} else if (--wbc->nr_to_write <= 0) {
-				done = 1;
-			}
-		}
-		xfs_start_page_writeback(page, wbc, !page_dirty, count);
-	}
-
-	return done;
- fail_unlock_page:
-	unlock_page(page);
- fail:
-	return 1;
-}
-
-/*
- * Convert & write out a cluster of pages in the same extent as defined
- * by mp and following the start page.
- */
-STATIC void
-xfs_cluster_write(
-	struct inode		*inode,
-	pgoff_t			tindex,
-	xfs_iomap_t		*iomapp,
-	xfs_ioend_t		**ioendp,
-	struct writeback_control *wbc,
-	int			startio,
-	int			all_bh,
-	pgoff_t			tlast)
-{
-	struct pagevec		pvec;
-	int			done = 0, i;
-
-	pagevec_init(&pvec, 0);
-	while (!done && tindex <= tlast) {
-		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
-		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
-			break;
-
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-					iomapp, ioendp, wbc, startio, all_bh);
-			if (done)
-				break;
-		}
-
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-}
-
-/*
- * Calling this without startio set means we are being asked to make a dirty
- * page ready for freeing it's buffers.  When called with startio set then
- * we are coming from writepage.
- *
- * When called with startio set it is important that we write the WHOLE
- * page if possible.
- * The bh->b_state's cannot know if any of the blocks or which block for
- * that matter are dirty due to mmap writes, and therefore bh uptodate is
- * only vaild if the page itself isn't completely uptodate.  Some layers
- * may clear the page dirty flag prior to calling write page, under the
- * assumption the entire page will be written out; by not writing out the
- * whole page the page can be reused before all valid dirty data is
- * written out.  Note: in the case of a page that has been dirty'd by
- * mapwrite and but partially setup by block_prepare_write the
- * bh->b_states's will not agree and only ones setup by BPW/BCW will have
- * valid state, thus the whole page must be written out thing.
- */
-
-STATIC int
-xfs_page_state_convert(
-	struct inode	*inode,
-	struct page	*page,
-	struct writeback_control *wbc,
-	int		startio,
-	int		unmapped) /* also implies page uptodate */
-{
-	struct buffer_head	*bh, *head;
-	xfs_iomap_t		iomap;
-	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
-	loff_t			offset;
-	unsigned long           p_offset = 0;
-	unsigned int		type;
-	__uint64_t              end_offset;
-	pgoff_t                 end_index, last_index, tlast;
-	ssize_t			size, len;
-	int			flags, err, iomap_valid = 0, uptodate = 1;
-	int			page_dirty, count = 0, trylock_flag = 0;
-	int			all_bh = unmapped;
-
-	/* wait for other IO threads? */
-	if (startio && (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking))
-		trylock_flag |= BMAPI_TRYLOCK;
-
-	/* Is this page beyond the end of the file? */
-	offset = i_size_read(inode);
-	end_index = offset >> PAGE_CACHE_SHIFT;
-	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
-	if (page->index >= end_index) {
-		if ((page->index >= end_index + 1) ||
-		    !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
-			if (startio)
-				unlock_page(page);
-			return 0;
-		}
-	}
-
-	/*
-	 * page_dirty is initially a count of buffers on the page before
-	 * EOF and is decrememted as we move each into a cleanable state.
-	 *
-	 * Derivation:
-	 *
-	 * End offset is the highest offset that this page should represent.
-	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
-	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
-	 * hence give us the correct page_dirty count. On any other page,
-	 * it will be zero and in that case we need page_dirty to be the
-	 * count of buffers on the page.
- 	 */
-	end_offset = min_t(unsigned long long,
-			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
-	len = 1 << inode->i_blkbits;
-	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
-					PAGE_CACHE_SIZE);
-	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
-	page_dirty = p_offset / len;
-
-	bh = head = page_buffers(page);
-	offset = page_offset(page);
-	flags = -1;
-	type = 0;
-
-	/* TODO: cleanup count and page_dirty */
-
-	do {
-		if (offset >= end_offset)
-			break;
-		if (!buffer_uptodate(bh))
-			uptodate = 0;
-		if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) {
-			/*
-			 * the iomap is actually still valid, but the ioend
-			 * isn't.  shouldn't happen too often.
-			 */
-			iomap_valid = 0;
-			continue;
-		}
-
-		if (iomap_valid)
-			iomap_valid = xfs_iomap_valid(&iomap, offset);
-
-		/*
-		 * First case, map an unwritten extent and prepare for
-		 * extent state conversion transaction on completion.
-		 *
-		 * Second case, allocate space for a delalloc buffer.
-		 * We can return EAGAIN here in the release page case.
-		 *
-		 * Third case, an unmapped buffer was found, and we are
-		 * in a path where we need to write the whole page out.
- 		 */
-		if (buffer_unwritten(bh) || buffer_delay(bh) ||
-		    ((buffer_uptodate(bh) || PageUptodate(page)) &&
-		     !buffer_mapped(bh) && (unmapped || startio))) {
-		     	/*
-			 * Make sure we don't use a read-only iomap
-			 */
-		     	if (flags == BMAPI_READ)
-				iomap_valid = 0;
-
-			if (buffer_unwritten(bh)) {
-				type = IOMAP_UNWRITTEN;
-				flags = BMAPI_WRITE|BMAPI_IGNSTATE;
-			} else if (buffer_delay(bh)) {
-				type = IOMAP_DELAY;
-				flags = BMAPI_ALLOCATE;
-				if (!startio)
-					flags |= trylock_flag;
-			} else {
-				type = IOMAP_NEW;
-				flags = BMAPI_WRITE|BMAPI_MMAP;
-			}
-
-			if (!iomap_valid) {
-				if (type == IOMAP_NEW) {
-					size = xfs_probe_cluster(inode,
-							page, bh, head, 0);
-				} else {
-					size = len;
-				}
-
-				err = xfs_map_blocks(inode, offset, size,
-						&iomap, flags);
-				if (err)
-					goto error;
-				iomap_valid = xfs_iomap_valid(&iomap, offset);
-			}
-			if (iomap_valid) {
-				xfs_map_at_offset(bh, offset,
-						inode->i_blkbits, &iomap);
-				if (startio) {
-					xfs_add_to_ioend(inode, bh, offset,
-							type, &ioend,
-							!iomap_valid);
-				} else {
-					set_buffer_dirty(bh);
-					unlock_buffer(bh);
-					mark_buffer_dirty(bh);
-				}
-				page_dirty--;
-				count++;
-			}
-		} else if (buffer_uptodate(bh) && startio) {
-			/*
-			 * we got here because the buffer is already mapped.
-			 * That means it must already have extents allocated
-			 * underneath it. Map the extent by reading it.
-			 */
-			if (!iomap_valid || type != 0) {
-				flags = BMAPI_READ;
-				size = xfs_probe_cluster(inode, page, bh,
-								head, 1);
-				err = xfs_map_blocks(inode, offset, size,
-						&iomap, flags);
-				if (err)
-					goto error;
-				iomap_valid = xfs_iomap_valid(&iomap, offset);
-			}
-
-			type = 0;
-			if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
-				ASSERT(buffer_mapped(bh));
-				if (iomap_valid)
-					all_bh = 1;
-				xfs_add_to_ioend(inode, bh, offset, type,
-						&ioend, !iomap_valid);
-				page_dirty--;
-				count++;
-			} else {
-				iomap_valid = 0;
-			}
-		} else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
-			   (unmapped || startio)) {
-			iomap_valid = 0;
-		}
-
-		if (!iohead)
-			iohead = ioend;
-
-	} while (offset += len, ((bh = bh->b_this_page) != head));
-
-	if (uptodate && bh == head)
-		SetPageUptodate(page);
-
-	if (startio)
-		xfs_start_page_writeback(page, wbc, 1, count);
-
-	if (ioend && iomap_valid) {
-		offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
-					PAGE_CACHE_SHIFT;
-		tlast = min_t(pgoff_t, offset, last_index);
-		xfs_cluster_write(inode, page->index + 1, &iomap, &ioend,
-					wbc, startio, all_bh, tlast);
-	}
-
-	if (iohead)
-		xfs_submit_ioend(iohead);
-
-	return page_dirty;
-
-error:
-	if (iohead)
-		xfs_cancel_ioend(iohead);
-
-	/*
-	 * If it's delalloc and we have nowhere to put it,
-	 * throw it away, unless the lower layers told
-	 * us to try again.
-	 */
-	if (err != -EAGAIN) {
-		if (!unmapped)
-			block_invalidatepage(page, 0);
-		ClearPageUptodate(page);
-	}
-	return err;
-}
-
-STATIC int
-__linvfs_get_block(
-	struct inode		*inode,
-	sector_t		iblock,
-	unsigned long		blocks,
-	struct buffer_head	*bh_result,
-	int			create,
-	int			direct,
-	bmapi_flags_t		flags)
-{
-	vnode_t			*vp = LINVFS_GET_VP(inode);
-	xfs_iomap_t		iomap;
-	xfs_off_t		offset;
-	ssize_t			size;
-	int			retpbbm = 1;
-	int			error;
-
-	offset = (xfs_off_t)iblock << inode->i_blkbits;
-	if (blocks)
-		size = (ssize_t) min_t(xfs_off_t, LONG_MAX,
-					(xfs_off_t)blocks << inode->i_blkbits);
-	else
-		size = 1 << inode->i_blkbits;
-
-	VOP_BMAP(vp, offset, size,
-		create ? flags : BMAPI_READ, &iomap, &retpbbm, error);
-	if (error)
-		return -error;
-
-	if (retpbbm == 0)
-		return 0;
-
-	if (iomap.iomap_bn != IOMAP_DADDR_NULL) {
-		xfs_daddr_t	bn;
-		xfs_off_t	delta;
-
-		/* For unwritten extents do not report a disk address on
-		 * the read case (treat as if we're reading into a hole).
-		 */
-		if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) {
-			delta = offset - iomap.iomap_offset;
-			delta >>= inode->i_blkbits;
-
-			bn = iomap.iomap_bn >> (inode->i_blkbits - BBSHIFT);
-			bn += delta;
-			BUG_ON(!bn && !(iomap.iomap_flags & IOMAP_REALTIME));
-			bh_result->b_blocknr = bn;
-			set_buffer_mapped(bh_result);
-		}
-		if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
-			if (direct)
-				bh_result->b_private = inode;
-			set_buffer_unwritten(bh_result);
-			set_buffer_delay(bh_result);
-		}
-	}
-
-	/* If this is a realtime file, data might be on a new device */
-	bh_result->b_bdev = iomap.iomap_target->bt_bdev;
-
-	/* If we previously allocated a block out beyond eof and
-	 * we are now coming back to use it then we will need to
-	 * flag it as new even if it has a disk address.
-	 */
-	if (create &&
-	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
-	     (offset >= i_size_read(inode)) || (iomap.iomap_flags & IOMAP_NEW)))
-		set_buffer_new(bh_result);
-
-	if (iomap.iomap_flags & IOMAP_DELAY) {
-		BUG_ON(direct);
-		if (create) {
-			set_buffer_uptodate(bh_result);
-			set_buffer_mapped(bh_result);
-			set_buffer_delay(bh_result);
-		}
-	}
-
-	if (blocks) {
-		ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0);
-		offset = min_t(xfs_off_t,
-				iomap.iomap_bsize - iomap.iomap_delta,
-				(xfs_off_t)blocks << inode->i_blkbits);
-		bh_result->b_size = (u32) min_t(xfs_off_t, UINT_MAX, offset);
-	}
-
-	return 0;
-}
-
-int
-linvfs_get_block(
-	struct inode		*inode,
-	sector_t		iblock,
-	struct buffer_head	*bh_result,
-	int			create)
-{
-	return __linvfs_get_block(inode, iblock, 0, bh_result,
-					create, 0, BMAPI_WRITE);
-}
-
-STATIC int
-linvfs_get_blocks_direct(
-	struct inode		*inode,
-	sector_t		iblock,
-	unsigned long		max_blocks,
-	struct buffer_head	*bh_result,
-	int			create)
-{
-	return __linvfs_get_block(inode, iblock, max_blocks, bh_result,
-					create, 1, BMAPI_WRITE|BMAPI_DIRECT);
-}
-
-STATIC void
-linvfs_end_io_direct(
-	struct kiocb	*iocb,
-	loff_t		offset,
-	ssize_t		size,
-	void		*private)
-{
-	xfs_ioend_t	*ioend = iocb->private;
-
-	/*
-	 * Non-NULL private data means we need to issue a transaction to
-	 * convert a range from unwritten to written extents.  This needs
-	 * to happen from process contect but aio+dio I/O completion
-	 * happens from irq context so we need to defer it to a workqueue.
-	 * This is not nessecary for synchronous direct I/O, but we do
-	 * it anyway to keep the code uniform and simpler.
-	 *
-	 * The core direct I/O code might be changed to always call the
-	 * completion handler in the future, in which case all this can
-	 * go away.
-	 */
-	if (private && size > 0) {
-		ioend->io_offset = offset;
-		ioend->io_size = size;
-		xfs_finish_ioend(ioend);
-	} else {
-		ASSERT(size >= 0);
-		xfs_destroy_ioend(ioend);
-	}
-
-	/*
-	 * blockdev_direct_IO can return an error even afer the I/O
-	 * completion handler was called.  Thus we need to protect
-	 * against double-freeing.
-	 */
-	iocb->private = NULL;
-}
-
-STATIC ssize_t
-linvfs_direct_IO(
-	int			rw,
-	struct kiocb		*iocb,
-	const struct iovec	*iov,
-	loff_t			offset,
-	unsigned long		nr_segs)
-{
-	struct file	*file = iocb->ki_filp;
-	struct inode	*inode = file->f_mapping->host;
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-	xfs_iomap_t	iomap;
-	int		maps = 1;
-	int		error;
-	ssize_t		ret;
-
-	VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error);
-	if (error)
-		return -error;
-
-	iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
-
-	ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
-		iomap.iomap_target->bt_bdev,
-		iov, offset, nr_segs,
-		linvfs_get_blocks_direct,
-		linvfs_end_io_direct);
-
-	if (unlikely(ret <= 0 && iocb->private))
-		xfs_destroy_ioend(iocb->private);
-	return ret;
-}
-
-
-STATIC sector_t
-linvfs_bmap(
-	struct address_space	*mapping,
-	sector_t		block)
-{
-	struct inode		*inode = (struct inode *)mapping->host;
-	vnode_t			*vp = LINVFS_GET_VP(inode);
-	int			error;
-
-	vn_trace_entry(vp, "linvfs_bmap", (inst_t *)__return_address);
-
-	VOP_RWLOCK(vp, VRWLOCK_READ);
-	VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
-	VOP_RWUNLOCK(vp, VRWLOCK_READ);
-	return generic_block_bmap(mapping, block, linvfs_get_block);
-}
-
-STATIC int
-linvfs_readpage(
-	struct file		*unused,
-	struct page		*page)
-{
-	return mpage_readpage(page, linvfs_get_block);
-}
-
-STATIC int
-linvfs_readpages(
-	struct file		*unused,
-	struct address_space	*mapping,
-	struct list_head	*pages,
-	unsigned		nr_pages)
-{
-	return mpage_readpages(mapping, pages, nr_pages, linvfs_get_block);
-}
-
-STATIC void
-xfs_count_page_state(
-	struct page		*page,
-	int			*delalloc,
-	int			*unmapped,
-	int			*unwritten)
-{
-	struct buffer_head	*bh, *head;
-
-	*delalloc = *unmapped = *unwritten = 0;
-
-	bh = head = page_buffers(page);
-	do {
-		if (buffer_uptodate(bh) && !buffer_mapped(bh))
-			(*unmapped) = 1;
-		else if (buffer_unwritten(bh) && !buffer_delay(bh))
-			clear_buffer_unwritten(bh);
-		else if (buffer_unwritten(bh))
-			(*unwritten) = 1;
-		else if (buffer_delay(bh))
-			(*delalloc) = 1;
-	} while ((bh = bh->b_this_page) != head);
-}
-
-
-/*
- * writepage: Called from one of two places:
- *
- * 1. we are flushing a delalloc buffer head.
- *
- * 2. we are writing out a dirty page. Typically the page dirty
- *    state is cleared before we get here. In this case is it
- *    conceivable we have no buffer heads.
- *
- * For delalloc space on the page we need to allocate space and
- * flush it. For unmapped buffer heads on the page we should
- * allocate space if the page is uptodate. For any other dirty
- * buffer heads on the page we should flush them.
- *
- * If we detect that a transaction would be required to flush
- * the page, we have to check the process flags first, if we
- * are already in a transaction or disk I/O during allocations
- * is off, we need to fail the writepage and redirty the page.
- */
-
-STATIC int
-linvfs_writepage(
-	struct page		*page,
-	struct writeback_control *wbc)
-{
-	int			error;
-	int			need_trans;
-	int			delalloc, unmapped, unwritten;
-	struct inode		*inode = page->mapping->host;
-
-	xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0);
-
-	/*
-	 * We need a transaction if:
-	 *  1. There are delalloc buffers on the page
-	 *  2. The page is uptodate and we have unmapped buffers
-	 *  3. The page is uptodate and we have no buffers
-	 *  4. There are unwritten buffers on the page
-	 */
-
-	if (!page_has_buffers(page)) {
-		unmapped = 1;
-		need_trans = 1;
-	} else {
-		xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
-		if (!PageUptodate(page))
-			unmapped = 0;
-		need_trans = delalloc + unmapped + unwritten;
-	}
-
-	/*
-	 * If we need a transaction and the process flags say
-	 * we are already in a transaction, or no IO is allowed
-	 * then mark the page dirty again and leave the page
-	 * as is.
-	 */
-	if (PFLAGS_TEST_FSTRANS() && need_trans)
-		goto out_fail;
-
-	/*
-	 * Delay hooking up buffer heads until we have
-	 * made our go/no-go decision.
-	 */
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
-
-	/*
-	 * Convert delayed allocate, unwritten or unmapped space
-	 * to real space and flush out to disk.
-	 */
-	error = xfs_page_state_convert(inode, page, wbc, 1, unmapped);
-	if (error == -EAGAIN)
-		goto out_fail;
-	if (unlikely(error < 0))
-		goto out_unlock;
-
-	return 0;
-
-out_fail:
-	redirty_page_for_writepage(wbc, page);
-	unlock_page(page);
-	return 0;
-out_unlock:
-	unlock_page(page);
-	return error;
-}
-
-STATIC int
-linvfs_invalidate_page(
-	struct page		*page,
-	unsigned long		offset)
-{
-	xfs_page_trace(XFS_INVALIDPAGE_ENTER,
-			page->mapping->host, page, offset);
-	return block_invalidatepage(page, offset);
-}
-
-/*
- * Called to move a page into cleanable state - and from there
- * to be released. Possibly the page is already clean. We always
- * have buffer heads in this call.
- *
- * Returns 0 if the page is ok to release, 1 otherwise.
- *
- * Possible scenarios are:
- *
- * 1. We are being called to release a page which has been written
- *    to via regular I/O. buffer heads will be dirty and possibly
- *    delalloc. If no delalloc buffer heads in this case then we
- *    can just return zero.
- *
- * 2. We are called to release a page which has been written via
- *    mmap, all we need to do is ensure there is no delalloc
- *    state in the buffer heads, if not we can let the caller
- *    free them and we should come back later via writepage.
- */
-STATIC int
-linvfs_release_page(
-	struct page		*page,
-	gfp_t			gfp_mask)
-{
-	struct inode		*inode = page->mapping->host;
-	int			dirty, delalloc, unmapped, unwritten;
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = 1,
-	};
-
-	xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask);
-
-	xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
-	if (!delalloc && !unwritten)
-		goto free_buffers;
-
-	if (!(gfp_mask & __GFP_FS))
-		return 0;
-
-	/* If we are already inside a transaction or the thread cannot
-	 * do I/O, we cannot release this page.
-	 */
-	if (PFLAGS_TEST_FSTRANS())
-		return 0;
-
-	/*
-	 * Convert delalloc space to real space, do not flush the
-	 * data out to disk, that will be done by the caller.
-	 * Never need to allocate space here - we will always
-	 * come back to writepage in that case.
-	 */
-	dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0);
-	if (dirty == 0 && !unwritten)
-		goto free_buffers;
-	return 0;
-
-free_buffers:
-	return try_to_free_buffers(page);
-}
-
-STATIC int
-linvfs_prepare_write(
-	struct file		*file,
-	struct page		*page,
-	unsigned int		from,
-	unsigned int		to)
-{
-	return block_prepare_write(page, from, to, linvfs_get_block);
-}
-
-struct address_space_operations linvfs_aops = {
-	.readpage		= linvfs_readpage,
-	.readpages		= linvfs_readpages,
-	.writepage		= linvfs_writepage,
-	.sync_page		= block_sync_page,
-	.releasepage		= linvfs_release_page,
-	.invalidatepage		= linvfs_invalidate_page,
-	.prepare_write		= linvfs_prepare_write,
-	.commit_write		= generic_commit_write,
-	.bmap			= linvfs_bmap,
-	.direct_IO		= linvfs_direct_IO,
-	.migratepage		= buffer_migrate_page,
-};
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
deleted file mode 100644
index bfb4f2917bb..00000000000
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ /dev/null
@@ -1,1855 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include <linux/stddef.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/vmalloc.h>
-#include <linux/bio.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/workqueue.h>
-#include <linux/percpu.h>
-#include <linux/blkdev.h>
-#include <linux/hash.h>
-#include <linux/kthread.h>
-#include "xfs_linux.h"
-
-STATIC kmem_zone_t *xfs_buf_zone;
-STATIC kmem_shaker_t xfs_buf_shake;
-STATIC int xfsbufd(void *);
-STATIC int xfsbufd_wakeup(int, gfp_t);
-STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
-
-STATIC struct workqueue_struct *xfslogd_workqueue;
-struct workqueue_struct *xfsdatad_workqueue;
-
-#ifdef XFS_BUF_TRACE
-void
-xfs_buf_trace(
-	xfs_buf_t	*bp,
-	char		*id,
-	void		*data,
-	void		*ra)
-{
-	ktrace_enter(xfs_buf_trace_buf,
-		bp, id,
-		(void *)(unsigned long)bp->b_flags,
-		(void *)(unsigned long)bp->b_hold.counter,
-		(void *)(unsigned long)bp->b_sema.count.counter,
-		(void *)current,
-		data, ra,
-		(void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
-		(void *)(unsigned long)(bp->b_file_offset & 0xffffffff),
-		(void *)(unsigned long)bp->b_buffer_length,
-		NULL, NULL, NULL, NULL, NULL);
-}
-ktrace_t *xfs_buf_trace_buf;
-#define XFS_BUF_TRACE_SIZE	4096
-#define XB_TRACE(bp, id, data)	\
-	xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0))
-#else
-#define XB_TRACE(bp, id, data)	do { } while (0)
-#endif
-
-#ifdef XFS_BUF_LOCK_TRACKING
-# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
-# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
-# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
-#else
-# define XB_SET_OWNER(bp)	do { } while (0)
-# define XB_CLEAR_OWNER(bp)	do { } while (0)
-# define XB_GET_OWNER(bp)	do { } while (0)
-#endif
-
-#define xb_to_gfp(flags) \
-	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
-	  ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
-
-#define xb_to_km(flags) \
-	 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
-
-#define xfs_buf_allocate(flags) \
-	kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
-#define xfs_buf_deallocate(bp) \
-	kmem_zone_free(xfs_buf_zone, (bp));
-
-/*
- *	Page Region interfaces.
- *
- *	For pages in filesystems where the blocksize is smaller than the
- *	pagesize, we use the page->private field (long) to hold a bitmap
- * 	of uptodate regions within the page.
- *
- *	Each such region is "bytes per page / bits per long" bytes long.
- *
- *	NBPPR == number-of-bytes-per-page-region
- *	BTOPR == bytes-to-page-region (rounded up)
- *	BTOPRT == bytes-to-page-region-truncated (rounded down)
- */
-#if (BITS_PER_LONG == 32)
-#define PRSHIFT		(PAGE_CACHE_SHIFT - 5)	/* (32 == 1<<5) */
-#elif (BITS_PER_LONG == 64)
-#define PRSHIFT		(PAGE_CACHE_SHIFT - 6)	/* (64 == 1<<6) */
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-#define NBPPR		(PAGE_CACHE_SIZE/BITS_PER_LONG)
-#define BTOPR(b)	(((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
-#define BTOPRT(b)	(((unsigned int)(b) >> PRSHIFT))
-
-STATIC unsigned long
-page_region_mask(
-	size_t		offset,
-	size_t		length)
-{
-	unsigned long	mask;
-	int		first, final;
-
-	first = BTOPR(offset);
-	final = BTOPRT(offset + length - 1);
-	first = min(first, final);
-
-	mask = ~0UL;
-	mask <<= BITS_PER_LONG - (final - first);
-	mask >>= BITS_PER_LONG - (final);
-
-	ASSERT(offset + length <= PAGE_CACHE_SIZE);
-	ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
-
-	return mask;
-}
-
-STATIC inline void
-set_page_region(
-	struct page	*page,
-	size_t		offset,
-	size_t		length)
-{
-	set_page_private(page,
-		page_private(page) | page_region_mask(offset, length));
-	if (page_private(page) == ~0UL)
-		SetPageUptodate(page);
-}
-
-STATIC inline int
-test_page_region(
-	struct page	*page,
-	size_t		offset,
-	size_t		length)
-{
-	unsigned long	mask = page_region_mask(offset, length);
-
-	return (mask && (page_private(page) & mask) == mask);
-}
-
-/*
- *	Mapping of multi-page buffers into contiguous virtual space
- */
-
-typedef struct a_list {
-	void		*vm_addr;
-	struct a_list	*next;
-} a_list_t;
-
-STATIC a_list_t		*as_free_head;
-STATIC int		as_list_len;
-STATIC DEFINE_SPINLOCK(as_lock);
-
-/*
- *	Try to batch vunmaps because they are costly.
- */
-STATIC void
-free_address(
-	void		*addr)
-{
-	a_list_t	*aentry;
-
-	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
-	if (likely(aentry)) {
-		spin_lock(&as_lock);
-		aentry->next = as_free_head;
-		aentry->vm_addr = addr;
-		as_free_head = aentry;
-		as_list_len++;
-		spin_unlock(&as_lock);
-	} else {
-		vunmap(addr);
-	}
-}
-
-STATIC void
-purge_addresses(void)
-{
-	a_list_t	*aentry, *old;
-
-	if (as_free_head == NULL)
-		return;
-
-	spin_lock(&as_lock);
-	aentry = as_free_head;
-	as_free_head = NULL;
-	as_list_len = 0;
-	spin_unlock(&as_lock);
-
-	while ((old = aentry) != NULL) {
-		vunmap(aentry->vm_addr);
-		aentry = aentry->next;
-		kfree(old);
-	}
-}
-
-/*
- *	Internal xfs_buf_t object manipulation
- */
-
-STATIC void
-_xfs_buf_initialize(
-	xfs_buf_t		*bp,
-	xfs_buftarg_t		*target,
-	xfs_off_t		range_base,
-	size_t			range_length,
-	xfs_buf_flags_t		flags)
-{
-	/*
-	 * We don't want certain flags to appear in b_flags.
-	 */
-	flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);
-
-	memset(bp, 0, sizeof(xfs_buf_t));
-	atomic_set(&bp->b_hold, 1);
-	init_MUTEX_LOCKED(&bp->b_iodonesema);
-	INIT_LIST_HEAD(&bp->b_list);
-	INIT_LIST_HEAD(&bp->b_hash_list);
-	init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
-	XB_SET_OWNER(bp);
-	bp->b_target = target;
-	bp->b_file_offset = range_base;
-	/*
-	 * Set buffer_length and count_desired to the same value initially.
-	 * I/O routines should use count_desired, which will be the same in
-	 * most cases but may be reset (e.g. XFS recovery).
-	 */
-	bp->b_buffer_length = bp->b_count_desired = range_length;
-	bp->b_flags = flags;
-	bp->b_bn = XFS_BUF_DADDR_NULL;
-	atomic_set(&bp->b_pin_count, 0);
-	init_waitqueue_head(&bp->b_waiters);
-
-	XFS_STATS_INC(xb_create);
-	XB_TRACE(bp, "initialize", target);
-}
-
-/*
- *	Allocate a page array capable of holding a specified number
- *	of pages, and point the page buf at it.
- */
-STATIC int
-_xfs_buf_get_pages(
-	xfs_buf_t		*bp,
-	int			page_count,
-	xfs_buf_flags_t		flags)
-{
-	/* Make sure that we have a page list */
-	if (bp->b_pages == NULL) {
-		bp->b_offset = xfs_buf_poff(bp->b_file_offset);
-		bp->b_page_count = page_count;
-		if (page_count <= XB_PAGES) {
-			bp->b_pages = bp->b_page_array;
-		} else {
-			bp->b_pages = kmem_alloc(sizeof(struct page *) *
-					page_count, xb_to_km(flags));
-			if (bp->b_pages == NULL)
-				return -ENOMEM;
-		}
-		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
-	}
-	return 0;
-}
-
-/*
- *	Frees b_pages if it was allocated.
- */
-STATIC void
-_xfs_buf_free_pages(
-	xfs_buf_t	*bp)
-{
-	if (bp->b_pages != bp->b_page_array) {
-		kmem_free(bp->b_pages,
-			  bp->b_page_count * sizeof(struct page *));
-	}
-}
-
-/*
- *	Releases the specified buffer.
- *
- * 	The modification state of any associated pages is left unchanged.
- * 	The buffer most not be on any hash - use xfs_buf_rele instead for
- * 	hashed and refcounted buffers
- */
-void
-xfs_buf_free(
-	xfs_buf_t		*bp)
-{
-	XB_TRACE(bp, "free", 0);
-
-	ASSERT(list_empty(&bp->b_hash_list));
-
-	if (bp->b_flags & _XBF_PAGE_CACHE) {
-		uint		i;
-
-		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
-			free_address(bp->b_addr - bp->b_offset);
-
-		for (i = 0; i < bp->b_page_count; i++)
-			page_cache_release(bp->b_pages[i]);
-		_xfs_buf_free_pages(bp);
-	} else if (bp->b_flags & _XBF_KMEM_ALLOC) {
-		 /*
-		  * XXX(hch): bp->b_count_desired might be incorrect (see
-		  * xfs_buf_associate_memory for details), but fortunately
-		  * the Linux version of kmem_free ignores the len argument..
-		  */
-		kmem_free(bp->b_addr, bp->b_count_desired);
-		_xfs_buf_free_pages(bp);
-	}
-
-	xfs_buf_deallocate(bp);
-}
-
-/*
- *	Finds all pages for buffer in question and builds it's page list.
- */
-STATIC int
-_xfs_buf_lookup_pages(
-	xfs_buf_t		*bp,
-	uint			flags)
-{
-	struct address_space	*mapping = bp->b_target->bt_mapping;
-	size_t			blocksize = bp->b_target->bt_bsize;
-	size_t			size = bp->b_count_desired;
-	size_t			nbytes, offset;
-	gfp_t			gfp_mask = xb_to_gfp(flags);
-	unsigned short		page_count, i;
-	pgoff_t			first;
-	xfs_off_t		end;
-	int			error;
-
-	end = bp->b_file_offset + bp->b_buffer_length;
-	page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
-
-	error = _xfs_buf_get_pages(bp, page_count, flags);
-	if (unlikely(error))
-		return error;
-	bp->b_flags |= _XBF_PAGE_CACHE;
-
-	offset = bp->b_offset;
-	first = bp->b_file_offset >> PAGE_CACHE_SHIFT;
-
-	for (i = 0; i < bp->b_page_count; i++) {
-		struct page	*page;
-		uint		retries = 0;
-
-	      retry:
-		page = find_or_create_page(mapping, first + i, gfp_mask);
-		if (unlikely(page == NULL)) {
-			if (flags & XBF_READ_AHEAD) {
-				bp->b_page_count = i;
-				for (i = 0; i < bp->b_page_count; i++)
-					unlock_page(bp->b_pages[i]);
-				return -ENOMEM;
-			}
-
-			/*
-			 * This could deadlock.
-			 *
-			 * But until all the XFS lowlevel code is revamped to
-			 * handle buffer allocation failures we can't do much.
-			 */
-			if (!(++retries % 100))
-				printk(KERN_ERR
-					"XFS: possible memory allocation "
-					"deadlock in %s (mode:0x%x)\n",
-					__FUNCTION__, gfp_mask);
-
-			XFS_STATS_INC(xb_page_retries);
-			xfsbufd_wakeup(0, gfp_mask);
-			blk_congestion_wait(WRITE, HZ/50);
-			goto retry;
-		}
-
-		XFS_STATS_INC(xb_page_found);
-
-		nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
-		size -= nbytes;
-
-		if (!PageUptodate(page)) {
-			page_count--;
-			if (blocksize >= PAGE_CACHE_SIZE) {
-				if (flags & XBF_READ)
-					bp->b_locked = 1;
-			} else if (!PagePrivate(page)) {
-				if (test_page_region(page, offset, nbytes))
-					page_count++;
-			}
-		}
-
-		bp->b_pages[i] = page;
-		offset = 0;
-	}
-
-	if (!bp->b_locked) {
-		for (i = 0; i < bp->b_page_count; i++)
-			unlock_page(bp->b_pages[i]);
-	}
-
-	if (page_count == bp->b_page_count)
-		bp->b_flags |= XBF_DONE;
-
-	XB_TRACE(bp, "lookup_pages", (long)page_count);
-	return error;
-}
-
-/*
- *	Map buffer into kernel address-space if nessecary.
- */
-STATIC int
-_xfs_buf_map_pages(
-	xfs_buf_t		*bp,
-	uint			flags)
-{
-	/* A single page buffer is always mappable */
-	if (bp->b_page_count == 1) {
-		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
-		bp->b_flags |= XBF_MAPPED;
-	} else if (flags & XBF_MAPPED) {
-		if (as_list_len > 64)
-			purge_addresses();
-		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
-					VM_MAP, PAGE_KERNEL);
-		if (unlikely(bp->b_addr == NULL))
-			return -ENOMEM;
-		bp->b_addr += bp->b_offset;
-		bp->b_flags |= XBF_MAPPED;
-	}
-
-	return 0;
-}
-
-/*
- *	Finding and Reading Buffers
- */
-
-/*
- *	Look up, and creates if absent, a lockable buffer for
- *	a given range of an inode.  The buffer is returned
- *	locked.	 If other overlapping buffers exist, they are
- *	released before the new buffer is created and locked,
- *	which may imply that this call will block until those buffers
- *	are unlocked.  No I/O is implied by this call.
- */
-xfs_buf_t *
-_xfs_buf_find(
-	xfs_buftarg_t		*btp,	/* block device target		*/
-	xfs_off_t		ioff,	/* starting offset of range	*/
-	size_t			isize,	/* length of range		*/
-	xfs_buf_flags_t		flags,
-	xfs_buf_t		*new_bp)
-{
-	xfs_off_t		range_base;
-	size_t			range_length;
-	xfs_bufhash_t		*hash;
-	xfs_buf_t		*bp, *n;
-
-	range_base = (ioff << BBSHIFT);
-	range_length = (isize << BBSHIFT);
-
-	/* Check for IOs smaller than the sector size / not sector aligned */
-	ASSERT(!(range_length < (1 << btp->bt_sshift)));
-	ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
-
-	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
-
-	spin_lock(&hash->bh_lock);
-
-	list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-		ASSERT(btp == bp->b_target);
-		if (bp->b_file_offset == range_base &&
-		    bp->b_buffer_length == range_length) {
-			/*
-			 * If we look at something, bring it to the
-			 * front of the list for next time.
-			 */
-			atomic_inc(&bp->b_hold);
-			list_move(&bp->b_hash_list, &hash->bh_list);
-			goto found;
-		}
-	}
-
-	/* No match found */
-	if (new_bp) {
-		_xfs_buf_initialize(new_bp, btp, range_base,
-				range_length, flags);
-		new_bp->b_hash = hash;
-		list_add(&new_bp->b_hash_list, &hash->bh_list);
-	} else {
-		XFS_STATS_INC(xb_miss_locked);
-	}
-
-	spin_unlock(&hash->bh_lock);
-	return new_bp;
-
-found:
-	spin_unlock(&hash->bh_lock);
-
-	/* Attempt to get the semaphore without sleeping,
-	 * if this does not work then we need to drop the
-	 * spinlock and do a hard attempt on the semaphore.
-	 */
-	if (down_trylock(&bp->b_sema)) {
-		if (!(flags & XBF_TRYLOCK)) {
-			/* wait for buffer ownership */
-			XB_TRACE(bp, "get_lock", 0);
-			xfs_buf_lock(bp);
-			XFS_STATS_INC(xb_get_locked_waited);
-		} else {
-			/* We asked for a trylock and failed, no need
-			 * to look at file offset and length here, we
-			 * know that this buffer at least overlaps our
-			 * buffer and is locked, therefore our buffer
-			 * either does not exist, or is this buffer.
-			 */
-			xfs_buf_rele(bp);
-			XFS_STATS_INC(xb_busy_locked);
-			return NULL;
-		}
-	} else {
-		/* trylock worked */
-		XB_SET_OWNER(bp);
-	}
-
-	if (bp->b_flags & XBF_STALE) {
-		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
-		bp->b_flags &= XBF_MAPPED;
-	}
-	XB_TRACE(bp, "got_lock", 0);
-	XFS_STATS_INC(xb_get_locked);
-	return bp;
-}
-
-/*
- *	Assembles a buffer covering the specified range.
- *	Storage in memory for all portions of the buffer will be allocated,
- *	although backing storage may not be.
- */
-xfs_buf_t *
-xfs_buf_get_flags(
-	xfs_buftarg_t		*target,/* target for buffer		*/
-	xfs_off_t		ioff,	/* starting offset of range	*/
-	size_t			isize,	/* length of range		*/
-	xfs_buf_flags_t		flags)
-{
-	xfs_buf_t		*bp, *new_bp;
-	int			error = 0, i;
-
-	new_bp = xfs_buf_allocate(flags);
-	if (unlikely(!new_bp))
-		return NULL;
-
-	bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
-	if (bp == new_bp) {
-		error = _xfs_buf_lookup_pages(bp, flags);
-		if (error)
-			goto no_buffer;
-	} else {
-		xfs_buf_deallocate(new_bp);
-		if (unlikely(bp == NULL))
-			return NULL;
-	}
-
-	for (i = 0; i < bp->b_page_count; i++)
-		mark_page_accessed(bp->b_pages[i]);
-
-	if (!(bp->b_flags & XBF_MAPPED)) {
-		error = _xfs_buf_map_pages(bp, flags);
-		if (unlikely(error)) {
-			printk(KERN_WARNING "%s: failed to map pages\n",
-					__FUNCTION__);
-			goto no_buffer;
-		}
-	}
-
-	XFS_STATS_INC(xb_get);
-
-	/*
-	 * Always fill in the block number now, the mapped cases can do
-	 * their own overlay of this later.
-	 */
-	bp->b_bn = ioff;
-	bp->b_count_desired = bp->b_buffer_length;
-
-	XB_TRACE(bp, "get", (unsigned long)flags);
-	return bp;
-
- no_buffer:
-	if (flags & (XBF_LOCK | XBF_TRYLOCK))
-		xfs_buf_unlock(bp);
-	xfs_buf_rele(bp);
-	return NULL;
-}
-
-xfs_buf_t *
-xfs_buf_read_flags(
-	xfs_buftarg_t		*target,
-	xfs_off_t		ioff,
-	size_t			isize,
-	xfs_buf_flags_t		flags)
-{
-	xfs_buf_t		*bp;
-
-	flags |= XBF_READ;
-
-	bp = xfs_buf_get_flags(target, ioff, isize, flags);
-	if (bp) {
-		if (!XFS_BUF_ISDONE(bp)) {
-			XB_TRACE(bp, "read", (unsigned long)flags);
-			XFS_STATS_INC(xb_get_read);
-			xfs_buf_iostart(bp, flags);
-		} else if (flags & XBF_ASYNC) {
-			XB_TRACE(bp, "read_async", (unsigned long)flags);
-			/*
-			 * Read ahead call which is already satisfied,
-			 * drop the buffer
-			 */
-			goto no_buffer;
-		} else {
-			XB_TRACE(bp, "read_done", (unsigned long)flags);
-			/* We do not want read in the flags */
-			bp->b_flags &= ~XBF_READ;
-		}
-	}
-
-	return bp;
-
- no_buffer:
-	if (flags & (XBF_LOCK | XBF_TRYLOCK))
-		xfs_buf_unlock(bp);
-	xfs_buf_rele(bp);
-	return NULL;
-}
-
-/*
- *	If we are not low on memory then do the readahead in a deadlock
- *	safe manner.
- */
-void
-xfs_buf_readahead(
-	xfs_buftarg_t		*target,
-	xfs_off_t		ioff,
-	size_t			isize,
-	xfs_buf_flags_t		flags)
-{
-	struct backing_dev_info *bdi;
-
-	bdi = target->bt_mapping->backing_dev_info;
-	if (bdi_read_congested(bdi))
-		return;
-
-	flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
-	xfs_buf_read_flags(target, ioff, isize, flags);
-}
-
-xfs_buf_t *
-xfs_buf_get_empty(
-	size_t			len,
-	xfs_buftarg_t		*target)
-{
-	xfs_buf_t		*bp;
-
-	bp = xfs_buf_allocate(0);
-	if (bp)
-		_xfs_buf_initialize(bp, target, 0, len, 0);
-	return bp;
-}
-
-static inline struct page *
-mem_to_page(
-	void			*addr)
-{
-	if (((unsigned long)addr < VMALLOC_START) ||
-	    ((unsigned long)addr >= VMALLOC_END)) {
-		return virt_to_page(addr);
-	} else {
-		return vmalloc_to_page(addr);
-	}
-}
-
-int
-xfs_buf_associate_memory(
-	xfs_buf_t		*bp,
-	void			*mem,
-	size_t			len)
-{
-	int			rval;
-	int			i = 0;
-	size_t			ptr;
-	size_t			end, end_cur;
-	off_t			offset;
-	int			page_count;
-
-	page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
-	offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
-	if (offset && (len > PAGE_CACHE_SIZE))
-		page_count++;
-
-	/* Free any previous set of page pointers */
-	if (bp->b_pages)
-		_xfs_buf_free_pages(bp);
-
-	bp->b_pages = NULL;
-	bp->b_addr = mem;
-
-	rval = _xfs_buf_get_pages(bp, page_count, 0);
-	if (rval)
-		return rval;
-
-	bp->b_offset = offset;
-	ptr = (size_t) mem & PAGE_CACHE_MASK;
-	end = PAGE_CACHE_ALIGN((size_t) mem + len);
-	end_cur = end;
-	/* set up first page */
-	bp->b_pages[0] = mem_to_page(mem);
-
-	ptr += PAGE_CACHE_SIZE;
-	bp->b_page_count = ++i;
-	while (ptr < end) {
-		bp->b_pages[i] = mem_to_page((void *)ptr);
-		bp->b_page_count = ++i;
-		ptr += PAGE_CACHE_SIZE;
-	}
-	bp->b_locked = 0;
-
-	bp->b_count_desired = bp->b_buffer_length = len;
-	bp->b_flags |= XBF_MAPPED;
-
-	return 0;
-}
-
-xfs_buf_t *
-xfs_buf_get_noaddr(
-	size_t			len,
-	xfs_buftarg_t		*target)
-{
-	size_t			malloc_len = len;
-	xfs_buf_t		*bp;
-	void			*data;
-	int			error;
-
-	bp = xfs_buf_allocate(0);
-	if (unlikely(bp == NULL))
-		goto fail;
-	_xfs_buf_initialize(bp, target, 0, len, 0);
-
- try_again:
-	data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL);
-	if (unlikely(data == NULL))
-		goto fail_free_buf;
-
-	/* check whether alignment matches.. */
-	if ((__psunsigned_t)data !=
-	    ((__psunsigned_t)data & ~target->bt_smask)) {
-		/* .. else double the size and try again */
-		kmem_free(data, malloc_len);
-		malloc_len <<= 1;
-		goto try_again;
-	}
-
-	error = xfs_buf_associate_memory(bp, data, len);
-	if (error)
-		goto fail_free_mem;
-	bp->b_flags |= _XBF_KMEM_ALLOC;
-
-	xfs_buf_unlock(bp);
-
-	XB_TRACE(bp, "no_daddr", data);
-	return bp;
- fail_free_mem:
-	kmem_free(data, malloc_len);
- fail_free_buf:
-	xfs_buf_free(bp);
- fail:
-	return NULL;
-}
-
-/*
- *	Increment reference count on buffer, to hold the buffer concurrently
- *	with another thread which may release (free) the buffer asynchronously.
- *	Must hold the buffer already to call this function.
- */
-void
-xfs_buf_hold(
-	xfs_buf_t		*bp)
-{
-	atomic_inc(&bp->b_hold);
-	XB_TRACE(bp, "hold", 0);
-}
-
-/*
- *	Releases a hold on the specified buffer.  If the
- *	the hold count is 1, calls xfs_buf_free.
- */
-void
-xfs_buf_rele(
-	xfs_buf_t		*bp)
-{
-	xfs_bufhash_t		*hash = bp->b_hash;
-
-	XB_TRACE(bp, "rele", bp->b_relse);
-
-	if (unlikely(!hash)) {
-		ASSERT(!bp->b_relse);
-		if (atomic_dec_and_test(&bp->b_hold))
-			xfs_buf_free(bp);
-		return;
-	}
-
-	if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
-		if (bp->b_relse) {
-			atomic_inc(&bp->b_hold);
-			spin_unlock(&hash->bh_lock);
-			(*(bp->b_relse)) (bp);
-		} else if (bp->b_flags & XBF_FS_MANAGED) {
-			spin_unlock(&hash->bh_lock);
-		} else {
-			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
-			list_del_init(&bp->b_hash_list);
-			spin_unlock(&hash->bh_lock);
-			xfs_buf_free(bp);
-		}
-	} else {
-		/*
-		 * Catch reference count leaks
-		 */
-		ASSERT(atomic_read(&bp->b_hold) >= 0);
-	}
-}
-
-
-/*
- *	Mutual exclusion on buffers.  Locking model:
- *
- *	Buffers associated with inodes for which buffer locking
- *	is not enabled are not protected by semaphores, and are
- *	assumed to be exclusively owned by the caller.  There is a
- *	spinlock in the buffer, used by the caller when concurrent
- *	access is possible.
- */
-
-/*
- *	Locks a buffer object, if it is not already locked.
- *	Note that this in no way locks the underlying pages, so it is only
- *	useful for synchronizing concurrent use of buffer objects, not for
- *	synchronizing independent access to the underlying pages.
- */
-int
-xfs_buf_cond_lock(
-	xfs_buf_t		*bp)
-{
-	int			locked;
-
-	locked = down_trylock(&bp->b_sema) == 0;
-	if (locked) {
-		XB_SET_OWNER(bp);
-	}
-	XB_TRACE(bp, "cond_lock", (long)locked);
-	return locked ? 0 : -EBUSY;
-}
-
-#if defined(DEBUG) || defined(XFS_BLI_TRACE)
-int
-xfs_buf_lock_value(
-	xfs_buf_t		*bp)
-{
-	return atomic_read(&bp->b_sema.count);
-}
-#endif
-
-/*
- *	Locks a buffer object.
- *	Note that this in no way locks the underlying pages, so it is only
- *	useful for synchronizing concurrent use of buffer objects, not for
- *	synchronizing independent access to the underlying pages.
- */
-void
-xfs_buf_lock(
-	xfs_buf_t		*bp)
-{
-	XB_TRACE(bp, "lock", 0);
-	if (atomic_read(&bp->b_io_remaining))
-		blk_run_address_space(bp->b_target->bt_mapping);
-	down(&bp->b_sema);
-	XB_SET_OWNER(bp);
-	XB_TRACE(bp, "locked", 0);
-}
-
-/*
- *	Releases the lock on the buffer object.
- *	If the buffer is marked delwri but is not queued, do so before we
- *	unlock the buffer as we need to set flags correctly.  We also need to
- *	take a reference for the delwri queue because the unlocker is going to
- *	drop their's and they don't know we just queued it.
- */
-void
-xfs_buf_unlock(
-	xfs_buf_t		*bp)
-{
-	if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
-		atomic_inc(&bp->b_hold);
-		bp->b_flags |= XBF_ASYNC;
-		xfs_buf_delwri_queue(bp, 0);
-	}
-
-	XB_CLEAR_OWNER(bp);
-	up(&bp->b_sema);
-	XB_TRACE(bp, "unlock", 0);
-}
-
-
-/*
- *	Pinning Buffer Storage in Memory
- *	Ensure that no attempt to force a buffer to disk will succeed.
- */
-void
-xfs_buf_pin(
-	xfs_buf_t		*bp)
-{
-	atomic_inc(&bp->b_pin_count);
-	XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter);
-}
-
-void
-xfs_buf_unpin(
-	xfs_buf_t		*bp)
-{
-	if (atomic_dec_and_test(&bp->b_pin_count))
-		wake_up_all(&bp->b_waiters);
-	XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter);
-}
-
-int
-xfs_buf_ispin(
-	xfs_buf_t		*bp)
-{
-	return atomic_read(&bp->b_pin_count);
-}
-
-STATIC void
-xfs_buf_wait_unpin(
-	xfs_buf_t		*bp)
-{
-	DECLARE_WAITQUEUE	(wait, current);
-
-	if (atomic_read(&bp->b_pin_count) == 0)
-		return;
-
-	add_wait_queue(&bp->b_waiters, &wait);
-	for (;;) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (atomic_read(&bp->b_pin_count) == 0)
-			break;
-		if (atomic_read(&bp->b_io_remaining))
-			blk_run_address_space(bp->b_target->bt_mapping);
-		schedule();
-	}
-	remove_wait_queue(&bp->b_waiters, &wait);
-	set_current_state(TASK_RUNNING);
-}
-
-/*
- *	Buffer Utility Routines
- */
-
-STATIC void
-xfs_buf_iodone_work(
-	void			*v)
-{
-	xfs_buf_t		*bp = (xfs_buf_t *)v;
-
-	if (bp->b_iodone)
-		(*(bp->b_iodone))(bp);
-	else if (bp->b_flags & XBF_ASYNC)
-		xfs_buf_relse(bp);
-}
-
-void
-xfs_buf_ioend(
-	xfs_buf_t		*bp,
-	int			schedule)
-{
-	bp->b_flags &= ~(XBF_READ | XBF_WRITE);
-	if (bp->b_error == 0)
-		bp->b_flags |= XBF_DONE;
-
-	XB_TRACE(bp, "iodone", bp->b_iodone);
-
-	if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
-		if (schedule) {
-			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work, bp);
-			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
-		} else {
-			xfs_buf_iodone_work(bp);
-		}
-	} else {
-		up(&bp->b_iodonesema);
-	}
-}
-
-void
-xfs_buf_ioerror(
-	xfs_buf_t		*bp,
-	int			error)
-{
-	ASSERT(error >= 0 && error <= 0xffff);
-	bp->b_error = (unsigned short)error;
-	XB_TRACE(bp, "ioerror", (unsigned long)error);
-}
-
-/*
- *	Initiate I/O on a buffer, based on the flags supplied.
- *	The b_iodone routine in the buffer supplied will only be called
- *	when all of the subsidiary I/O requests, if any, have been completed.
- */
-int
-xfs_buf_iostart(
-	xfs_buf_t		*bp,
-	xfs_buf_flags_t		flags)
-{
-	int			status = 0;
-
-	XB_TRACE(bp, "iostart", (unsigned long)flags);
-
-	if (flags & XBF_DELWRI) {
-		bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
-		bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
-		xfs_buf_delwri_queue(bp, 1);
-		return status;
-	}
-
-	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
-			XBF_READ_AHEAD | _XBF_RUN_QUEUES);
-	bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \
-			XBF_READ_AHEAD | _XBF_RUN_QUEUES);
-
-	BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL);
-
-	/* For writes allow an alternate strategy routine to precede
-	 * the actual I/O request (which may not be issued at all in
-	 * a shutdown situation, for example).
-	 */
-	status = (flags & XBF_WRITE) ?
-		xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp);
-
-	/* Wait for I/O if we are not an async request.
-	 * Note: async I/O request completion will release the buffer,
-	 * and that can already be done by this point.  So using the
-	 * buffer pointer from here on, after async I/O, is invalid.
-	 */
-	if (!status && !(flags & XBF_ASYNC))
-		status = xfs_buf_iowait(bp);
-
-	return status;
-}
-
-STATIC __inline__ int
-_xfs_buf_iolocked(
-	xfs_buf_t		*bp)
-{
-	ASSERT(bp->b_flags & (XBF_READ | XBF_WRITE));
-	if (bp->b_flags & XBF_READ)
-		return bp->b_locked;
-	return 0;
-}
-
-STATIC __inline__ void
-_xfs_buf_ioend(
-	xfs_buf_t		*bp,
-	int			schedule)
-{
-	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
-		bp->b_locked = 0;
-		xfs_buf_ioend(bp, schedule);
-	}
-}
-
-STATIC int
-xfs_buf_bio_end_io(
-	struct bio		*bio,
-	unsigned int		bytes_done,
-	int			error)
-{
-	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;
-	unsigned int		blocksize = bp->b_target->bt_bsize;
-	struct bio_vec		*bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-
-	if (bio->bi_size)
-		return 1;
-
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-		bp->b_error = EIO;
-
-	do {
-		struct page	*page = bvec->bv_page;
-
-		if (unlikely(bp->b_error)) {
-			if (bp->b_flags & XBF_READ)
-				ClearPageUptodate(page);
-			SetPageError(page);
-		} else if (blocksize >= PAGE_CACHE_SIZE) {
-			SetPageUptodate(page);
-		} else if (!PagePrivate(page) &&
-				(bp->b_flags & _XBF_PAGE_CACHE)) {
-			set_page_region(page, bvec->bv_offset, bvec->bv_len);
-		}
-
-		if (--bvec >= bio->bi_io_vec)
-			prefetchw(&bvec->bv_page->flags);
-
-		if (_xfs_buf_iolocked(bp)) {
-			unlock_page(page);
-		}
-	} while (bvec >= bio->bi_io_vec);
-
-	_xfs_buf_ioend(bp, 1);
-	bio_put(bio);
-	return 0;
-}
-
-STATIC void
-_xfs_buf_ioapply(
-	xfs_buf_t		*bp)
-{
-	int			i, rw, map_i, total_nr_pages, nr_pages;
-	struct bio		*bio;
-	int			offset = bp->b_offset;
-	int			size = bp->b_count_desired;
-	sector_t		sector = bp->b_bn;
-	unsigned int		blocksize = bp->b_target->bt_bsize;
-	int			locking = _xfs_buf_iolocked(bp);
-
-	total_nr_pages = bp->b_page_count;
-	map_i = 0;
-
-	if (bp->b_flags & _XBF_RUN_QUEUES) {
-		bp->b_flags &= ~_XBF_RUN_QUEUES;
-		rw = (bp->b_flags & XBF_READ) ? READ_SYNC : WRITE_SYNC;
-	} else {
-		rw = (bp->b_flags & XBF_READ) ? READ : WRITE;
-	}
-
-	if (bp->b_flags & XBF_ORDERED) {
-		ASSERT(!(bp->b_flags & XBF_READ));
-		rw = WRITE_BARRIER;
-	}
-
-	/* Special code path for reading a sub page size buffer in --
-	 * we populate up the whole page, and hence the other metadata
-	 * in the same page.  This optimization is only valid when the
-	 * filesystem block size is not smaller than the page size.
-	 */
-	if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
-	    (bp->b_flags & XBF_READ) && locking &&
-	    (blocksize >= PAGE_CACHE_SIZE)) {
-		bio = bio_alloc(GFP_NOIO, 1);
-
-		bio->bi_bdev = bp->b_target->bt_bdev;
-		bio->bi_sector = sector - (offset >> BBSHIFT);
-		bio->bi_end_io = xfs_buf_bio_end_io;
-		bio->bi_private = bp;
-
-		bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
-		size = 0;
-
-		atomic_inc(&bp->b_io_remaining);
-
-		goto submit_io;
-	}
-
-	/* Lock down the pages which we need to for the request */
-	if (locking && (bp->b_flags & XBF_WRITE) && (bp->b_locked == 0)) {
-		for (i = 0; size; i++) {
-			int		nbytes = PAGE_CACHE_SIZE - offset;
-			struct page	*page = bp->b_pages[i];
-
-			if (nbytes > size)
-				nbytes = size;
-
-			lock_page(page);
-
-			size -= nbytes;
-			offset = 0;
-		}
-		offset = bp->b_offset;
-		size = bp->b_count_desired;
-	}
-
-next_chunk:
-	atomic_inc(&bp->b_io_remaining);
-	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
-	if (nr_pages > total_nr_pages)
-		nr_pages = total_nr_pages;
-
-	bio = bio_alloc(GFP_NOIO, nr_pages);
-	bio->bi_bdev = bp->b_target->bt_bdev;
-	bio->bi_sector = sector;
-	bio->bi_end_io = xfs_buf_bio_end_io;
-	bio->bi_private = bp;
-
-	for (; size && nr_pages; nr_pages--, map_i++) {
-		int	rbytes, nbytes = PAGE_CACHE_SIZE - offset;
-
-		if (nbytes > size)
-			nbytes = size;
-
-		rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
-		if (rbytes < nbytes)
-			break;
-
-		offset = 0;
-		sector += nbytes >> BBSHIFT;
-		size -= nbytes;
-		total_nr_pages--;
-	}
-
-submit_io:
-	if (likely(bio->bi_size)) {
-		submit_bio(rw, bio);
-		if (size)
-			goto next_chunk;
-	} else {
-		bio_put(bio);
-		xfs_buf_ioerror(bp, EIO);
-	}
-}
-
-int
-xfs_buf_iorequest(
-	xfs_buf_t		*bp)
-{
-	XB_TRACE(bp, "iorequest", 0);
-
-	if (bp->b_flags & XBF_DELWRI) {
-		xfs_buf_delwri_queue(bp, 1);
-		return 0;
-	}
-
-	if (bp->b_flags & XBF_WRITE) {
-		xfs_buf_wait_unpin(bp);
-	}
-
-	xfs_buf_hold(bp);
-
-	/* Set the count to 1 initially, this will stop an I/O
-	 * completion callout which happens before we have started
-	 * all the I/O from calling xfs_buf_ioend too early.
-	 */
-	atomic_set(&bp->b_io_remaining, 1);
-	_xfs_buf_ioapply(bp);
-	_xfs_buf_ioend(bp, 0);
-
-	xfs_buf_rele(bp);
-	return 0;
-}
-
-/*
- *	Waits for I/O to complete on the buffer supplied.
- *	It returns immediately if no I/O is pending.
- *	It returns the I/O error code, if any, or 0 if there was no error.
- */
-int
-xfs_buf_iowait(
-	xfs_buf_t		*bp)
-{
-	XB_TRACE(bp, "iowait", 0);
-	if (atomic_read(&bp->b_io_remaining))
-		blk_run_address_space(bp->b_target->bt_mapping);
-	down(&bp->b_iodonesema);
-	XB_TRACE(bp, "iowaited", (long)bp->b_error);
-	return bp->b_error;
-}
-
-xfs_caddr_t
-xfs_buf_offset(
-	xfs_buf_t		*bp,
-	size_t			offset)
-{
-	struct page		*page;
-
-	if (bp->b_flags & XBF_MAPPED)
-		return XFS_BUF_PTR(bp) + offset;
-
-	offset += bp->b_offset;
-	page = bp->b_pages[offset >> PAGE_CACHE_SHIFT];
-	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1));
-}
-
-/*
- *	Move data into or out of a buffer.
- */
-void
-xfs_buf_iomove(
-	xfs_buf_t		*bp,	/* buffer to process		*/
-	size_t			boff,	/* starting buffer offset	*/
-	size_t			bsize,	/* length to copy		*/
-	caddr_t			data,	/* data address			*/
-	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
-{
-	size_t			bend, cpoff, csize;
-	struct page		*page;
-
-	bend = boff + bsize;
-	while (boff < bend) {
-		page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
-		cpoff = xfs_buf_poff(boff + bp->b_offset);
-		csize = min_t(size_t,
-			      PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff);
-
-		ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
-
-		switch (mode) {
-		case XBRW_ZERO:
-			memset(page_address(page) + cpoff, 0, csize);
-			break;
-		case XBRW_READ:
-			memcpy(data, page_address(page) + cpoff, csize);
-			break;
-		case XBRW_WRITE:
-			memcpy(page_address(page) + cpoff, data, csize);
-		}
-
-		boff += csize;
-		data += csize;
-	}
-}
-
-/*
- *	Handling of buffer targets (buftargs).
- */
-
-/*
- *	Wait for any bufs with callbacks that have been submitted but
- *	have not yet returned... walk the hash list for the target.
- */
-void
-xfs_wait_buftarg(
-	xfs_buftarg_t	*btp)
-{
-	xfs_buf_t	*bp, *n;
-	xfs_bufhash_t	*hash;
-	uint		i;
-
-	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-		hash = &btp->bt_hash[i];
-again:
-		spin_lock(&hash->bh_lock);
-		list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-			ASSERT(btp == bp->b_target);
-			if (!(bp->b_flags & XBF_FS_MANAGED)) {
-				spin_unlock(&hash->bh_lock);
-				/*
-				 * Catch superblock reference count leaks
-				 * immediately
-				 */
-				BUG_ON(bp->b_bn == 0);
-				delay(100);
-				goto again;
-			}
-		}
-		spin_unlock(&hash->bh_lock);
-	}
-}
-
-/*
- *	Allocate buffer hash table for a given target.
- *	For devices containing metadata (i.e. not the log/realtime devices)
- *	we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
-	xfs_buftarg_t		*btp,
-	int			external)
-{
-	unsigned int		i;
-
-	btp->bt_hashshift = external ? 3 : 8;	/* 8 or 256 buckets */
-	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
-	btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
-					sizeof(xfs_bufhash_t), KM_SLEEP);
-	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-		spin_lock_init(&btp->bt_hash[i].bh_lock);
-		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
-	}
-}
-
-STATIC void
-xfs_free_bufhash(
-	xfs_buftarg_t		*btp)
-{
-	kmem_free(btp->bt_hash, (1<<btp->bt_hashshift) * sizeof(xfs_bufhash_t));
-	btp->bt_hash = NULL;
-}
-
-/*
- *	buftarg list for delwrite queue processing
- */
-STATIC LIST_HEAD(xfs_buftarg_list);
-STATIC DEFINE_SPINLOCK(xfs_buftarg_lock);
-
-STATIC void
-xfs_register_buftarg(
-	xfs_buftarg_t           *btp)
-{
-	spin_lock(&xfs_buftarg_lock);
-	list_add(&btp->bt_list, &xfs_buftarg_list);
-	spin_unlock(&xfs_buftarg_lock);
-}
-
-STATIC void
-xfs_unregister_buftarg(
-	xfs_buftarg_t           *btp)
-{
-	spin_lock(&xfs_buftarg_lock);
-	list_del(&btp->bt_list);
-	spin_unlock(&xfs_buftarg_lock);
-}
-
-void
-xfs_free_buftarg(
-	xfs_buftarg_t		*btp,
-	int			external)
-{
-	xfs_flush_buftarg(btp, 1);
-	if (external)
-		xfs_blkdev_put(btp->bt_bdev);
-	xfs_free_bufhash(btp);
-	iput(btp->bt_mapping->host);
-
-	/* Unregister the buftarg first so that we don't get a
-	 * wakeup finding a non-existent task
-	 */
-	xfs_unregister_buftarg(btp);
-	kthread_stop(btp->bt_task);
-
-	kmem_free(btp, sizeof(*btp));
-}
-
-STATIC int
-xfs_setsize_buftarg_flags(
-	xfs_buftarg_t		*btp,
-	unsigned int		blocksize,
-	unsigned int		sectorsize,
-	int			verbose)
-{
-	btp->bt_bsize = blocksize;
-	btp->bt_sshift = ffs(sectorsize) - 1;
-	btp->bt_smask = sectorsize - 1;
-
-	if (set_blocksize(btp->bt_bdev, sectorsize)) {
-		printk(KERN_WARNING
-			"XFS: Cannot set_blocksize to %u on device %s\n",
-			sectorsize, XFS_BUFTARG_NAME(btp));
-		return EINVAL;
-	}
-
-	if (verbose &&
-	    (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
-		printk(KERN_WARNING
-			"XFS: %u byte sectors in use on device %s.  "
-			"This is suboptimal; %u or greater is ideal.\n",
-			sectorsize, XFS_BUFTARG_NAME(btp),
-			(unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
-	}
-
-	return 0;
-}
-
-/*
- *	When allocating the initial buffer target we have not yet
- *	read in the superblock, so don't know what sized sectors
- *	are being used is at this early stage.  Play safe.
- */
-STATIC int
-xfs_setsize_buftarg_early(
-	xfs_buftarg_t		*btp,
-	struct block_device	*bdev)
-{
-	return xfs_setsize_buftarg_flags(btp,
-			PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
-}
-
-int
-xfs_setsize_buftarg(
-	xfs_buftarg_t		*btp,
-	unsigned int		blocksize,
-	unsigned int		sectorsize)
-{
-	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
-}
-
-STATIC int
-xfs_mapping_buftarg(
-	xfs_buftarg_t		*btp,
-	struct block_device	*bdev)
-{
-	struct backing_dev_info	*bdi;
-	struct inode		*inode;
-	struct address_space	*mapping;
-	static struct address_space_operations mapping_aops = {
-		.sync_page = block_sync_page,
-		.migratepage = fail_migrate_page,
-	};
-
-	inode = new_inode(bdev->bd_inode->i_sb);
-	if (!inode) {
-		printk(KERN_WARNING
-			"XFS: Cannot allocate mapping inode for device %s\n",
-			XFS_BUFTARG_NAME(btp));
-		return ENOMEM;
-	}
-	inode->i_mode = S_IFBLK;
-	inode->i_bdev = bdev;
-	inode->i_rdev = bdev->bd_dev;
-	bdi = blk_get_backing_dev_info(bdev);
-	if (!bdi)
-		bdi = &default_backing_dev_info;
-	mapping = &inode->i_data;
-	mapping->a_ops = &mapping_aops;
-	mapping->backing_dev_info = bdi;
-	mapping_set_gfp_mask(mapping, GFP_NOFS);
-	btp->bt_mapping = mapping;
-	return 0;
-}
-
-STATIC int
-xfs_alloc_delwrite_queue(
-	xfs_buftarg_t		*btp)
-{
-	int	error = 0;
-
-	INIT_LIST_HEAD(&btp->bt_list);
-	INIT_LIST_HEAD(&btp->bt_delwrite_queue);
-	spinlock_init(&btp->bt_delwrite_lock, "delwri_lock");
-	btp->bt_flags = 0;
-	btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd");
-	if (IS_ERR(btp->bt_task)) {
-		error = PTR_ERR(btp->bt_task);
-		goto out_error;
-	}
-	xfs_register_buftarg(btp);
-out_error:
-	return error;
-}
-
-xfs_buftarg_t *
-xfs_alloc_buftarg(
-	struct block_device	*bdev,
-	int			external)
-{
-	xfs_buftarg_t		*btp;
-
-	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
-
-	btp->bt_dev =  bdev->bd_dev;
-	btp->bt_bdev = bdev;
-	if (xfs_setsize_buftarg_early(btp, bdev))
-		goto error;
-	if (xfs_mapping_buftarg(btp, bdev))
-		goto error;
-	if (xfs_alloc_delwrite_queue(btp))
-		goto error;
-	xfs_alloc_bufhash(btp, external);
-	return btp;
-
-error:
-	kmem_free(btp, sizeof(*btp));
-	return NULL;
-}
-
-
-/*
- *	Delayed write buffer handling
- */
-STATIC void
-xfs_buf_delwri_queue(
-	xfs_buf_t		*bp,
-	int			unlock)
-{
-	struct list_head	*dwq = &bp->b_target->bt_delwrite_queue;
-	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
-
-	XB_TRACE(bp, "delwri_q", (long)unlock);
-	ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
-
-	spin_lock(dwlk);
-	/* If already in the queue, dequeue and place at tail */
-	if (!list_empty(&bp->b_list)) {
-		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-		if (unlock)
-			atomic_dec(&bp->b_hold);
-		list_del(&bp->b_list);
-	}
-
-	bp->b_flags |= _XBF_DELWRI_Q;
-	list_add_tail(&bp->b_list, dwq);
-	bp->b_queuetime = jiffies;
-	spin_unlock(dwlk);
-
-	if (unlock)
-		xfs_buf_unlock(bp);
-}
-
-void
-xfs_buf_delwri_dequeue(
-	xfs_buf_t		*bp)
-{
-	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
-	int			dequeued = 0;
-
-	spin_lock(dwlk);
-	if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
-		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-		list_del_init(&bp->b_list);
-		dequeued = 1;
-	}
-	bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
-	spin_unlock(dwlk);
-
-	if (dequeued)
-		xfs_buf_rele(bp);
-
-	XB_TRACE(bp, "delwri_dq", (long)dequeued);
-}
-
-STATIC void
-xfs_buf_runall_queues(
-	struct workqueue_struct	*queue)
-{
-	flush_workqueue(queue);
-}
-
-STATIC int
-xfsbufd_wakeup(
-	int			priority,
-	gfp_t			mask)
-{
-	xfs_buftarg_t		*btp;
-
-	spin_lock(&xfs_buftarg_lock);
-	list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
-		if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
-			continue;
-		set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
-		wake_up_process(btp->bt_task);
-	}
-	spin_unlock(&xfs_buftarg_lock);
-	return 0;
-}
-
-STATIC int
-xfsbufd(
-	void			*data)
-{
-	struct list_head	tmp;
-	unsigned long		age;
-	xfs_buftarg_t		*target = (xfs_buftarg_t *)data;
-	xfs_buf_t		*bp, *n;
-	struct list_head	*dwq = &target->bt_delwrite_queue;
-	spinlock_t		*dwlk = &target->bt_delwrite_lock;
-
-	current->flags |= PF_MEMALLOC;
-
-	INIT_LIST_HEAD(&tmp);
-	do {
-		if (unlikely(freezing(current))) {
-			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
-			refrigerator();
-		} else {
-			clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
-		}
-
-		schedule_timeout_interruptible(
-			xfs_buf_timer_centisecs * msecs_to_jiffies(10));
-
-		age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
-		spin_lock(dwlk);
-		list_for_each_entry_safe(bp, n, dwq, b_list) {
-			XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp));
-			ASSERT(bp->b_flags & XBF_DELWRI);
-
-			if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
-				if (!test_bit(XBT_FORCE_FLUSH,
-						&target->bt_flags) &&
-				    time_before(jiffies,
-						bp->b_queuetime + age)) {
-					xfs_buf_unlock(bp);
-					break;
-				}
-
-				bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
-				bp->b_flags |= XBF_WRITE;
-				list_move(&bp->b_list, &tmp);
-			}
-		}
-		spin_unlock(dwlk);
-
-		while (!list_empty(&tmp)) {
-			bp = list_entry(tmp.next, xfs_buf_t, b_list);
-			ASSERT(target == bp->b_target);
-
-			list_del_init(&bp->b_list);
-			xfs_buf_iostrategy(bp);
-
-			blk_run_address_space(target->bt_mapping);
-		}
-
-		if (as_list_len > 0)
-			purge_addresses();
-
-		clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
-	} while (!kthread_should_stop());
-
-	return 0;
-}
-
-/*
- *	Go through all incore buffers, and release buffers if they belong to
- *	the given device. This is used in filesystem error handling to
- *	preserve the consistency of its metadata.
- */
-int
-xfs_flush_buftarg(
-	xfs_buftarg_t		*target,
-	int			wait)
-{
-	struct list_head	tmp;
-	xfs_buf_t		*bp, *n;
-	int			pincount = 0;
-	struct list_head	*dwq = &target->bt_delwrite_queue;
-	spinlock_t		*dwlk = &target->bt_delwrite_lock;
-
-	xfs_buf_runall_queues(xfsdatad_workqueue);
-	xfs_buf_runall_queues(xfslogd_workqueue);
-
-	INIT_LIST_HEAD(&tmp);
-	spin_lock(dwlk);
-	list_for_each_entry_safe(bp, n, dwq, b_list) {
-		ASSERT(bp->b_target == target);
-		ASSERT(bp->b_flags & (XBF_DELWRI | _XBF_DELWRI_Q));
-		XB_TRACE(bp, "walkq2", (long)xfs_buf_ispin(bp));
-		if (xfs_buf_ispin(bp)) {
-			pincount++;
-			continue;
-		}
-
-		list_move(&bp->b_list, &tmp);
-	}
-	spin_unlock(dwlk);
-
-	/*
-	 * Dropped the delayed write list lock, now walk the temporary list
-	 */
-	list_for_each_entry_safe(bp, n, &tmp, b_list) {
-		xfs_buf_lock(bp);
-		bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
-		bp->b_flags |= XBF_WRITE;
-		if (wait)
-			bp->b_flags &= ~XBF_ASYNC;
-		else
-			list_del_init(&bp->b_list);
-
-		xfs_buf_iostrategy(bp);
-	}
-
-	/*
-	 * Remaining list items must be flushed before returning
-	 */
-	while (!list_empty(&tmp)) {
-		bp = list_entry(tmp.next, xfs_buf_t, b_list);
-
-		list_del_init(&bp->b_list);
-		xfs_iowait(bp);
-		xfs_buf_relse(bp);
-	}
-
-	if (wait)
-		blk_run_address_space(target->bt_mapping);
-
-	return pincount;
-}
-
-int __init
-xfs_buf_init(void)
-{
-	int		error = -ENOMEM;
-
-#ifdef XFS_BUF_TRACE
-	xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP);
-#endif
-
-	xfs_buf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf");
-	if (!xfs_buf_zone)
-		goto out_free_trace_buf;
-
-	xfslogd_workqueue = create_workqueue("xfslogd");
-	if (!xfslogd_workqueue)
-		goto out_free_buf_zone;
-
-	xfsdatad_workqueue = create_workqueue("xfsdatad");
-	if (!xfsdatad_workqueue)
-		goto out_destroy_xfslogd_workqueue;
-
-	xfs_buf_shake = kmem_shake_register(xfsbufd_wakeup);
-	if (!xfs_buf_shake)
-		goto out_destroy_xfsdatad_workqueue;
-
-	return 0;
-
- out_destroy_xfsdatad_workqueue:
-	destroy_workqueue(xfsdatad_workqueue);
- out_destroy_xfslogd_workqueue:
-	destroy_workqueue(xfslogd_workqueue);
- out_free_buf_zone:
-	kmem_zone_destroy(xfs_buf_zone);
- out_free_trace_buf:
-#ifdef XFS_BUF_TRACE
-	ktrace_free(xfs_buf_trace_buf);
-#endif
-	return error;
-}
-
-void
-xfs_buf_terminate(void)
-{
-	kmem_shake_deregister(xfs_buf_shake);
-	destroy_workqueue(xfsdatad_workqueue);
-	destroy_workqueue(xfslogd_workqueue);
-	kmem_zone_destroy(xfs_buf_zone);
-#ifdef XFS_BUF_TRACE
-	ktrace_free(xfs_buf_trace_buf);
-#endif
-}
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
deleted file mode 100644
index 4dd6592d5a4..00000000000
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ /dev/null
@@ -1,429 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_BUF_H__
-#define __XFS_BUF_H__
-
-#include <linux/config.h>
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <asm/system.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/uio.h>
-
-/*
- *	Base types
- */
-
-#define XFS_BUF_DADDR_NULL	((xfs_daddr_t) (-1LL))
-
-#define xfs_buf_ctob(pp)	((pp) * PAGE_CACHE_SIZE)
-#define xfs_buf_btoc(dd)	(((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT)
-#define xfs_buf_btoct(dd)	((dd) >> PAGE_CACHE_SHIFT)
-#define xfs_buf_poff(aa)	((aa) & ~PAGE_CACHE_MASK)
-
-typedef enum {
-	XBRW_READ = 1,			/* transfer into target memory */
-	XBRW_WRITE = 2,			/* transfer from target memory */
-	XBRW_ZERO = 3,			/* Zero target memory */
-} xfs_buf_rw_t;
-
-typedef enum {
-	XBF_READ = (1 << 0),	/* buffer intended for reading from device */
-	XBF_WRITE = (1 << 1),	/* buffer intended for writing to device   */
-	XBF_MAPPED = (1 << 2),  /* buffer mapped (b_addr valid)            */
-	XBF_ASYNC = (1 << 4),   /* initiator will not wait for completion  */
-	XBF_DONE = (1 << 5),    /* all pages in the buffer uptodate	   */
-	XBF_DELWRI = (1 << 6),  /* buffer has dirty pages                  */
-	XBF_STALE = (1 << 7),	/* buffer has been staled, do not find it  */
-	XBF_FS_MANAGED = (1 << 8),  /* filesystem controls freeing memory  */
- 	XBF_ORDERED = (1 << 11),    /* use ordered writes		   */
-	XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead		   */
-
-	/* flags used only as arguments to access routines */
-	XBF_LOCK = (1 << 14),       /* lock requested			   */
-	XBF_TRYLOCK = (1 << 15),    /* lock requested, but do not wait	   */
-	XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread	   */
-
-	/* flags used only internally */
-	_XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache		   */
-	_XBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc()		   */
-	_XBF_RUN_QUEUES = (1 << 19),/* run block device task queue	   */
-	_XBF_DELWRI_Q = (1 << 21),   /* buffer on delwri queue		   */
-} xfs_buf_flags_t;
-
-typedef enum {
-	XBT_FORCE_SLEEP = (0 << 1),
-	XBT_FORCE_FLUSH = (1 << 1),
-} xfs_buftarg_flags_t;
-
-typedef struct xfs_bufhash {
-	struct list_head	bh_list;
-	spinlock_t		bh_lock;
-} xfs_bufhash_t;
-
-typedef struct xfs_buftarg {
-	dev_t			bt_dev;
-	struct block_device	*bt_bdev;
-	struct address_space	*bt_mapping;
-	unsigned int		bt_bsize;
-	unsigned int		bt_sshift;
-	size_t			bt_smask;
-
-	/* per device buffer hash table */
-	uint			bt_hashmask;
-	uint			bt_hashshift;
-	xfs_bufhash_t		*bt_hash;
-
-	/* per device delwri queue */
-	struct task_struct	*bt_task;
-	struct list_head	bt_list;
-	struct list_head	bt_delwrite_queue;
-	spinlock_t		bt_delwrite_lock;
-	unsigned long		bt_flags;
-} xfs_buftarg_t;
-
-/*
- *	xfs_buf_t:  Buffer structure for pagecache-based buffers
- *
- * This buffer structure is used by the pagecache buffer management routines
- * to refer to an assembly of pages forming a logical buffer.
- *
- * The buffer structure is used on a temporary basis only, and discarded when
- * released.  The real data storage is recorded in the pagecache. Buffers are
- * hashed to the block device on which the file system resides.
- */
-
-struct xfs_buf;
-typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
-typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
-
-#define XB_PAGES	2
-
-typedef struct xfs_buf {
-	struct semaphore	b_sema;		/* semaphore for lockables */
-	unsigned long		b_queuetime;	/* time buffer was queued */
-	atomic_t		b_pin_count;	/* pin count */
-	wait_queue_head_t	b_waiters;	/* unpin waiters */
-	struct list_head	b_list;
-	xfs_buf_flags_t		b_flags;	/* status flags */
-	struct list_head	b_hash_list;	/* hash table list */
-	xfs_bufhash_t		*b_hash;	/* hash table list start */
-	xfs_buftarg_t		*b_target;	/* buffer target (device) */
-	atomic_t		b_hold;		/* reference count */
-	xfs_daddr_t		b_bn;		/* block number for I/O */
-	xfs_off_t		b_file_offset;	/* offset in file */
-	size_t			b_buffer_length;/* size of buffer in bytes */
-	size_t			b_count_desired;/* desired transfer size */
-	void			*b_addr;	/* virtual address of buffer */
-	struct work_struct	b_iodone_work;
-	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
-	xfs_buf_iodone_t	b_iodone;	/* I/O completion function */
-	xfs_buf_relse_t		b_relse;	/* releasing function */
-	xfs_buf_bdstrat_t	b_strat;	/* pre-write function */
-	struct semaphore	b_iodonesema;	/* Semaphore for I/O waiters */
-	void			*b_fspriv;
-	void			*b_fspriv2;
-	void			*b_fspriv3;
-	unsigned short		b_error;	/* error code on I/O */
-	unsigned short		b_locked;	/* page array is locked */
-	unsigned int		b_page_count;	/* size of page array */
-	unsigned int		b_offset;	/* page offset in first page */
-	struct page		**b_pages;	/* array of page pointers */
-	struct page		*b_page_array[XB_PAGES]; /* inline pages */
-#ifdef XFS_BUF_LOCK_TRACKING
-	int			b_last_holder;
-#endif
-} xfs_buf_t;
-
-
-/* Finding and Reading Buffers */
-extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t,
-				xfs_buf_flags_t, xfs_buf_t *);
-#define xfs_incore(buftarg,blkno,len,lockit) \
-	_xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
-
-extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t,
-				xfs_buf_flags_t);
-#define xfs_buf_get(target, blkno, len, flags) \
-	xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
-
-extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t,
-				xfs_buf_flags_t);
-#define xfs_buf_read(target, blkno, len, flags) \
-	xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
-
-extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
-extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *);
-extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
-extern void xfs_buf_hold(xfs_buf_t *);
-extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t,
-				xfs_buf_flags_t);
-
-/* Releasing Buffers */
-extern void xfs_buf_free(xfs_buf_t *);
-extern void xfs_buf_rele(xfs_buf_t *);
-
-/* Locking and Unlocking Buffers */
-extern int xfs_buf_cond_lock(xfs_buf_t *);
-extern int xfs_buf_lock_value(xfs_buf_t *);
-extern void xfs_buf_lock(xfs_buf_t *);
-extern void xfs_buf_unlock(xfs_buf_t *);
-
-/* Buffer Read and Write Routines */
-extern void xfs_buf_ioend(xfs_buf_t *,	int);
-extern void xfs_buf_ioerror(xfs_buf_t *, int);
-extern int xfs_buf_iostart(xfs_buf_t *, xfs_buf_flags_t);
-extern int xfs_buf_iorequest(xfs_buf_t *);
-extern int xfs_buf_iowait(xfs_buf_t *);
-extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t,
-				xfs_buf_rw_t);
-
-static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
-{
-	return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp);
-}
-
-static inline int xfs_buf_geterror(xfs_buf_t *bp)
-{
-	return bp ? bp->b_error : ENOMEM;
-}
-
-/* Buffer Utility Routines */
-extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
-
-/* Pinning Buffer Storage in Memory */
-extern void xfs_buf_pin(xfs_buf_t *);
-extern void xfs_buf_unpin(xfs_buf_t *);
-extern int xfs_buf_ispin(xfs_buf_t *);
-
-/* Delayed Write Buffer Routines */
-extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
-
-/* Buffer Daemon Setup Routines */
-extern int xfs_buf_init(void);
-extern void xfs_buf_terminate(void);
-
-#ifdef XFS_BUF_TRACE
-extern ktrace_t *xfs_buf_trace_buf;
-extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
-#else
-#define xfs_buf_trace(bp,id,ptr,ra)	do { } while (0)
-#endif
-
-#define xfs_buf_target_name(target)	\
-	({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
-
-
-#define XFS_B_ASYNC		XBF_ASYNC
-#define XFS_B_DELWRI		XBF_DELWRI
-#define XFS_B_READ		XBF_READ
-#define XFS_B_WRITE		XBF_WRITE
-#define XFS_B_STALE		XBF_STALE
-
-#define XFS_BUF_TRYLOCK		XBF_TRYLOCK
-#define XFS_INCORE_TRYLOCK	XBF_TRYLOCK
-#define XFS_BUF_LOCK		XBF_LOCK
-#define XFS_BUF_MAPPED		XBF_MAPPED
-
-#define BUF_BUSY		XBF_DONT_BLOCK
-
-#define XFS_BUF_BFLAGS(bp)	((bp)->b_flags)
-#define XFS_BUF_ZEROFLAGS(bp)	\
-	((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI))
-
-#define XFS_BUF_STALE(bp)	((bp)->b_flags |= XFS_B_STALE)
-#define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XFS_B_STALE)
-#define XFS_BUF_ISSTALE(bp)	((bp)->b_flags & XFS_B_STALE)
-#define XFS_BUF_SUPER_STALE(bp)	do {				\
-					XFS_BUF_STALE(bp);	\
-					xfs_buf_delwri_dequeue(bp);	\
-					XFS_BUF_DONE(bp);	\
-				} while (0)
-
-#define XFS_BUF_MANAGE		XBF_FS_MANAGED
-#define XFS_BUF_UNMANAGE(bp)	((bp)->b_flags &= ~XBF_FS_MANAGED)
-
-#define XFS_BUF_DELAYWRITE(bp)		((bp)->b_flags |= XBF_DELWRI)
-#define XFS_BUF_UNDELAYWRITE(bp)	xfs_buf_delwri_dequeue(bp)
-#define XFS_BUF_ISDELAYWRITE(bp)	((bp)->b_flags & XBF_DELWRI)
-
-#define XFS_BUF_ERROR(bp,no)	xfs_buf_ioerror(bp,no)
-#define XFS_BUF_GETERROR(bp)	xfs_buf_geterror(bp)
-#define XFS_BUF_ISERROR(bp)	(xfs_buf_geterror(bp) ? 1 : 0)
-
-#define XFS_BUF_DONE(bp)	((bp)->b_flags |= XBF_DONE)
-#define XFS_BUF_UNDONE(bp)	((bp)->b_flags &= ~XBF_DONE)
-#define XFS_BUF_ISDONE(bp)	((bp)->b_flags & XBF_DONE)
-
-#define XFS_BUF_BUSY(bp)	do { } while (0)
-#define XFS_BUF_UNBUSY(bp)	do { } while (0)
-#define XFS_BUF_ISBUSY(bp)	(1)
-
-#define XFS_BUF_ASYNC(bp)	((bp)->b_flags |= XBF_ASYNC)
-#define XFS_BUF_UNASYNC(bp)	((bp)->b_flags &= ~XBF_ASYNC)
-#define XFS_BUF_ISASYNC(bp)	((bp)->b_flags & XBF_ASYNC)
-
-#define XFS_BUF_ORDERED(bp)	((bp)->b_flags |= XBF_ORDERED)
-#define XFS_BUF_UNORDERED(bp)	((bp)->b_flags &= ~XBF_ORDERED)
-#define XFS_BUF_ISORDERED(bp)	((bp)->b_flags & XBF_ORDERED)
-
-#define XFS_BUF_SHUT(bp)	do { } while (0)
-#define XFS_BUF_UNSHUT(bp)	do { } while (0)
-#define XFS_BUF_ISSHUT(bp)	(0)
-
-#define XFS_BUF_HOLD(bp)	xfs_buf_hold(bp)
-#define XFS_BUF_READ(bp)	((bp)->b_flags |= XBF_READ)
-#define XFS_BUF_UNREAD(bp)	((bp)->b_flags &= ~XBF_READ)
-#define XFS_BUF_ISREAD(bp)	((bp)->b_flags & XBF_READ)
-
-#define XFS_BUF_WRITE(bp)	((bp)->b_flags |= XBF_WRITE)
-#define XFS_BUF_UNWRITE(bp)	((bp)->b_flags &= ~XBF_WRITE)
-#define XFS_BUF_ISWRITE(bp)	((bp)->b_flags & XBF_WRITE)
-
-#define XFS_BUF_ISUNINITIAL(bp)	(0)
-#define XFS_BUF_UNUNINITIAL(bp)	(0)
-
-#define XFS_BUF_BP_ISMAPPED(bp)	(1)
-
-#define XFS_BUF_IODONE_FUNC(bp)			((bp)->b_iodone)
-#define XFS_BUF_SET_IODONE_FUNC(bp, func)	((bp)->b_iodone = (func))
-#define XFS_BUF_CLR_IODONE_FUNC(bp)		((bp)->b_iodone = NULL)
-#define XFS_BUF_SET_BDSTRAT_FUNC(bp, func)	((bp)->b_strat = (func))
-#define XFS_BUF_CLR_BDSTRAT_FUNC(bp)		((bp)->b_strat = NULL)
-
-#define XFS_BUF_FSPRIVATE(bp, type)		((type)(bp)->b_fspriv)
-#define XFS_BUF_SET_FSPRIVATE(bp, val)		((bp)->b_fspriv = (void*)(val))
-#define XFS_BUF_FSPRIVATE2(bp, type)		((type)(bp)->b_fspriv2)
-#define XFS_BUF_SET_FSPRIVATE2(bp, val)		((bp)->b_fspriv2 = (void*)(val))
-#define XFS_BUF_FSPRIVATE3(bp, type)		((type)(bp)->b_fspriv3)
-#define XFS_BUF_SET_FSPRIVATE3(bp, val)		((bp)->b_fspriv3 = (void*)(val))
-#define XFS_BUF_SET_START(bp)			do { } while (0)
-#define XFS_BUF_SET_BRELSE_FUNC(bp, func)	((bp)->b_relse = (func))
-
-#define XFS_BUF_PTR(bp)			(xfs_caddr_t)((bp)->b_addr)
-#define XFS_BUF_SET_PTR(bp, val, cnt)	xfs_buf_associate_memory(bp, val, cnt)
-#define XFS_BUF_ADDR(bp)		((bp)->b_bn)
-#define XFS_BUF_SET_ADDR(bp, bno)	((bp)->b_bn = (xfs_daddr_t)(bno))
-#define XFS_BUF_OFFSET(bp)		((bp)->b_file_offset)
-#define XFS_BUF_SET_OFFSET(bp, off)	((bp)->b_file_offset = (off))
-#define XFS_BUF_COUNT(bp)		((bp)->b_count_desired)
-#define XFS_BUF_SET_COUNT(bp, cnt)	((bp)->b_count_desired = (cnt))
-#define XFS_BUF_SIZE(bp)		((bp)->b_buffer_length)
-#define XFS_BUF_SET_SIZE(bp, cnt)	((bp)->b_buffer_length = (cnt))
-
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)	do { } while (0)
-#define XFS_BUF_SET_VTYPE(bp, type)		do { } while (0)
-#define XFS_BUF_SET_REF(bp, ref)		do { } while (0)
-
-#define XFS_BUF_ISPINNED(bp)	xfs_buf_ispin(bp)
-
-#define XFS_BUF_VALUSEMA(bp)	xfs_buf_lock_value(bp)
-#define XFS_BUF_CPSEMA(bp)	(xfs_buf_cond_lock(bp) == 0)
-#define XFS_BUF_VSEMA(bp)	xfs_buf_unlock(bp)
-#define XFS_BUF_PSEMA(bp,x)	xfs_buf_lock(bp)
-#define XFS_BUF_V_IODONESEMA(bp) up(&bp->b_iodonesema);
-
-#define XFS_BUF_SET_TARGET(bp, target)	((bp)->b_target = (target))
-#define XFS_BUF_TARGET(bp)		((bp)->b_target)
-#define XFS_BUFTARG_NAME(target)	xfs_buf_target_name(target)
-
-static inline int xfs_bawrite(void *mp, xfs_buf_t *bp)
-{
-	bp->b_fspriv3 = mp;
-	bp->b_strat = xfs_bdstrat_cb;
-	xfs_buf_delwri_dequeue(bp);
-	return xfs_buf_iostart(bp, XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
-}
-
-static inline void xfs_buf_relse(xfs_buf_t *bp)
-{
-	if (!bp->b_relse)
-		xfs_buf_unlock(bp);
-	xfs_buf_rele(bp);
-}
-
-#define xfs_bpin(bp)		xfs_buf_pin(bp)
-#define xfs_bunpin(bp)		xfs_buf_unpin(bp)
-
-#define xfs_buftrace(id, bp)	\
-	    xfs_buf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
-
-#define xfs_biodone(bp)		xfs_buf_ioend(bp, 0)
-
-#define xfs_biomove(bp, off, len, data, rw) \
-	    xfs_buf_iomove((bp), (off), (len), (data), \
-		((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ)
-
-#define xfs_biozero(bp, off, len) \
-	    xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
-
-
-static inline int XFS_bwrite(xfs_buf_t *bp)
-{
-	int	iowait = (bp->b_flags & XBF_ASYNC) == 0;
-	int	error = 0;
-
-	if (!iowait)
-		bp->b_flags |= _XBF_RUN_QUEUES;
-
-	xfs_buf_delwri_dequeue(bp);
-	xfs_buf_iostrategy(bp);
-	if (iowait) {
-		error = xfs_buf_iowait(bp);
-		xfs_buf_relse(bp);
-	}
-	return error;
-}
-
-#define XFS_bdwrite(bp)		xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC)
-
-static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp)
-{
-	bp->b_strat = xfs_bdstrat_cb;
-	bp->b_fspriv3 = mp;
-	return xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
-}
-
-#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
-
-#define xfs_iowait(bp)	xfs_buf_iowait(bp)
-
-#define xfs_baread(target, rablkno, ralen)  \
-	xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
-
-
-/*
- *	Handling of buftargs.
- */
-extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
-extern void xfs_free_buftarg(xfs_buftarg_t *, int);
-extern void xfs_wait_buftarg(xfs_buftarg_t *);
-extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
-extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
-
-#define xfs_getsize_buftarg(buftarg)	block_size((buftarg)->bt_bdev)
-#define xfs_readonly_buftarg(buftarg)	bdev_read_only((buftarg)->bt_bdev)
-
-#define xfs_binval(buftarg)		xfs_flush_buftarg(buftarg, 1)
-#define XFS_bflush(buftarg)		xfs_flush_buftarg(buftarg, 1)
-
-#endif	/* __XFS_BUF_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
deleted file mode 100644
index e7f3da61c6c..00000000000
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_CRED_H__
-#define __XFS_CRED_H__
-
-#include <linux/capability.h>
-
-/*
- * Credentials
- */
-typedef struct cred {
-	/* EMPTY */
-} cred_t;
-
-extern struct cred *sys_cred;
-
-/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
-static __inline int capable_cred(cred_t *cr, int cid)
-{
-	return (cr == sys_cred) ? 1 : capable(cid);
-}
-
-#endif  /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
deleted file mode 100644
index 80eb249f2fa..00000000000
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright (c) 2004-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_types.h"
-#include "xfs_dmapi.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_mount.h"
-#include "xfs_export.h"
-
-/*
- * XFS encodes and decodes the fileid portion of NFS filehandles
- * itself instead of letting the generic NFS code do it.  This
- * allows filesystems with 64 bit inode numbers to be exported.
- *
- * Note that a side effect is that xfs_vget() won't be passed a
- * zero inode/generation pair under normal circumstances.  As
- * however a malicious client could send us such data, the check
- * remains in that code.
- */
-
-STATIC struct dentry *
-linvfs_decode_fh(
-	struct super_block	*sb,
-	__u32			*fh,
-	int			fh_len,
-	int			fileid_type,
-	int (*acceptable)(
-		void		*context,
-		struct dentry	*de),
-	void			*context)
-{
-	xfs_fid2_t		ifid;
-	xfs_fid2_t		pfid;
-	void			*parent = NULL;
-	int			is64 = 0;
-	__u32			*p = fh;
-
-#if XFS_BIG_INUMS
-	is64 = (fileid_type & XFS_FILEID_TYPE_64FLAG);
-	fileid_type &= ~XFS_FILEID_TYPE_64FLAG;
-#endif
-
-	/*
-	 * Note that we only accept fileids which are long enough
-	 * rather than allow the parent generation number to default
-	 * to zero.  XFS considers zero a valid generation number not
-	 * an invalid/wildcard value.  There's little point printk'ing
-	 * a warning here as we don't have the client information
-	 * which would make such a warning useful.
-	 */
-	if (fileid_type > 2 ||
-	    fh_len < xfs_fileid_length((fileid_type == 2), is64))
-		return NULL;
-
-	p = xfs_fileid_decode_fid2(p, &ifid, is64);
-
-	if (fileid_type == 2) {
-		p = xfs_fileid_decode_fid2(p, &pfid, is64);
-		parent = &pfid;
-	}
-
-	fh = (__u32 *)&ifid;
-	return find_exported_dentry(sb, fh, parent, acceptable, context);
-}
-
-
-STATIC int
-linvfs_encode_fh(
-	struct dentry		*dentry,
-	__u32			*fh,
-	int			*max_len,
-	int			connectable)
-{
-	struct inode		*inode = dentry->d_inode;
-	int			type = 1;
-	__u32			*p = fh;
-	int			len;
-	int			is64 = 0;
-#if XFS_BIG_INUMS
-	vfs_t			*vfs = LINVFS_GET_VFS(inode->i_sb);
-
-	if (!(vfs->vfs_flag & VFS_32BITINODES)) {
-		/* filesystem may contain 64bit inode numbers */
-		is64 = XFS_FILEID_TYPE_64FLAG;
-	}
-#endif
-
-	/* Directories don't need their parent encoded, they have ".." */
-	if (S_ISDIR(inode->i_mode))
-	    connectable = 0;
-
-	/*
-	 * Only encode if there is enough space given.  In practice
-	 * this means we can't export a filesystem with 64bit inodes
-	 * over NFSv2 with the subtree_check export option; the other
-	 * seven combinations work.  The real answer is "don't use v2".
-	 */
-	len = xfs_fileid_length(connectable, is64);
-	if (*max_len < len)
-		return 255;
-	*max_len = len;
-
-	p = xfs_fileid_encode_inode(p, inode, is64);
-	if (connectable) {
-		spin_lock(&dentry->d_lock);
-		p = xfs_fileid_encode_inode(p, dentry->d_parent->d_inode, is64);
-		spin_unlock(&dentry->d_lock);
-		type = 2;
-	}
-	BUG_ON((p - fh) != len);
-	return type | is64;
-}
-
-STATIC struct dentry *
-linvfs_get_dentry(
-	struct super_block	*sb,
-	void			*data)
-{
-	vnode_t			*vp;
-	struct inode		*inode;
-	struct dentry		*result;
-	vfs_t			*vfsp = LINVFS_GET_VFS(sb);
-	int			error;
-
-	VFS_VGET(vfsp, &vp, (fid_t *)data, error);
-	if (error || vp == NULL)
-		return ERR_PTR(-ESTALE) ;
-
-	inode = LINVFS_GET_IP(vp);
-	result = d_alloc_anon(inode);
-        if (!result) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	return result;
-}
-
-STATIC struct dentry *
-linvfs_get_parent(
-	struct dentry		*child)
-{
-	int			error;
-	vnode_t			*vp, *cvp;
-	struct dentry		*parent;
-	struct dentry		dotdot;
-
-	dotdot.d_name.name = "..";
-	dotdot.d_name.len = 2;
-	dotdot.d_inode = NULL;
-
-	cvp = NULL;
-	vp = LINVFS_GET_VP(child->d_inode);
-	VOP_LOOKUP(vp, &dotdot, &cvp, 0, NULL, NULL, error);
-	if (unlikely(error))
-		return ERR_PTR(-error);
-
-	parent = d_alloc_anon(LINVFS_GET_IP(cvp));
-	if (unlikely(!parent)) {
-		VN_RELE(cvp);
-		return ERR_PTR(-ENOMEM);
-	}
-	return parent;
-}
-
-struct export_operations linvfs_export_ops = {
-	.decode_fh		= linvfs_decode_fh,
-	.encode_fh		= linvfs_encode_fh,
-	.get_parent		= linvfs_get_parent,
-	.get_dentry		= linvfs_get_dentry,
-};
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
deleted file mode 100644
index ced4404339c..00000000000
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ /dev/null
@@ -1,595 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_trans.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-#include "xfs_ioctl32.h"
-
-#include <linux/dcache.h>
-#include <linux/smp_lock.h>
-
-static struct vm_operations_struct linvfs_file_vm_ops;
-#ifdef CONFIG_XFS_DMAPI
-static struct vm_operations_struct linvfs_dmapi_file_vm_ops;
-#endif
-
-STATIC inline ssize_t
-__linvfs_read(
-	struct kiocb		*iocb,
-	char			__user *buf,
-	int			ioflags,
-	size_t			count,
-	loff_t			pos)
-{
-	struct iovec		iov = {buf, count};
-	struct file		*file = iocb->ki_filp;
-	vnode_t			*vp = LINVFS_GET_VP(file->f_dentry->d_inode);
-	ssize_t			rval;
-
-	BUG_ON(iocb->ki_pos != pos);
-
-	if (unlikely(file->f_flags & O_DIRECT))
-		ioflags |= IO_ISDIRECT;
-	VOP_READ(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval);
-	return rval;
-}
-
-
-STATIC ssize_t
-linvfs_aio_read(
-	struct kiocb		*iocb,
-	char			__user *buf,
-	size_t			count,
-	loff_t			pos)
-{
-	return __linvfs_read(iocb, buf, IO_ISAIO, count, pos);
-}
-
-STATIC ssize_t
-linvfs_aio_read_invis(
-	struct kiocb		*iocb,
-	char			__user *buf,
-	size_t			count,
-	loff_t			pos)
-{
-	return __linvfs_read(iocb, buf, IO_ISAIO|IO_INVIS, count, pos);
-}
-
-
-STATIC inline ssize_t
-__linvfs_write(
-	struct kiocb	*iocb,
-	const char	__user *buf,
-	int		ioflags,
-	size_t		count,
-	loff_t		pos)
-{
-	struct iovec	iov = {(void __user *)buf, count};
-	struct file	*file = iocb->ki_filp;
-	struct inode	*inode = file->f_mapping->host;
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-	ssize_t		rval;
-
-	BUG_ON(iocb->ki_pos != pos);
-	if (unlikely(file->f_flags & O_DIRECT))
-		ioflags |= IO_ISDIRECT;
-
-	VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval);
-	return rval;
-}
-
-
-STATIC ssize_t
-linvfs_aio_write(
-	struct kiocb		*iocb,
-	const char		__user *buf,
-	size_t			count,
-	loff_t			pos)
-{
-	return __linvfs_write(iocb, buf, IO_ISAIO, count, pos);
-}
-
-STATIC ssize_t
-linvfs_aio_write_invis(
-	struct kiocb		*iocb,
-	const char		__user *buf,
-	size_t			count,
-	loff_t			pos)
-{
-	return __linvfs_write(iocb, buf, IO_ISAIO|IO_INVIS, count, pos);
-}
-
-
-STATIC inline ssize_t
-__linvfs_readv(
-	struct file		*file,
-	const struct iovec 	*iov,
-	int			ioflags,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	struct inode	*inode = file->f_mapping->host;
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-	struct		kiocb kiocb;
-	ssize_t		rval;
-
-	init_sync_kiocb(&kiocb, file);
-	kiocb.ki_pos = *ppos;
-
-	if (unlikely(file->f_flags & O_DIRECT))
-		ioflags |= IO_ISDIRECT;
-	VOP_READ(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval);
-
-	*ppos = kiocb.ki_pos;
-	return rval;
-}
-
-STATIC ssize_t
-linvfs_readv(
-	struct file		*file,
-	const struct iovec 	*iov,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	return __linvfs_readv(file, iov, 0, nr_segs, ppos);
-}
-
-STATIC ssize_t
-linvfs_readv_invis(
-	struct file		*file,
-	const struct iovec 	*iov,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	return __linvfs_readv(file, iov, IO_INVIS, nr_segs, ppos);
-}
-
-
-STATIC inline ssize_t
-__linvfs_writev(
-	struct file		*file,
-	const struct iovec 	*iov,
-	int			ioflags,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	struct inode	*inode = file->f_mapping->host;
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-	struct		kiocb kiocb;
-	ssize_t		rval;
-
-	init_sync_kiocb(&kiocb, file);
-	kiocb.ki_pos = *ppos;
-	if (unlikely(file->f_flags & O_DIRECT))
-		ioflags |= IO_ISDIRECT;
-
-	VOP_WRITE(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval);
-
-	*ppos = kiocb.ki_pos;
-	return rval;
-}
-
-
-STATIC ssize_t
-linvfs_writev(
-	struct file		*file,
-	const struct iovec 	*iov,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	return __linvfs_writev(file, iov, 0, nr_segs, ppos);
-}
-
-STATIC ssize_t
-linvfs_writev_invis(
-	struct file		*file,
-	const struct iovec 	*iov,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	return __linvfs_writev(file, iov, IO_INVIS, nr_segs, ppos);
-}
-
-STATIC ssize_t
-linvfs_sendfile(
-	struct file		*filp,
-	loff_t			*ppos,
-	size_t			count,
-	read_actor_t		actor,
-	void			*target)
-{
-	vnode_t			*vp = LINVFS_GET_VP(filp->f_dentry->d_inode);
-	ssize_t			rval;
-
-	VOP_SENDFILE(vp, filp, ppos, 0, count, actor, target, NULL, rval);
-	return rval;
-}
-
-
-STATIC int
-linvfs_open(
-	struct inode	*inode,
-	struct file	*filp)
-{
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-	int		error;
-
-	if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
-		return -EFBIG;
-
-	ASSERT(vp);
-	VOP_OPEN(vp, NULL, error);
-	return -error;
-}
-
-
-STATIC int
-linvfs_release(
-	struct inode	*inode,
-	struct file	*filp)
-{
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-	int		error = 0;
-
-	if (vp)
-		VOP_RELEASE(vp, error);
-	return -error;
-}
-
-
-STATIC int
-linvfs_fsync(
-	struct file	*filp,
-	struct dentry	*dentry,
-	int		datasync)
-{
-	struct inode	*inode = dentry->d_inode;
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-	int		error;
-	int		flags = FSYNC_WAIT;
-
-	if (datasync)
-		flags |= FSYNC_DATA;
-
-	ASSERT(vp);
-	VOP_FSYNC(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1, error);
-	return -error;
-}
-
-/*
- * linvfs_readdir maps to VOP_READDIR().
- * We need to build a uio, cred, ...
- */
-
-#define nextdp(dp)      ((struct xfs_dirent *)((char *)(dp) + (dp)->d_reclen))
-
-#ifdef CONFIG_XFS_DMAPI
-
-STATIC struct page *
-linvfs_filemap_nopage(
-	struct vm_area_struct	*area,
-	unsigned long		address,
-	int			*type)
-{
-	struct inode	*inode = area->vm_file->f_dentry->d_inode;
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-	xfs_mount_t	*mp = XFS_VFSTOM(vp->v_vfsp);
-	int		error;
-
-	ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI);
-
-	error = XFS_SEND_MMAP(mp, area, 0);
-	if (error)
-		return NULL;
-
-	return filemap_nopage(area, address, type);
-}
-
-#endif /* CONFIG_XFS_DMAPI */
-
-
-STATIC int
-linvfs_readdir(
-	struct file	*filp,
-	void		*dirent,
-	filldir_t	filldir)
-{
-	int		error = 0;
-	vnode_t		*vp;
-	uio_t		uio;
-	iovec_t		iov;
-	int		eof = 0;
-	caddr_t		read_buf;
-	int		namelen, size = 0;
-	size_t		rlen = PAGE_CACHE_SIZE;
-	xfs_off_t	start_offset, curr_offset;
-	xfs_dirent_t	*dbp = NULL;
-
-	vp = LINVFS_GET_VP(filp->f_dentry->d_inode);
-	ASSERT(vp);
-
-	/* Try fairly hard to get memory */
-	do {
-		if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL)))
-			break;
-		rlen >>= 1;
-	} while (rlen >= 1024);
-
-	if (read_buf == NULL)
-		return -ENOMEM;
-
-	uio.uio_iov = &iov;
-	uio.uio_segflg = UIO_SYSSPACE;
-	curr_offset = filp->f_pos;
-	if (filp->f_pos != 0x7fffffff)
-		uio.uio_offset = filp->f_pos;
-	else
-		uio.uio_offset = 0xffffffff;
-
-	while (!eof) {
-		uio.uio_resid = iov.iov_len = rlen;
-		iov.iov_base = read_buf;
-		uio.uio_iovcnt = 1;
-
-		start_offset = uio.uio_offset;
-
-		VOP_READDIR(vp, &uio, NULL, &eof, error);
-		if ((uio.uio_offset == start_offset) || error) {
-			size = 0;
-			break;
-		}
-
-		size = rlen - uio.uio_resid;
-		dbp = (xfs_dirent_t *)read_buf;
-		while (size > 0) {
-			namelen = strlen(dbp->d_name);
-
-			if (filldir(dirent, dbp->d_name, namelen,
-					(loff_t) curr_offset & 0x7fffffff,
-					(ino_t) dbp->d_ino,
-					DT_UNKNOWN)) {
-				goto done;
-			}
-			size -= dbp->d_reclen;
-			curr_offset = (loff_t)dbp->d_off /* & 0x7fffffff */;
-			dbp = nextdp(dbp);
-		}
-	}
-done:
-	if (!error) {
-		if (size == 0)
-			filp->f_pos = uio.uio_offset & 0x7fffffff;
-		else if (dbp)
-			filp->f_pos = curr_offset;
-	}
-
-	kfree(read_buf);
-	return -error;
-}
-
-
-STATIC int
-linvfs_file_mmap(
-	struct file	*filp,
-	struct vm_area_struct *vma)
-{
-	struct inode	*ip = filp->f_dentry->d_inode;
-	vnode_t		*vp = LINVFS_GET_VP(ip);
-	vattr_t		va = { .va_mask = XFS_AT_UPDATIME };
-	int		error;
-
-	vma->vm_ops = &linvfs_file_vm_ops;
-
-#ifdef CONFIG_XFS_DMAPI
-	if (vp->v_vfsp->vfs_flag & VFS_DMI) {
-		vma->vm_ops = &linvfs_dmapi_file_vm_ops;
-	}
-#endif /* CONFIG_XFS_DMAPI */
-
-	VOP_SETATTR(vp, &va, XFS_AT_UPDATIME, NULL, error);
-	if (!error)
-		vn_revalidate(vp);	/* update Linux inode flags */
-	return 0;
-}
-
-
-STATIC long
-linvfs_ioctl(
-	struct file	*filp,
-	unsigned int	cmd,
-	unsigned long	arg)
-{
-	int		error;
-	struct inode *inode = filp->f_dentry->d_inode;
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-
-	VOP_IOCTL(vp, inode, filp, 0, cmd, (void __user *)arg, error);
-	VMODIFY(vp);
-
-	/* NOTE:  some of the ioctl's return positive #'s as a
-	 *	  byte count indicating success, such as
-	 *	  readlink_by_handle.  So we don't "sign flip"
-	 *	  like most other routines.  This means true
-	 *	  errors need to be returned as a negative value.
-	 */
-	return error;
-}
-
-STATIC long
-linvfs_ioctl_invis(
-	struct file	*filp,
-	unsigned int	cmd,
-	unsigned long	arg)
-{
-	int		error;
-	struct inode *inode = filp->f_dentry->d_inode;
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-
-	ASSERT(vp);
-	VOP_IOCTL(vp, inode, filp, IO_INVIS, cmd, (void __user *)arg, error);
-	VMODIFY(vp);
-
-	/* NOTE:  some of the ioctl's return positive #'s as a
-	 *	  byte count indicating success, such as
-	 *	  readlink_by_handle.  So we don't "sign flip"
-	 *	  like most other routines.  This means true
-	 *	  errors need to be returned as a negative value.
-	 */
-	return error;
-}
-
-#ifdef CONFIG_XFS_DMAPI
-#ifdef HAVE_VMOP_MPROTECT
-STATIC int
-linvfs_mprotect(
-	struct vm_area_struct *vma,
-	unsigned int	newflags)
-{
-	vnode_t		*vp = LINVFS_GET_VP(vma->vm_file->f_dentry->d_inode);
-	int		error = 0;
-
-	if (vp->v_vfsp->vfs_flag & VFS_DMI) {
-		if ((vma->vm_flags & VM_MAYSHARE) &&
-		    (newflags & VM_WRITE) && !(vma->vm_flags & VM_WRITE)) {
-			xfs_mount_t	*mp = XFS_VFSTOM(vp->v_vfsp);
-
-			error = XFS_SEND_MMAP(mp, vma, VM_WRITE);
-		    }
-	}
-	return error;
-}
-#endif /* HAVE_VMOP_MPROTECT */
-#endif /* CONFIG_XFS_DMAPI */
-
-#ifdef HAVE_FOP_OPEN_EXEC
-/* If the user is attempting to execute a file that is offline then
- * we have to trigger a DMAPI READ event before the file is marked as busy
- * otherwise the invisible I/O will not be able to write to the file to bring
- * it back online.
- */
-STATIC int
-linvfs_open_exec(
-	struct inode	*inode)
-{
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-	xfs_mount_t	*mp = XFS_VFSTOM(vp->v_vfsp);
-	int		error = 0;
-	xfs_inode_t	*ip;
-
-	if (vp->v_vfsp->vfs_flag & VFS_DMI) {
-		ip = xfs_vtoi(vp);
-		if (!ip) {
-			error = -EINVAL;
-			goto open_exec_out;
-		}
-		if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)) {
-			error = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp,
-					       0, 0, 0, NULL);
-		}
-	}
-open_exec_out:
-	return error;
-}
-#endif /* HAVE_FOP_OPEN_EXEC */
-
-struct file_operations linvfs_file_operations = {
-	.llseek		= generic_file_llseek,
-	.read		= do_sync_read,
-	.write		= do_sync_write,
-	.readv		= linvfs_readv,
-	.writev		= linvfs_writev,
-	.aio_read	= linvfs_aio_read,
-	.aio_write	= linvfs_aio_write,
-	.sendfile	= linvfs_sendfile,
-	.unlocked_ioctl	= linvfs_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= linvfs_compat_ioctl,
-#endif
-	.mmap		= linvfs_file_mmap,
-	.open		= linvfs_open,
-	.release	= linvfs_release,
-	.fsync		= linvfs_fsync,
-#ifdef HAVE_FOP_OPEN_EXEC
-	.open_exec	= linvfs_open_exec,
-#endif
-};
-
-struct file_operations linvfs_invis_file_operations = {
-	.llseek		= generic_file_llseek,
-	.read		= do_sync_read,
-	.write		= do_sync_write,
-	.readv		= linvfs_readv_invis,
-	.writev		= linvfs_writev_invis,
-	.aio_read	= linvfs_aio_read_invis,
-	.aio_write	= linvfs_aio_write_invis,
-	.sendfile	= linvfs_sendfile,
-	.unlocked_ioctl	= linvfs_ioctl_invis,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= linvfs_compat_invis_ioctl,
-#endif
-	.mmap		= linvfs_file_mmap,
-	.open		= linvfs_open,
-	.release	= linvfs_release,
-	.fsync		= linvfs_fsync,
-};
-
-
-struct file_operations linvfs_dir_operations = {
-	.read		= generic_read_dir,
-	.readdir	= linvfs_readdir,
-	.unlocked_ioctl	= linvfs_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= linvfs_compat_ioctl,
-#endif
-	.fsync		= linvfs_fsync,
-};
-
-static struct vm_operations_struct linvfs_file_vm_ops = {
-	.nopage		= filemap_nopage,
-	.populate	= filemap_populate,
-};
-
-#ifdef CONFIG_XFS_DMAPI
-static struct vm_operations_struct linvfs_dmapi_file_vm_ops = {
-	.nopage		= linvfs_filemap_nopage,
-	.populate	= filemap_populate,
-#ifdef HAVE_VMOP_MPROTECT
-	.mprotect	= linvfs_mprotect,
-#endif
-};
-#endif /* CONFIG_XFS_DMAPI */
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
deleted file mode 100644
index 4fa4b1a5187..00000000000
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include "xfs.h"
-
-/*
- * Stub for no-op vnode operations that return error status.
- */
-int
-fs_noerr(void)
-{
-	return 0;
-}
-
-/*
- * Operation unsupported under this file system.
- */
-int
-fs_nosys(void)
-{
-	return ENOSYS;
-}
-
-/*
- * Stub for inactive, strategy, and read/write lock/unlock.  Does nothing.
- */
-/* ARGSUSED */
-void
-fs_noval(void)
-{
-}
-
-/*
- * vnode pcache layer for vnode_tosspages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
-void
-fs_tosspages(
-	bhv_desc_t	*bdp,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	int		fiopt)
-{
-	vnode_t		*vp = BHV_TO_VNODE(bdp);
-	struct inode	*ip = LINVFS_GET_IP(vp);
-
-	if (VN_CACHED(vp))
-		truncate_inode_pages(ip->i_mapping, first);
-}
-
-
-/*
- * vnode pcache layer for vnode_flushinval_pages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
-void
-fs_flushinval_pages(
-	bhv_desc_t	*bdp,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	int		fiopt)
-{
-	vnode_t		*vp = BHV_TO_VNODE(bdp);
-	struct inode	*ip = LINVFS_GET_IP(vp);
-
-	if (VN_CACHED(vp)) {
-		filemap_write_and_wait(ip->i_mapping);
-
-		truncate_inode_pages(ip->i_mapping, first);
-	}
-}
-
-/*
- * vnode pcache layer for vnode_flush_pages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
-int
-fs_flush_pages(
-	bhv_desc_t	*bdp,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	uint64_t	flags,
-	int		fiopt)
-{
-	vnode_t		*vp = BHV_TO_VNODE(bdp);
-	struct inode	*ip = LINVFS_GET_IP(vp);
-
-	if (VN_CACHED(vp)) {
-		filemap_fdatawrite(ip->i_mapping);
-		if (flags & XFS_B_ASYNC)
-			return 0;
-		filemap_fdatawait(ip->i_mapping);
-	}
-
-	return 0;
-}
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h
deleted file mode 100644
index aee9ccdd18f..00000000000
--- a/fs/xfs/linux-2.6/xfs_fs_subr.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef	__XFS_FS_SUBR_H__
-#define __XFS_FS_SUBR_H__
-
-struct cred;
-extern int  fs_noerr(void);
-extern int  fs_nosys(void);
-extern void fs_noval(void);
-extern void fs_tosspages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
-extern void fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
-extern int  fs_flush_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int);
-
-#endif	/* __XFS_FS_SUBR_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
deleted file mode 100644
index e1a22bfcf86..00000000000
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_GLOBALS_H__
-#define __XFS_GLOBALS_H__
-
-extern uint64_t	xfs_panic_mask;		/* set to cause more panics */
-extern unsigned long xfs_physmem;
-extern struct cred *sys_cred;
-
-#endif	/* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
deleted file mode 100644
index 4db47790415..00000000000
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ /dev/null
@@ -1,1334 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_itable.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_bmap.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_dfrag.h"
-#include "xfs_fsops.h"
-
-#include <linux/capability.h>
-#include <linux/dcache.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/pagemap.h>
-
-/*
- * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
- * a file or fs handle.
- *
- * XFS_IOC_PATH_TO_FSHANDLE
- *    returns fs handle for a mount point or path within that mount point
- * XFS_IOC_FD_TO_HANDLE
- *    returns full handle for a FD opened in user space
- * XFS_IOC_PATH_TO_HANDLE
- *    returns full handle for a path
- */
-STATIC int
-xfs_find_handle(
-	unsigned int		cmd,
-	void			__user *arg)
-{
-	int			hsize;
-	xfs_handle_t		handle;
-	xfs_fsop_handlereq_t	hreq;
-	struct inode		*inode;
-	struct vnode		*vp;
-
-	if (copy_from_user(&hreq, arg, sizeof(hreq)))
-		return -XFS_ERROR(EFAULT);
-
-	memset((char *)&handle, 0, sizeof(handle));
-
-	switch (cmd) {
-	case XFS_IOC_PATH_TO_FSHANDLE:
-	case XFS_IOC_PATH_TO_HANDLE: {
-		struct nameidata	nd;
-		int			error;
-
-		error = user_path_walk_link((const char __user *)hreq.path, &nd);
-		if (error)
-			return error;
-
-		ASSERT(nd.dentry);
-		ASSERT(nd.dentry->d_inode);
-		inode = igrab(nd.dentry->d_inode);
-		path_release(&nd);
-		break;
-	}
-
-	case XFS_IOC_FD_TO_HANDLE: {
-		struct file	*file;
-
-		file = fget(hreq.fd);
-		if (!file)
-		    return -EBADF;
-
-		ASSERT(file->f_dentry);
-		ASSERT(file->f_dentry->d_inode);
-		inode = igrab(file->f_dentry->d_inode);
-		fput(file);
-		break;
-	}
-
-	default:
-		ASSERT(0);
-		return -XFS_ERROR(EINVAL);
-	}
-
-	if (inode->i_sb->s_magic != XFS_SB_MAGIC) {
-		/* we're not in XFS anymore, Toto */
-		iput(inode);
-		return -XFS_ERROR(EINVAL);
-	}
-
-	switch (inode->i_mode & S_IFMT) {
-	case S_IFREG:
-	case S_IFDIR:
-	case S_IFLNK:
-		break;
-	default:
-		iput(inode);
-		return -XFS_ERROR(EBADF);
-	}
-
-	/* we need the vnode */
-	vp = LINVFS_GET_VP(inode);
-
-	/* now we can grab the fsid */
-	memcpy(&handle.ha_fsid, vp->v_vfsp->vfs_altfsid, sizeof(xfs_fsid_t));
-	hsize = sizeof(xfs_fsid_t);
-
-	if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
-		xfs_inode_t	*ip;
-		int		lock_mode;
-
-		/* need to get access to the xfs_inode to read the generation */
-		ip = xfs_vtoi(vp);
-		ASSERT(ip);
-		lock_mode = xfs_ilock_map_shared(ip);
-
-		/* fill in fid section of handle from inode */
-		handle.ha_fid.xfs_fid_len = sizeof(xfs_fid_t) -
-					    sizeof(handle.ha_fid.xfs_fid_len);
-		handle.ha_fid.xfs_fid_pad = 0;
-		handle.ha_fid.xfs_fid_gen = ip->i_d.di_gen;
-		handle.ha_fid.xfs_fid_ino = ip->i_ino;
-
-		xfs_iunlock_map_shared(ip, lock_mode);
-
-		hsize = XFS_HSIZE(handle);
-	}
-
-	/* now copy our handle into the user buffer & write out the size */
-	if (copy_to_user(hreq.ohandle, &handle, hsize) ||
-	    copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) {
-		iput(inode);
-		return -XFS_ERROR(EFAULT);
-	}
-
-	iput(inode);
-	return 0;
-}
-
-
-/*
- * Convert userspace handle data into vnode (and inode).
- * We [ab]use the fact that all the fsop_handlereq ioctl calls
- * have a data structure argument whose first component is always
- * a xfs_fsop_handlereq_t, so we can cast to and from this type.
- * This allows us to optimise the copy_from_user calls and gives
- * a handy, shared routine.
- *
- * If no error, caller must always VN_RELE the returned vp.
- */
-STATIC int
-xfs_vget_fsop_handlereq(
-	xfs_mount_t		*mp,
-	struct inode		*parinode,	/* parent inode pointer    */
-	xfs_fsop_handlereq_t	*hreq,
-	vnode_t			**vp,
-	struct inode		**inode)
-{
-	void			__user *hanp;
-	size_t			hlen;
-	xfs_fid_t		*xfid;
-	xfs_handle_t		*handlep;
-	xfs_handle_t		handle;
-	xfs_inode_t		*ip;
-	struct inode		*inodep;
-	vnode_t			*vpp;
-	xfs_ino_t		ino;
-	__u32			igen;
-	int			error;
-
-	/*
-	 * Only allow handle opens under a directory.
-	 */
-	if (!S_ISDIR(parinode->i_mode))
-		return XFS_ERROR(ENOTDIR);
-
-	hanp = hreq->ihandle;
-	hlen = hreq->ihandlen;
-	handlep = &handle;
-
-	if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
-		return XFS_ERROR(EINVAL);
-	if (copy_from_user(handlep, hanp, hlen))
-		return XFS_ERROR(EFAULT);
-	if (hlen < sizeof(*handlep))
-		memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
-	if (hlen > sizeof(handlep->ha_fsid)) {
-		if (handlep->ha_fid.xfs_fid_len !=
-				(hlen - sizeof(handlep->ha_fsid)
-					- sizeof(handlep->ha_fid.xfs_fid_len))
-		    || handlep->ha_fid.xfs_fid_pad)
-			return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * Crack the handle, obtain the inode # & generation #
-	 */
-	xfid = (struct xfs_fid *)&handlep->ha_fid;
-	if (xfid->xfs_fid_len == sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) {
-		ino  = xfid->xfs_fid_ino;
-		igen = xfid->xfs_fid_gen;
-	} else {
-		return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * Get the XFS inode, building a vnode to go with it.
-	 */
-	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
-	if (error)
-		return error;
-	if (ip == NULL)
-		return XFS_ERROR(EIO);
-	if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) {
-		xfs_iput_new(ip, XFS_ILOCK_SHARED);
-		return XFS_ERROR(ENOENT);
-	}
-
-	vpp = XFS_ITOV(ip);
-	inodep = LINVFS_GET_IP(vpp);
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-	*vp = vpp;
-	*inode = inodep;
-	return 0;
-}
-
-STATIC int
-xfs_open_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
-	struct file		*parfilp,
-	struct inode		*parinode)
-{
-	int			error;
-	int			new_fd;
-	int			permflag;
-	struct file		*filp;
-	struct inode		*inode;
-	struct dentry		*dentry;
-	vnode_t			*vp;
-	xfs_fsop_handlereq_t	hreq;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
-	if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
-
-	error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode);
-	if (error)
-		return -error;
-
-	/* Restrict xfs_open_by_handle to directories & regular files. */
-	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
-		iput(inode);
-		return -XFS_ERROR(EINVAL);
-	}
-
-#if BITS_PER_LONG != 32
-	hreq.oflags |= O_LARGEFILE;
-#endif
-	/* Put open permission in namei format. */
-	permflag = hreq.oflags;
-	if ((permflag+1) & O_ACCMODE)
-		permflag++;
-	if (permflag & O_TRUNC)
-		permflag |= 2;
-
-	if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
-	    (permflag & FMODE_WRITE) && IS_APPEND(inode)) {
-		iput(inode);
-		return -XFS_ERROR(EPERM);
-	}
-
-	if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
-		iput(inode);
-		return -XFS_ERROR(EACCES);
-	}
-
-	/* Can't write directories. */
-	if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
-		iput(inode);
-		return -XFS_ERROR(EISDIR);
-	}
-
-	if ((new_fd = get_unused_fd()) < 0) {
-		iput(inode);
-		return new_fd;
-	}
-
-	dentry = d_alloc_anon(inode);
-	if (dentry == NULL) {
-		iput(inode);
-		put_unused_fd(new_fd);
-		return -XFS_ERROR(ENOMEM);
-	}
-
-	/* Ensure umount returns EBUSY on umounts while this file is open. */
-	mntget(parfilp->f_vfsmnt);
-
-	/* Create file pointer. */
-	filp = dentry_open(dentry, parfilp->f_vfsmnt, hreq.oflags);
-	if (IS_ERR(filp)) {
-		put_unused_fd(new_fd);
-		return -XFS_ERROR(-PTR_ERR(filp));
-	}
-	if (inode->i_mode & S_IFREG)
-		filp->f_op = &linvfs_invis_file_operations;
-
-	fd_install(new_fd, filp);
-	return new_fd;
-}
-
-STATIC int
-xfs_readlink_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
-	struct file		*parfilp,
-	struct inode		*parinode)
-{
-	int			error;
-	struct iovec		aiov;
-	struct uio		auio;
-	struct inode		*inode;
-	xfs_fsop_handlereq_t	hreq;
-	vnode_t			*vp;
-	__u32			olen;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
-	if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
-
-	error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode);
-	if (error)
-		return -error;
-
-	/* Restrict this handle operation to symlinks only. */
-	if (!S_ISLNK(inode->i_mode)) {
-		VN_RELE(vp);
-		return -XFS_ERROR(EINVAL);
-	}
-
-	if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) {
-		VN_RELE(vp);
-		return -XFS_ERROR(EFAULT);
-	}
-	aiov.iov_len	= olen;
-	aiov.iov_base	= hreq.ohandle;
-
-	auio.uio_iov	= &aiov;
-	auio.uio_iovcnt	= 1;
-	auio.uio_offset	= 0;
-	auio.uio_segflg	= UIO_USERSPACE;
-	auio.uio_resid	= olen;
-
-	VOP_READLINK(vp, &auio, IO_INVIS, NULL, error);
-
-	VN_RELE(vp);
-	return (olen - auio.uio_resid);
-}
-
-STATIC int
-xfs_fssetdm_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
-	struct file		*parfilp,
-	struct inode		*parinode)
-{
-	int			error;
-	struct fsdmidata	fsd;
-	xfs_fsop_setdm_handlereq_t dmhreq;
-	struct inode		*inode;
-	bhv_desc_t		*bdp;
-	vnode_t			*vp;
-
-	if (!capable(CAP_MKNOD))
-		return -XFS_ERROR(EPERM);
-	if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
-
-	error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &vp, &inode);
-	if (error)
-		return -error;
-
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
-		VN_RELE(vp);
-		return -XFS_ERROR(EPERM);
-	}
-
-	if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
-		VN_RELE(vp);
-		return -XFS_ERROR(EFAULT);
-	}
-
-	bdp = bhv_base_unlocked(VN_BHV_HEAD(vp));
-	error = xfs_set_dmattrs(bdp, fsd.fsd_dmevmask, fsd.fsd_dmstate, NULL);
-
-	VN_RELE(vp);
-	if (error)
-		return -error;
-	return 0;
-}
-
-STATIC int
-xfs_attrlist_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
-	struct file		*parfilp,
-	struct inode		*parinode)
-{
-	int			error;
-	attrlist_cursor_kern_t	*cursor;
-	xfs_fsop_attrlist_handlereq_t al_hreq;
-	struct inode		*inode;
-	vnode_t			*vp;
-	char			*kbuf;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
-	if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
-	if (al_hreq.buflen > XATTR_LIST_MAX)
-		return -XFS_ERROR(EINVAL);
-
-	error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq,
-			&vp, &inode);
-	if (error)
-		goto out;
-
-	kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
-	if (!kbuf)
-		goto out_vn_rele;
-
-	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-	VOP_ATTR_LIST(vp, kbuf, al_hreq.buflen, al_hreq.flags,
-			cursor, NULL, error);
-	if (error)
-		goto out_kfree;
-
-	if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
-		error = -EFAULT;
-
- out_kfree:
-	kfree(kbuf);
- out_vn_rele:
-	VN_RELE(vp);
- out:
-	return -error;
-}
-
-STATIC int
-xfs_attrmulti_attr_get(
-	struct vnode		*vp,
-	char			*name,
-	char			__user *ubuf,
-	__uint32_t		*len,
-	__uint32_t		flags)
-{
-	char			*kbuf;
-	int			error = EFAULT;
-	
-	if (*len > XATTR_SIZE_MAX)
-		return EINVAL;
-	kbuf = kmalloc(*len, GFP_KERNEL);
-	if (!kbuf)
-		return ENOMEM;
-
-	VOP_ATTR_GET(vp, name, kbuf, len, flags, NULL, error);
-	if (error)
-		goto out_kfree;
-
-	if (copy_to_user(ubuf, kbuf, *len))
-		error = EFAULT;
-
- out_kfree:
-	kfree(kbuf);
-	return error;
-}
-
-STATIC int
-xfs_attrmulti_attr_set(
-	struct vnode		*vp,
-	char			*name,
-	const char		__user *ubuf,
-	__uint32_t		len,
-	__uint32_t		flags)
-{
-	char			*kbuf;
-	int			error = EFAULT;
-
-	if (IS_RDONLY(&vp->v_inode))
-		return -EROFS;
-	if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
-		return EPERM;
-	if (len > XATTR_SIZE_MAX)
-		return EINVAL;
-
-	kbuf = kmalloc(len, GFP_KERNEL);
-	if (!kbuf)
-		return ENOMEM;
-
-	if (copy_from_user(kbuf, ubuf, len))
-		goto out_kfree;
-			
-	VOP_ATTR_SET(vp, name, kbuf, len, flags, NULL, error);
-
- out_kfree:
-	kfree(kbuf);
-	return error;
-}
-
-STATIC int
-xfs_attrmulti_attr_remove(
-	struct vnode		*vp,
-	char			*name,
-	__uint32_t		flags)
-{
-	int			error;
-
-
-	if (IS_RDONLY(&vp->v_inode))
-		return -EROFS;
-	if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
-		return EPERM;
-
-	VOP_ATTR_REMOVE(vp, name, flags, NULL, error);
-	return error;
-}
-
-STATIC int
-xfs_attrmulti_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
-	struct file		*parfilp,
-	struct inode		*parinode)
-{
-	int			error;
-	xfs_attr_multiop_t	*ops;
-	xfs_fsop_attrmulti_handlereq_t am_hreq;
-	struct inode		*inode;
-	vnode_t			*vp;
-	unsigned int		i, size;
-	char			*attr_name;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
-	if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
-
-	error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &vp, &inode);
-	if (error)
-		goto out;
-
-	error = E2BIG;
-	size = am_hreq.opcount * sizeof(attr_multiop_t);
-	if (!size || size > 16 * PAGE_SIZE)
-		goto out_vn_rele;
-
-	error = ENOMEM;
-	ops = kmalloc(size, GFP_KERNEL);
-	if (!ops)
-		goto out_vn_rele;
-
-	error = EFAULT;
-	if (copy_from_user(ops, am_hreq.ops, size))
-		goto out_kfree_ops;
-
-	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
-	if (!attr_name)
-		goto out_kfree_ops;
-
-
-	error = 0;
-	for (i = 0; i < am_hreq.opcount; i++) {
-		ops[i].am_error = strncpy_from_user(attr_name,
-				ops[i].am_attrname, MAXNAMELEN);
-		if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
-			error = -ERANGE;
-		if (ops[i].am_error < 0)
-			break;
-
-		switch (ops[i].am_opcode) {
-		case ATTR_OP_GET:
-			ops[i].am_error = xfs_attrmulti_attr_get(vp,
-					attr_name, ops[i].am_attrvalue,
-					&ops[i].am_length, ops[i].am_flags);
-			break;
-		case ATTR_OP_SET:
-			ops[i].am_error = xfs_attrmulti_attr_set(vp,
-					attr_name, ops[i].am_attrvalue,
-					ops[i].am_length, ops[i].am_flags);
-			break;
-		case ATTR_OP_REMOVE:
-			ops[i].am_error = xfs_attrmulti_attr_remove(vp,
-					attr_name, ops[i].am_flags);
-			break;
-		default:
-			ops[i].am_error = EINVAL;
-		}
-	}
-
-	if (copy_to_user(am_hreq.ops, ops, size))
-		error = XFS_ERROR(EFAULT);
-
-	kfree(attr_name);
- out_kfree_ops:
-	kfree(ops);
- out_vn_rele:
-	VN_RELE(vp);
- out:
-	return -error;
-}
-
-/* prototypes for a few of the stack-hungry cases that have
- * their own functions.  Functions are defined after their use
- * so gcc doesn't get fancy and inline them with -03 */
-
-STATIC int
-xfs_ioc_space(
-	bhv_desc_t		*bdp,
-	vnode_t			*vp,
-	struct file		*filp,
-	int			flags,
-	unsigned int		cmd,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_bulkstat(
-	xfs_mount_t		*mp,
-	unsigned int		cmd,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_fsgeometry_v1(
-	xfs_mount_t		*mp,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_fsgeometry(
-	xfs_mount_t		*mp,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_xattr(
-	vnode_t			*vp,
-	xfs_inode_t		*ip,
-	struct file		*filp,
-	unsigned int		cmd,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_getbmap(
-	bhv_desc_t		*bdp,
-	struct file		*filp,
-	int			flags,
-	unsigned int		cmd,
-	void			__user *arg);
-
-STATIC int
-xfs_ioc_getbmapx(
-	bhv_desc_t		*bdp,
-	void			__user *arg);
-
-int
-xfs_ioctl(
-	bhv_desc_t		*bdp,
-	struct inode		*inode,
-	struct file		*filp,
-	int			ioflags,
-	unsigned int		cmd,
-	void			__user *arg)
-{
-	int			error;
-	vnode_t			*vp;
-	xfs_inode_t		*ip;
-	xfs_mount_t		*mp;
-
-	vp = LINVFS_GET_VP(inode);
-
-	vn_trace_entry(vp, "xfs_ioctl", (inst_t *)__return_address);
-
-	ip = XFS_BHVTOI(bdp);
-	mp = ip->i_mount;
-
-	switch (cmd) {
-
-	case XFS_IOC_ALLOCSP:
-	case XFS_IOC_FREESP:
-	case XFS_IOC_RESVSP:
-	case XFS_IOC_UNRESVSP:
-	case XFS_IOC_ALLOCSP64:
-	case XFS_IOC_FREESP64:
-	case XFS_IOC_RESVSP64:
-	case XFS_IOC_UNRESVSP64:
-		/*
-		 * Only allow the sys admin to reserve space unless
-		 * unwritten extents are enabled.
-		 */
-		if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) &&
-		    !capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		return xfs_ioc_space(bdp, vp, filp, ioflags, cmd, arg);
-
-	case XFS_IOC_DIOINFO: {
-		struct dioattr	da;
-		xfs_buftarg_t	*target =
-			(ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
-			mp->m_rtdev_targp : mp->m_ddev_targp;
-
-		da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
-		da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
-
-		if (copy_to_user(arg, &da, sizeof(da)))
-			return -XFS_ERROR(EFAULT);
-		return 0;
-	}
-
-	case XFS_IOC_FSBULKSTAT_SINGLE:
-	case XFS_IOC_FSBULKSTAT:
-	case XFS_IOC_FSINUMBERS:
-		return xfs_ioc_bulkstat(mp, cmd, arg);
-
-	case XFS_IOC_FSGEOMETRY_V1:
-		return xfs_ioc_fsgeometry_v1(mp, arg);
-
-	case XFS_IOC_FSGEOMETRY:
-		return xfs_ioc_fsgeometry(mp, arg);
-
-	case XFS_IOC_GETVERSION:
-	case XFS_IOC_GETXFLAGS:
-	case XFS_IOC_SETXFLAGS:
-	case XFS_IOC_FSGETXATTR:
-	case XFS_IOC_FSSETXATTR:
-	case XFS_IOC_FSGETXATTRA:
-		return xfs_ioc_xattr(vp, ip, filp, cmd, arg);
-
-	case XFS_IOC_FSSETDM: {
-		struct fsdmidata	dmi;
-
-		if (copy_from_user(&dmi, arg, sizeof(dmi)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_set_dmattrs(bdp, dmi.fsd_dmevmask, dmi.fsd_dmstate,
-							NULL);
-		return -error;
-	}
-
-	case XFS_IOC_GETBMAP:
-	case XFS_IOC_GETBMAPA:
-		return xfs_ioc_getbmap(bdp, filp, ioflags, cmd, arg);
-
-	case XFS_IOC_GETBMAPX:
-		return xfs_ioc_getbmapx(bdp, arg);
-
-	case XFS_IOC_FD_TO_HANDLE:
-	case XFS_IOC_PATH_TO_HANDLE:
-	case XFS_IOC_PATH_TO_FSHANDLE:
-		return xfs_find_handle(cmd, arg);
-
-	case XFS_IOC_OPEN_BY_HANDLE:
-		return xfs_open_by_handle(mp, arg, filp, inode);
-
-	case XFS_IOC_FSSETDM_BY_HANDLE:
-		return xfs_fssetdm_by_handle(mp, arg, filp, inode);
-
-	case XFS_IOC_READLINK_BY_HANDLE:
-		return xfs_readlink_by_handle(mp, arg, filp, inode);
-
-	case XFS_IOC_ATTRLIST_BY_HANDLE:
-		return xfs_attrlist_by_handle(mp, arg, filp, inode);
-
-	case XFS_IOC_ATTRMULTI_BY_HANDLE:
-		return xfs_attrmulti_by_handle(mp, arg, filp, inode);
-
-	case XFS_IOC_SWAPEXT: {
-		error = xfs_swapext((struct xfs_swapext __user *)arg);
-		return -error;
-	}
-
-	case XFS_IOC_FSCOUNTS: {
-		xfs_fsop_counts_t out;
-
-		error = xfs_fs_counts(mp, &out);
-		if (error)
-			return -error;
-
-		if (copy_to_user(arg, &out, sizeof(out)))
-			return -XFS_ERROR(EFAULT);
-		return 0;
-	}
-
-	case XFS_IOC_SET_RESBLKS: {
-		xfs_fsop_resblks_t inout;
-		__uint64_t	   in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&inout, arg, sizeof(inout)))
-			return -XFS_ERROR(EFAULT);
-
-		/* input parameter is passed in resblks field of structure */
-		in = inout.resblks;
-		error = xfs_reserve_blocks(mp, &in, &inout);
-		if (error)
-			return -error;
-
-		if (copy_to_user(arg, &inout, sizeof(inout)))
-			return -XFS_ERROR(EFAULT);
-		return 0;
-	}
-
-	case XFS_IOC_GET_RESBLKS: {
-		xfs_fsop_resblks_t out;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		error = xfs_reserve_blocks(mp, NULL, &out);
-		if (error)
-			return -error;
-
-		if (copy_to_user(arg, &out, sizeof(out)))
-			return -XFS_ERROR(EFAULT);
-
-		return 0;
-	}
-
-	case XFS_IOC_FSGROWFSDATA: {
-		xfs_growfs_data_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_growfs_data(mp, &in);
-		return -error;
-	}
-
-	case XFS_IOC_FSGROWFSLOG: {
-		xfs_growfs_log_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_growfs_log(mp, &in);
-		return -error;
-	}
-
-	case XFS_IOC_FSGROWFSRT: {
-		xfs_growfs_rt_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_growfs_rt(mp, &in);
-		return -error;
-	}
-
-	case XFS_IOC_FREEZE:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (inode->i_sb->s_frozen == SB_UNFROZEN)
-			freeze_bdev(inode->i_sb->s_bdev);
-		return 0;
-
-	case XFS_IOC_THAW:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-		if (inode->i_sb->s_frozen != SB_UNFROZEN)
-			thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
-		return 0;
-
-	case XFS_IOC_GOINGDOWN: {
-		__uint32_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (get_user(in, (__uint32_t __user *)arg))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_fs_goingdown(mp, in);
-		return -error;
-	}
-
-	case XFS_IOC_ERROR_INJECTION: {
-		xfs_error_injection_t in;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
-
-		error = xfs_errortag_add(in.errtag, mp);
-		return -error;
-	}
-
-	case XFS_IOC_ERROR_CLEARALL:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		error = xfs_errortag_clearall(mp);
-		return -error;
-
-	default:
-		return -ENOTTY;
-	}
-}
-
-STATIC int
-xfs_ioc_space(
-	bhv_desc_t		*bdp,
-	vnode_t			*vp,
-	struct file		*filp,
-	int			ioflags,
-	unsigned int		cmd,
-	void			__user *arg)
-{
-	xfs_flock64_t		bf;
-	int			attr_flags = 0;
-	int			error;
-
-	if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
-		return -XFS_ERROR(EPERM);
-
-	if (!(filp->f_mode & FMODE_WRITE))
-		return -XFS_ERROR(EBADF);
-
-	if (!VN_ISREG(vp))
-		return -XFS_ERROR(EINVAL);
-
-	if (copy_from_user(&bf, arg, sizeof(bf)))
-		return -XFS_ERROR(EFAULT);
-
-	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-		attr_flags |= ATTR_NONBLOCK;
-	if (ioflags & IO_INVIS)
-		attr_flags |= ATTR_DMI;
-
-	error = xfs_change_file_space(bdp, cmd, &bf, filp->f_pos,
-					      NULL, attr_flags);
-	return -error;
-}
-
-STATIC int
-xfs_ioc_bulkstat(
-	xfs_mount_t		*mp,
-	unsigned int		cmd,
-	void			__user *arg)
-{
-	xfs_fsop_bulkreq_t	bulkreq;
-	int			count;	/* # of records returned */
-	xfs_ino_t		inlast;	/* last inode number */
-	int			done;
-	int			error;
-
-	/* done = 1 if there are more stats to get and if bulkstat */
-	/* should be called again (unused here, but used in dmapi) */
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
-
-	if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
-		return -XFS_ERROR(EFAULT);
-
-	if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
-		return -XFS_ERROR(EFAULT);
-
-	if ((count = bulkreq.icount) <= 0)
-		return -XFS_ERROR(EINVAL);
-
-	if (cmd == XFS_IOC_FSINUMBERS)
-		error = xfs_inumbers(mp, &inlast, &count,
-						bulkreq.ubuffer);
-	else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
-		error = xfs_bulkstat_single(mp, &inlast,
-						bulkreq.ubuffer, &done);
-	else {	/* XFS_IOC_FSBULKSTAT */
-		if (count == 1 && inlast != 0) {
-			inlast++;
-			error = xfs_bulkstat_single(mp, &inlast,
-					bulkreq.ubuffer, &done);
-		} else {
-			error = xfs_bulkstat(mp, &inlast, &count,
-				(bulkstat_one_pf)xfs_bulkstat_one, NULL,
-				sizeof(xfs_bstat_t), bulkreq.ubuffer,
-				BULKSTAT_FG_QUICK, &done);
-		}
-	}
-
-	if (error)
-		return -error;
-
-	if (bulkreq.ocount != NULL) {
-		if (copy_to_user(bulkreq.lastip, &inlast,
-						sizeof(xfs_ino_t)))
-			return -XFS_ERROR(EFAULT);
-
-		if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
-			return -XFS_ERROR(EFAULT);
-	}
-
-	return 0;
-}
-
-STATIC int
-xfs_ioc_fsgeometry_v1(
-	xfs_mount_t		*mp,
-	void			__user *arg)
-{
-	xfs_fsop_geom_v1_t	fsgeo;
-	int			error;
-
-	error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3);
-	if (error)
-		return -error;
-
-	if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
-		return -XFS_ERROR(EFAULT);
-	return 0;
-}
-
-STATIC int
-xfs_ioc_fsgeometry(
-	xfs_mount_t		*mp,
-	void			__user *arg)
-{
-	xfs_fsop_geom_t		fsgeo;
-	int			error;
-
-	error = xfs_fs_geometry(mp, &fsgeo, 4);
-	if (error)
-		return -error;
-
-	if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
-		return -XFS_ERROR(EFAULT);
-	return 0;
-}
-
-/*
- * Linux extended inode flags interface.
- */
-#define LINUX_XFLAG_SYNC	0x00000008 /* Synchronous updates */
-#define LINUX_XFLAG_IMMUTABLE	0x00000010 /* Immutable file */
-#define LINUX_XFLAG_APPEND	0x00000020 /* writes to file may only append */
-#define LINUX_XFLAG_NODUMP	0x00000040 /* do not dump file */
-#define LINUX_XFLAG_NOATIME	0x00000080 /* do not update atime */
-
-STATIC unsigned int
-xfs_merge_ioc_xflags(
-	unsigned int	flags,
-	unsigned int	start)
-{
-	unsigned int	xflags = start;
-
-	if (flags & LINUX_XFLAG_IMMUTABLE)
-		xflags |= XFS_XFLAG_IMMUTABLE;
-	else
-		xflags &= ~XFS_XFLAG_IMMUTABLE;
-	if (flags & LINUX_XFLAG_APPEND)
-		xflags |= XFS_XFLAG_APPEND;
-	else
-		xflags &= ~XFS_XFLAG_APPEND;
-	if (flags & LINUX_XFLAG_SYNC)
-		xflags |= XFS_XFLAG_SYNC;
-	else
-		xflags &= ~XFS_XFLAG_SYNC;
-	if (flags & LINUX_XFLAG_NOATIME)
-		xflags |= XFS_XFLAG_NOATIME;
-	else
-		xflags &= ~XFS_XFLAG_NOATIME;
-	if (flags & LINUX_XFLAG_NODUMP)
-		xflags |= XFS_XFLAG_NODUMP;
-	else
-		xflags &= ~XFS_XFLAG_NODUMP;
-
-	return xflags;
-}
-
-STATIC unsigned int
-xfs_di2lxflags(
-	__uint16_t	di_flags)
-{
-	unsigned int	flags = 0;
-
-	if (di_flags & XFS_DIFLAG_IMMUTABLE)
-		flags |= LINUX_XFLAG_IMMUTABLE;
-	if (di_flags & XFS_DIFLAG_APPEND)
-		flags |= LINUX_XFLAG_APPEND;
-	if (di_flags & XFS_DIFLAG_SYNC)
-		flags |= LINUX_XFLAG_SYNC;
-	if (di_flags & XFS_DIFLAG_NOATIME)
-		flags |= LINUX_XFLAG_NOATIME;
-	if (di_flags & XFS_DIFLAG_NODUMP)
-		flags |= LINUX_XFLAG_NODUMP;
-	return flags;
-}
-
-STATIC int
-xfs_ioc_xattr(
-	vnode_t			*vp,
-	xfs_inode_t		*ip,
-	struct file		*filp,
-	unsigned int		cmd,
-	void			__user *arg)
-{
-	struct fsxattr		fa;
-	vattr_t			va;
-	int			error;
-	int			attr_flags;
-	unsigned int		flags;
-
-	switch (cmd) {
-	case XFS_IOC_FSGETXATTR: {
-		va.va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \
-			     XFS_AT_NEXTENTS | XFS_AT_PROJID;
-		VOP_GETATTR(vp, &va, 0, NULL, error);
-		if (error)
-			return -error;
-
-		fa.fsx_xflags	= va.va_xflags;
-		fa.fsx_extsize	= va.va_extsize;
-		fa.fsx_nextents = va.va_nextents;
-		fa.fsx_projid	= va.va_projid;
-
-		if (copy_to_user(arg, &fa, sizeof(fa)))
-			return -XFS_ERROR(EFAULT);
-		return 0;
-	}
-
-	case XFS_IOC_FSSETXATTR: {
-		if (copy_from_user(&fa, arg, sizeof(fa)))
-			return -XFS_ERROR(EFAULT);
-
-		attr_flags = 0;
-		if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-			attr_flags |= ATTR_NONBLOCK;
-
-		va.va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID;
-		va.va_xflags  = fa.fsx_xflags;
-		va.va_extsize = fa.fsx_extsize;
-		va.va_projid  = fa.fsx_projid;
-
-		VOP_SETATTR(vp, &va, attr_flags, NULL, error);
-		if (!error)
-			vn_revalidate(vp);	/* update Linux inode flags */
-		return -error;
-	}
-
-	case XFS_IOC_FSGETXATTRA: {
-		va.va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \
-			     XFS_AT_ANEXTENTS | XFS_AT_PROJID;
-		VOP_GETATTR(vp, &va, 0, NULL, error);
-		if (error)
-			return -error;
-
-		fa.fsx_xflags	= va.va_xflags;
-		fa.fsx_extsize	= va.va_extsize;
-		fa.fsx_nextents = va.va_anextents;
-		fa.fsx_projid	= va.va_projid;
-
-		if (copy_to_user(arg, &fa, sizeof(fa)))
-			return -XFS_ERROR(EFAULT);
-		return 0;
-	}
-
-	case XFS_IOC_GETXFLAGS: {
-		flags = xfs_di2lxflags(ip->i_d.di_flags);
-		if (copy_to_user(arg, &flags, sizeof(flags)))
-			return -XFS_ERROR(EFAULT);
-		return 0;
-	}
-
-	case XFS_IOC_SETXFLAGS: {
-		if (copy_from_user(&flags, arg, sizeof(flags)))
-			return -XFS_ERROR(EFAULT);
-
-		if (flags & ~(LINUX_XFLAG_IMMUTABLE | LINUX_XFLAG_APPEND | \
-			      LINUX_XFLAG_NOATIME | LINUX_XFLAG_NODUMP | \
-			      LINUX_XFLAG_SYNC))
-			return -XFS_ERROR(EOPNOTSUPP);
-
-		attr_flags = 0;
-		if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-			attr_flags |= ATTR_NONBLOCK;
-
-		va.va_mask = XFS_AT_XFLAGS;
-		va.va_xflags = xfs_merge_ioc_xflags(flags,
-				xfs_ip2xflags(ip));
-
-		VOP_SETATTR(vp, &va, attr_flags, NULL, error);
-		if (!error)
-			vn_revalidate(vp);	/* update Linux inode flags */
-		return -error;
-	}
-
-	case XFS_IOC_GETVERSION: {
-		flags = LINVFS_GET_IP(vp)->i_generation;
-		if (copy_to_user(arg, &flags, sizeof(flags)))
-			return -XFS_ERROR(EFAULT);
-		return 0;
-	}
-
-	default:
-		return -ENOTTY;
-	}
-}
-
-STATIC int
-xfs_ioc_getbmap(
-	bhv_desc_t		*bdp,
-	struct file		*filp,
-	int			ioflags,
-	unsigned int		cmd,
-	void			__user *arg)
-{
-	struct getbmap		bm;
-	int			iflags;
-	int			error;
-
-	if (copy_from_user(&bm, arg, sizeof(bm)))
-		return -XFS_ERROR(EFAULT);
-
-	if (bm.bmv_count < 2)
-		return -XFS_ERROR(EINVAL);
-
-	iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
-	if (ioflags & IO_INVIS)
-		iflags |= BMV_IF_NO_DMAPI_READ;
-
-	error = xfs_getbmap(bdp, &bm, (struct getbmap __user *)arg+1, iflags);
-	if (error)
-		return -error;
-
-	if (copy_to_user(arg, &bm, sizeof(bm)))
-		return -XFS_ERROR(EFAULT);
-	return 0;
-}
-
-STATIC int
-xfs_ioc_getbmapx(
-	bhv_desc_t		*bdp,
-	void			__user *arg)
-{
-	struct getbmapx		bmx;
-	struct getbmap		bm;
-	int			iflags;
-	int			error;
-
-	if (copy_from_user(&bmx, arg, sizeof(bmx)))
-		return -XFS_ERROR(EFAULT);
-
-	if (bmx.bmv_count < 2)
-		return -XFS_ERROR(EINVAL);
-
-	/*
-	 * Map input getbmapx structure to a getbmap
-	 * structure for xfs_getbmap.
-	 */
-	GETBMAP_CONVERT(bmx, bm);
-
-	iflags = bmx.bmv_iflags;
-
-	if (iflags & (~BMV_IF_VALID))
-		return -XFS_ERROR(EINVAL);
-
-	iflags |= BMV_IF_EXTENDED;
-
-	error = xfs_getbmap(bdp, &bm, (struct getbmapx __user *)arg+1, iflags);
-	if (error)
-		return -error;
-
-	GETBMAP_CONVERT(bm, bmx);
-
-	if (copy_to_user(arg, &bmx, sizeof(bmx)))
-		return -XFS_ERROR(EFAULT);
-
-	return 0;
-}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
deleted file mode 100644
index a7c9ba1a9f7..00000000000
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2004-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include <linux/config.h>
-#include <linux/compat.h>
-#include <linux/init.h>
-#include <linux/ioctl.h>
-#include <linux/syscalls.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <asm/uaccess.h>
-#include "xfs.h"
-#include "xfs_types.h"
-#include "xfs_fs.h"
-#include "xfs_vfs.h"
-#include "xfs_vnode.h"
-#include "xfs_dfrag.h"
-
-#define  _NATIVE_IOC(cmd, type) \
-	  _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
-
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
-#define BROKEN_X86_ALIGNMENT
-/* on ia32 l_start is on a 32-bit boundary */
-typedef struct xfs_flock64_32 {
-	__s16		l_type;
-	__s16		l_whence;
-	__s64		l_start	__attribute__((packed));
-			/* len == 0 means until end of file */
-	__s64		l_len __attribute__((packed));
-	__s32		l_sysid;
-	__u32		l_pid;
-	__s32		l_pad[4];	/* reserve area */
-} xfs_flock64_32_t;
-
-#define XFS_IOC_ALLOCSP_32	_IOW ('X', 10, struct xfs_flock64_32)
-#define XFS_IOC_FREESP_32	_IOW ('X', 11, struct xfs_flock64_32)
-#define XFS_IOC_ALLOCSP64_32	_IOW ('X', 36, struct xfs_flock64_32)
-#define XFS_IOC_FREESP64_32	_IOW ('X', 37, struct xfs_flock64_32)
-#define XFS_IOC_RESVSP_32	_IOW ('X', 40, struct xfs_flock64_32)
-#define XFS_IOC_UNRESVSP_32	_IOW ('X', 41, struct xfs_flock64_32)
-#define XFS_IOC_RESVSP64_32	_IOW ('X', 42, struct xfs_flock64_32)
-#define XFS_IOC_UNRESVSP64_32	_IOW ('X', 43, struct xfs_flock64_32)
-
-/* just account for different alignment */
-STATIC unsigned long
-xfs_ioctl32_flock(
-	unsigned long		arg)
-{
-	xfs_flock64_32_t	__user *p32 = (void __user *)arg;
-	xfs_flock64_t		__user *p = compat_alloc_user_space(sizeof(*p));
-
-	if (copy_in_user(&p->l_type,	&p32->l_type,	sizeof(s16)) ||
-	    copy_in_user(&p->l_whence,	&p32->l_whence, sizeof(s16)) ||
-	    copy_in_user(&p->l_start,	&p32->l_start,	sizeof(s64)) ||
-	    copy_in_user(&p->l_len,	&p32->l_len,	sizeof(s64)) ||
-	    copy_in_user(&p->l_sysid,	&p32->l_sysid,	sizeof(s32)) ||
-	    copy_in_user(&p->l_pid,	&p32->l_pid,	sizeof(u32)) ||
-	    copy_in_user(&p->l_pad,	&p32->l_pad,	4*sizeof(u32)))
-		return -EFAULT;
-	
-	return (unsigned long)p;
-}
-
-#else
-
-typedef struct xfs_fsop_bulkreq32 {
-	compat_uptr_t	lastip;		/* last inode # pointer		*/
-	__s32		icount;		/* count of entries in buffer	*/
-	compat_uptr_t	ubuffer;	/* user buffer for inode desc.	*/
-	__s32		ocount;		/* output count pointer		*/
-} xfs_fsop_bulkreq32_t;
-
-STATIC unsigned long
-xfs_ioctl32_bulkstat(
-	unsigned long		arg)
-{
-	xfs_fsop_bulkreq32_t	__user *p32 = (void __user *)arg;
-	xfs_fsop_bulkreq_t	__user *p = compat_alloc_user_space(sizeof(*p));
-	u32			addr;
-
-	if (get_user(addr, &p32->lastip) ||
-	    put_user(compat_ptr(addr), &p->lastip) ||
-	    copy_in_user(&p->icount, &p32->icount, sizeof(s32)) ||
-	    get_user(addr, &p32->ubuffer) ||
-	    put_user(compat_ptr(addr), &p->ubuffer) ||
-	    get_user(addr, &p32->ocount) ||
-	    put_user(compat_ptr(addr), &p->ocount))
-		return -EFAULT;
-
-	return (unsigned long)p;
-}
-#endif
-
-STATIC long
-__linvfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg)
-{
-	int		error;
-	struct		inode *inode = f->f_dentry->d_inode;
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-
-	switch (cmd) {
-	case XFS_IOC_DIOINFO:
-	case XFS_IOC_FSGEOMETRY_V1:
-	case XFS_IOC_FSGEOMETRY:
-	case XFS_IOC_GETVERSION:
-	case XFS_IOC_GETXFLAGS:
-	case XFS_IOC_SETXFLAGS:
-	case XFS_IOC_FSGETXATTR:
-	case XFS_IOC_FSSETXATTR:
-	case XFS_IOC_FSGETXATTRA:
-	case XFS_IOC_FSSETDM:
-	case XFS_IOC_GETBMAP:
-	case XFS_IOC_GETBMAPA:
-	case XFS_IOC_GETBMAPX:
-/* not handled
-	case XFS_IOC_FD_TO_HANDLE:
-	case XFS_IOC_PATH_TO_HANDLE:
-	case XFS_IOC_PATH_TO_FSHANDLE:
-	case XFS_IOC_OPEN_BY_HANDLE:
-	case XFS_IOC_FSSETDM_BY_HANDLE:
-	case XFS_IOC_READLINK_BY_HANDLE:
-	case XFS_IOC_ATTRLIST_BY_HANDLE:
-	case XFS_IOC_ATTRMULTI_BY_HANDLE:
-*/
-	case XFS_IOC_FSCOUNTS:
-	case XFS_IOC_SET_RESBLKS:
-	case XFS_IOC_GET_RESBLKS:
-	case XFS_IOC_FSGROWFSDATA:
-	case XFS_IOC_FSGROWFSLOG:
-	case XFS_IOC_FSGROWFSRT:
-	case XFS_IOC_FREEZE:
-	case XFS_IOC_THAW:
-	case XFS_IOC_GOINGDOWN:
-	case XFS_IOC_ERROR_INJECTION:
-	case XFS_IOC_ERROR_CLEARALL:
-		break;
-
-#ifdef BROKEN_X86_ALIGNMENT
-	/* xfs_flock_t has wrong u32 vs u64 alignment */
-	case XFS_IOC_ALLOCSP_32:
-	case XFS_IOC_FREESP_32:
-	case XFS_IOC_ALLOCSP64_32:
-	case XFS_IOC_FREESP64_32:
-	case XFS_IOC_RESVSP_32:
-	case XFS_IOC_UNRESVSP_32:
-	case XFS_IOC_RESVSP64_32:
-	case XFS_IOC_UNRESVSP64_32:
-		arg = xfs_ioctl32_flock(arg);
-		cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
-		break;
-
-#else /* These are handled fine if no alignment issues */
-	case XFS_IOC_ALLOCSP:
-	case XFS_IOC_FREESP:
-	case XFS_IOC_RESVSP:
-	case XFS_IOC_UNRESVSP:
-	case XFS_IOC_ALLOCSP64:
-	case XFS_IOC_FREESP64:
-	case XFS_IOC_RESVSP64:
-	case XFS_IOC_UNRESVSP64:
-		break;
-
-	/* xfs_bstat_t still has wrong u32 vs u64 alignment */
-	case XFS_IOC_SWAPEXT:
-		break;
-
-	case XFS_IOC_FSBULKSTAT_SINGLE:
-	case XFS_IOC_FSBULKSTAT:
-	case XFS_IOC_FSINUMBERS:
-		arg = xfs_ioctl32_bulkstat(arg);
-		break;
-#endif
-	default:
-		return -ENOIOCTLCMD;
-	}
-
-	VOP_IOCTL(vp, inode, f, mode, cmd, (void __user *)arg, error);
-	VMODIFY(vp);
-
-	return error;
-}
-
-long
-linvfs_compat_ioctl(
-	struct file		*f,
-	unsigned		cmd,
-	unsigned long		arg)
-{
-	return __linvfs_compat_ioctl(0, f, cmd, arg);
-}
-
-long
-linvfs_compat_invis_ioctl(
-	struct file		*f,
-	unsigned		cmd,
-	unsigned long		arg)
-{
-	return __linvfs_compat_ioctl(IO_INVIS, f, cmd, arg);
-}
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
deleted file mode 100644
index eda7919b70a..00000000000
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ /dev/null
@@ -1,858 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-
-#include <linux/capability.h>
-#include <linux/xattr.h>
-#include <linux/namei.h>
-#include <linux/security.h>
-
-/*
- * Get a XFS inode from a given vnode.
- */
-xfs_inode_t *
-xfs_vtoi(
-	struct vnode	*vp)
-{
-	bhv_desc_t      *bdp;
-
-	bdp = bhv_lookup_range(VN_BHV_HEAD(vp),
-			VNODE_POSITION_XFS, VNODE_POSITION_XFS);
-	if (unlikely(bdp == NULL))
-		return NULL;
-	return XFS_BHVTOI(bdp);
-}
-
-/*
- * Bring the atime in the XFS inode uptodate.
- * Used before logging the inode to disk or when the Linux inode goes away.
- */
-void
-xfs_synchronize_atime(
-	xfs_inode_t	*ip)
-{
-	vnode_t		*vp;
-
-	vp = XFS_ITOV_NULL(ip);
-	if (vp) {
-		struct inode *inode = &vp->v_inode;
-		ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
-		ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
-	}
-}
-
-/*
- * Change the requested timestamp in the given inode.
- * We don't lock across timestamp updates, and we don't log them but
- * we do record the fact that there is dirty information in core.
- *
- * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG
- *		with XFS_ICHGTIME_ACC to be sure that access time
- *		update will take.  Calling first with XFS_ICHGTIME_ACC
- *		and then XFS_ICHGTIME_MOD may fail to modify the access
- *		timestamp if the filesystem is mounted noacctm.
- */
-void
-xfs_ichgtime(
-	xfs_inode_t	*ip,
-	int		flags)
-{
-	struct inode	*inode = LINVFS_GET_IP(XFS_ITOV(ip));
-	timespec_t	tv;
-
-	nanotime(&tv);
-	if (flags & XFS_ICHGTIME_MOD) {
-		inode->i_mtime = tv;
-		ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
-		ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
-	}
-	if (flags & XFS_ICHGTIME_ACC) {
-		inode->i_atime = tv;
-		ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec;
-		ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec;
-	}
-	if (flags & XFS_ICHGTIME_CHG) {
-		inode->i_ctime = tv;
-		ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
-		ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
-	}
-
-	/*
-	 * We update the i_update_core field _after_ changing
-	 * the timestamps in order to coordinate properly with
-	 * xfs_iflush() so that we don't lose timestamp updates.
-	 * This keeps us from having to hold the inode lock
-	 * while doing this.  We use the SYNCHRONIZE macro to
-	 * ensure that the compiler does not reorder the update
-	 * of i_update_core above the timestamp updates above.
-	 */
-	SYNCHRONIZE();
-	ip->i_update_core = 1;
-	if (!(inode->i_state & I_LOCK))
-		mark_inode_dirty_sync(inode);
-}
-
-/*
- * Variant on the above which avoids querying the system clock
- * in situations where we know the Linux inode timestamps have
- * just been updated (and so we can update our inode cheaply).
- */
-void
-xfs_ichgtime_fast(
-	xfs_inode_t	*ip,
-	struct inode	*inode,
-	int		flags)
-{
-	timespec_t	*tvp;
-
-	/*
-	 * Atime updates for read() & friends are handled lazily now, and
-	 * explicit updates must go through xfs_ichgtime()
-	 */
-	ASSERT((flags & XFS_ICHGTIME_ACC) == 0);
-
-	/*
-	 * We're not supposed to change timestamps in readonly-mounted
-	 * filesystems.  Throw it away if anyone asks us.
-	 */
-	if (unlikely(IS_RDONLY(inode)))
-		return;
-
-	if (flags & XFS_ICHGTIME_MOD) {
-		tvp = &inode->i_mtime;
-		ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec;
-		ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec;
-	}
-	if (flags & XFS_ICHGTIME_CHG) {
-		tvp = &inode->i_ctime;
-		ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec;
-		ip->i_d.di_ctime.t_nsec = (__int32_t)tvp->tv_nsec;
-	}
-
-	/*
-	 * We update the i_update_core field _after_ changing
-	 * the timestamps in order to coordinate properly with
-	 * xfs_iflush() so that we don't lose timestamp updates.
-	 * This keeps us from having to hold the inode lock
-	 * while doing this.  We use the SYNCHRONIZE macro to
-	 * ensure that the compiler does not reorder the update
-	 * of i_update_core above the timestamp updates above.
-	 */
-	SYNCHRONIZE();
-	ip->i_update_core = 1;
-	if (!(inode->i_state & I_LOCK))
-		mark_inode_dirty_sync(inode);
-}
-
-
-/*
- * Pull the link count and size up from the xfs inode to the linux inode
- */
-STATIC void
-validate_fields(
-	struct inode	*ip)
-{
-	vnode_t		*vp = LINVFS_GET_VP(ip);
-	vattr_t		va;
-	int		error;
-
-	va.va_mask = XFS_AT_NLINK|XFS_AT_SIZE|XFS_AT_NBLOCKS;
-	VOP_GETATTR(vp, &va, ATTR_LAZY, NULL, error);
-	if (likely(!error)) {
-		ip->i_nlink = va.va_nlink;
-		ip->i_blocks = va.va_nblocks;
-
-		/* we're under i_mutex so i_size can't change under us */
-		if (i_size_read(ip) != va.va_size)
-			i_size_write(ip, va.va_size);
-	}
-}
-
-/*
- * Hook in SELinux.  This is not quite correct yet, what we really need
- * here (as we do for default ACLs) is a mechanism by which creation of
- * these attrs can be journalled at inode creation time (along with the
- * inode, of course, such that log replay can't cause these to be lost).
- */
-STATIC int
-linvfs_init_security(
-	struct vnode	*vp,
-	struct inode	*dir)
-{
-	struct inode	*ip = LINVFS_GET_IP(vp);
-	size_t		length;
-	void		*value;
-	char		*name;
-	int		error;
-
-	error = security_inode_init_security(ip, dir, &name, &value, &length);
-	if (error) {
-		if (error == -EOPNOTSUPP)
-			return 0;
-		return -error;
-	}
-
-	VOP_ATTR_SET(vp, name, value, length, ATTR_SECURE, NULL, error);
-	if (!error)
-		VMODIFY(vp);
-
-	kfree(name);
-	kfree(value);
-	return error;
-}
-
-/*
- * Determine whether a process has a valid fs_struct (kernel daemons
- * like knfsd don't have an fs_struct).
- *
- * XXX(hch):  nfsd is broken, better fix it instead.
- */
-STATIC inline int
-has_fs_struct(struct task_struct *task)
-{
-	return (task->fs != init_task.fs);
-}
-
-STATIC inline void
-cleanup_inode(
-	vnode_t		*dvp,
-	vnode_t		*vp,
-	struct dentry	*dentry,	
-	int		mode)
-{
-	struct dentry   teardown = {};
-	int             err2;
-
-	/* Oh, the horror.
-	 * If we can't add the ACL or we fail in 
-	 * linvfs_init_security we must back out.
-	 * ENOSPC can hit here, among other things.
-	 */
-	teardown.d_inode = LINVFS_GET_IP(vp);
-	teardown.d_name = dentry->d_name;
-
-	if (S_ISDIR(mode))
-	  	VOP_RMDIR(dvp, &teardown, NULL, err2);
-	else
-		VOP_REMOVE(dvp, &teardown, NULL, err2);
-	VN_RELE(vp);
-}
-
-STATIC int
-linvfs_mknod(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	int		mode,
-	dev_t		rdev)
-{
-	struct inode	*ip;
-	vattr_t		va;
-	vnode_t		*vp = NULL, *dvp = LINVFS_GET_VP(dir);
-	xfs_acl_t	*default_acl = NULL;
-	attrexists_t	test_default_acl = _ACL_DEFAULT_EXISTS;
-	int		error;
-
-	/*
-	 * Irix uses Missed'em'V split, but doesn't want to see
-	 * the upper 5 bits of (14bit) major.
-	 */
-	if (!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)
-		return -EINVAL;
-
-	if (test_default_acl && test_default_acl(dvp)) {
-		if (!_ACL_ALLOC(default_acl))
-			return -ENOMEM;
-		if (!_ACL_GET_DEFAULT(dvp, default_acl)) {
-			_ACL_FREE(default_acl);
-			default_acl = NULL;
-		}
-	}
-
-	if (IS_POSIXACL(dir) && !default_acl && has_fs_struct(current))
-		mode &= ~current->fs->umask;
-
-	memset(&va, 0, sizeof(va));
-	va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
-	va.va_mode = mode;
-
-	switch (mode & S_IFMT) {
-	case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK:
-		va.va_rdev = sysv_encode_dev(rdev);
-		va.va_mask |= XFS_AT_RDEV;
-		/*FALLTHROUGH*/
-	case S_IFREG:
-		VOP_CREATE(dvp, dentry, &va, &vp, NULL, error);
-		break;
-	case S_IFDIR:
-		VOP_MKDIR(dvp, dentry, &va, &vp, NULL, error);
-		break;
-	default:
-		error = EINVAL;
-		break;
-	}
-
-	if (!error)
-	{
-		error = linvfs_init_security(vp, dir);
-		if (error)
-			cleanup_inode(dvp, vp, dentry, mode);
-	}
-
-	if (default_acl) {
-		if (!error) {
-			error = _ACL_INHERIT(vp, &va, default_acl);
-			if (!error) 
-				VMODIFY(vp);
-			else
-				cleanup_inode(dvp, vp, dentry, mode);
-		}
-		_ACL_FREE(default_acl);
-	}
-
-	if (!error) {
-		ASSERT(vp);
-		ip = LINVFS_GET_IP(vp);
-
-		if (S_ISCHR(mode) || S_ISBLK(mode))
-			ip->i_rdev = rdev;
-		else if (S_ISDIR(mode))
-			validate_fields(ip);
-		d_instantiate(dentry, ip);
-		validate_fields(dir);
-	}
-	return -error;
-}
-
-STATIC int
-linvfs_create(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	int		mode,
-	struct nameidata *nd)
-{
-	return linvfs_mknod(dir, dentry, mode, 0);
-}
-
-STATIC int
-linvfs_mkdir(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	int		mode)
-{
-	return linvfs_mknod(dir, dentry, mode|S_IFDIR, 0);
-}
-
-STATIC struct dentry *
-linvfs_lookup(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	struct nameidata *nd)
-{
-	struct vnode	*vp = LINVFS_GET_VP(dir), *cvp;
-	int		error;
-
-	if (dentry->d_name.len >= MAXNAMELEN)
-		return ERR_PTR(-ENAMETOOLONG);
-
-	VOP_LOOKUP(vp, dentry, &cvp, 0, NULL, NULL, error);
-	if (error) {
-		if (unlikely(error != ENOENT))
-			return ERR_PTR(-error);
-		d_add(dentry, NULL);
-		return NULL;
-	}
-
-	return d_splice_alias(LINVFS_GET_IP(cvp), dentry);
-}
-
-STATIC int
-linvfs_link(
-	struct dentry	*old_dentry,
-	struct inode	*dir,
-	struct dentry	*dentry)
-{
-	struct inode	*ip;	/* inode of guy being linked to */
-	vnode_t		*tdvp;	/* target directory for new name/link */
-	vnode_t		*vp;	/* vp of name being linked */
-	int		error;
-
-	ip = old_dentry->d_inode;	/* inode being linked to */
-	if (S_ISDIR(ip->i_mode))
-		return -EPERM;
-
-	tdvp = LINVFS_GET_VP(dir);
-	vp = LINVFS_GET_VP(ip);
-
-	VOP_LINK(tdvp, vp, dentry, NULL, error);
-	if (!error) {
-		VMODIFY(tdvp);
-		VN_HOLD(vp);
-		validate_fields(ip);
-		d_instantiate(dentry, ip);
-	}
-	return -error;
-}
-
-STATIC int
-linvfs_unlink(
-	struct inode	*dir,
-	struct dentry	*dentry)
-{
-	struct inode	*inode;
-	vnode_t		*dvp;	/* directory containing name to remove */
-	int		error;
-
-	inode = dentry->d_inode;
-	dvp = LINVFS_GET_VP(dir);
-
-	VOP_REMOVE(dvp, dentry, NULL, error);
-	if (!error) {
-		validate_fields(dir);	/* For size only */
-		validate_fields(inode);
-	}
-
-	return -error;
-}
-
-STATIC int
-linvfs_symlink(
-	struct inode	*dir,
-	struct dentry	*dentry,
-	const char	*symname)
-{
-	struct inode	*ip;
-	vattr_t		va;
-	vnode_t		*dvp;	/* directory containing name of symlink */
-	vnode_t		*cvp;	/* used to lookup symlink to put in dentry */
-	int		error;
-
-	dvp = LINVFS_GET_VP(dir);
-	cvp = NULL;
-
-	memset(&va, 0, sizeof(va));
-	va.va_mode = S_IFLNK |
-		(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
-	va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
-
-	error = 0;
-	VOP_SYMLINK(dvp, dentry, &va, (char *)symname, &cvp, NULL, error);
-	if (likely(!error && cvp)) {
-		error = linvfs_init_security(cvp, dir);
-		if (likely(!error)) {
-			ip = LINVFS_GET_IP(cvp);
-			d_instantiate(dentry, ip);
-			validate_fields(dir);
-			validate_fields(ip);
-		}
-	}
-	return -error;
-}
-
-STATIC int
-linvfs_rmdir(
-	struct inode	*dir,
-	struct dentry	*dentry)
-{
-	struct inode	*inode = dentry->d_inode;
-	vnode_t		*dvp = LINVFS_GET_VP(dir);
-	int		error;
-
-	VOP_RMDIR(dvp, dentry, NULL, error);
-	if (!error) {
-		validate_fields(inode);
-		validate_fields(dir);
-	}
-	return -error;
-}
-
-STATIC int
-linvfs_rename(
-	struct inode	*odir,
-	struct dentry	*odentry,
-	struct inode	*ndir,
-	struct dentry	*ndentry)
-{
-	struct inode	*new_inode = ndentry->d_inode;
-	vnode_t		*fvp;	/* from directory */
-	vnode_t		*tvp;	/* target directory */
-	int		error;
-
-	fvp = LINVFS_GET_VP(odir);
-	tvp = LINVFS_GET_VP(ndir);
-
-	VOP_RENAME(fvp, odentry, tvp, ndentry, NULL, error);
-	if (error)
-		return -error;
-
-	if (new_inode)
-		validate_fields(new_inode);
-
-	validate_fields(odir);
-	if (ndir != odir)
-		validate_fields(ndir);
-	return 0;
-}
-
-/*
- * careful here - this function can get called recursively, so
- * we need to be very careful about how much stack we use.
- * uio is kmalloced for this reason...
- */
-STATIC void *
-linvfs_follow_link(
-	struct dentry		*dentry,
-	struct nameidata	*nd)
-{
-	vnode_t			*vp;
-	uio_t			*uio;
-	iovec_t			iov;
-	int			error;
-	char			*link;
-
-	ASSERT(dentry);
-	ASSERT(nd);
-
-	link = (char *)kmalloc(MAXPATHLEN+1, GFP_KERNEL);
-	if (!link) {
-		nd_set_link(nd, ERR_PTR(-ENOMEM));
-		return NULL;
-	}
-
-	uio = (uio_t *)kmalloc(sizeof(uio_t), GFP_KERNEL);
-	if (!uio) {
-		kfree(link);
-		nd_set_link(nd, ERR_PTR(-ENOMEM));
-		return NULL;
-	}
-
-	vp = LINVFS_GET_VP(dentry->d_inode);
-
-	iov.iov_base = link;
-	iov.iov_len = MAXPATHLEN;
-
-	uio->uio_iov = &iov;
-	uio->uio_offset = 0;
-	uio->uio_segflg = UIO_SYSSPACE;
-	uio->uio_resid = MAXPATHLEN;
-	uio->uio_iovcnt = 1;
-
-	VOP_READLINK(vp, uio, 0, NULL, error);
-	if (error) {
-		kfree(link);
-		link = ERR_PTR(-error);
-	} else {
-		link[MAXPATHLEN - uio->uio_resid] = '\0';
-	}
-	kfree(uio);
-
-	nd_set_link(nd, link);
-	return NULL;
-}
-
-STATIC void
-linvfs_put_link(
-	struct dentry	*dentry,
-	struct nameidata *nd,
-	void		*p)
-{
-	char		*s = nd_get_link(nd);
-
-	if (!IS_ERR(s))
-		kfree(s);
-}
-
-#ifdef CONFIG_XFS_POSIX_ACL
-STATIC int
-linvfs_permission(
-	struct inode	*inode,
-	int		mode,
-	struct nameidata *nd)
-{
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-	int		error;
-
-	mode <<= 6;		/* convert from linux to vnode access bits */
-	VOP_ACCESS(vp, mode, NULL, error);
-	return -error;
-}
-#else
-#define linvfs_permission NULL
-#endif
-
-STATIC int
-linvfs_getattr(
-	struct vfsmount	*mnt,
-	struct dentry	*dentry,
-	struct kstat	*stat)
-{
-	struct inode	*inode = dentry->d_inode;
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-	int		error = 0;
-
-	if (unlikely(vp->v_flag & VMODIFIED))
-		error = vn_revalidate(vp);
-	if (!error)
-		generic_fillattr(inode, stat);
-	return 0;
-}
-
-STATIC int
-linvfs_setattr(
-	struct dentry	*dentry,
-	struct iattr	*attr)
-{
-	struct inode	*inode = dentry->d_inode;
-	unsigned int	ia_valid = attr->ia_valid;
-	vnode_t		*vp = LINVFS_GET_VP(inode);
-	vattr_t		vattr;
-	int		flags = 0;
-	int		error;
-
-	memset(&vattr, 0, sizeof(vattr_t));
-	if (ia_valid & ATTR_UID) {
-		vattr.va_mask |= XFS_AT_UID;
-		vattr.va_uid = attr->ia_uid;
-	}
-	if (ia_valid & ATTR_GID) {
-		vattr.va_mask |= XFS_AT_GID;
-		vattr.va_gid = attr->ia_gid;
-	}
-	if (ia_valid & ATTR_SIZE) {
-		vattr.va_mask |= XFS_AT_SIZE;
-		vattr.va_size = attr->ia_size;
-	}
-	if (ia_valid & ATTR_ATIME) {
-		vattr.va_mask |= XFS_AT_ATIME;
-		vattr.va_atime = attr->ia_atime;
-	}
-	if (ia_valid & ATTR_MTIME) {
-		vattr.va_mask |= XFS_AT_MTIME;
-		vattr.va_mtime = attr->ia_mtime;
-	}
-	if (ia_valid & ATTR_CTIME) {
-		vattr.va_mask |= XFS_AT_CTIME;
-		vattr.va_ctime = attr->ia_ctime;
-	}
-	if (ia_valid & ATTR_MODE) {
-		vattr.va_mask |= XFS_AT_MODE;
-		vattr.va_mode = attr->ia_mode;
-		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
-			inode->i_mode &= ~S_ISGID;
-	}
-
-	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET))
-		flags |= ATTR_UTIME;
-#ifdef ATTR_NO_BLOCK
-	if ((ia_valid & ATTR_NO_BLOCK))
-		flags |= ATTR_NONBLOCK;
-#endif
-
-	VOP_SETATTR(vp, &vattr, flags, NULL, error);
-	if (error)
-		return -error;
-	vn_revalidate(vp);
-	return error;
-}
-
-STATIC void
-linvfs_truncate(
-	struct inode	*inode)
-{
-	block_truncate_page(inode->i_mapping, inode->i_size, linvfs_get_block);
-}
-
-STATIC int
-linvfs_setxattr(
-	struct dentry	*dentry,
-	const char	*name,
-	const void	*data,
-	size_t		size,
-	int		flags)
-{
-	vnode_t		*vp = LINVFS_GET_VP(dentry->d_inode);
-	char		*attr = (char *)name;
-	attrnames_t	*namesp;
-	int		xflags = 0;
-	int		error;
-
-	namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
-	if (!namesp)
-		return -EOPNOTSUPP;
-	attr += namesp->attr_namelen;
-	error = namesp->attr_capable(vp, NULL);
-	if (error)
-		return error;
-
-	/* Convert Linux syscall to XFS internal ATTR flags */
-	if (flags & XATTR_CREATE)
-		xflags |= ATTR_CREATE;
-	if (flags & XATTR_REPLACE)
-		xflags |= ATTR_REPLACE;
-	xflags |= namesp->attr_flag;
-	return namesp->attr_set(vp, attr, (void *)data, size, xflags);
-}
-
-STATIC ssize_t
-linvfs_getxattr(
-	struct dentry	*dentry,
-	const char	*name,
-	void		*data,
-	size_t		size)
-{
-	vnode_t		*vp = LINVFS_GET_VP(dentry->d_inode);
-	char		*attr = (char *)name;
-	attrnames_t	*namesp;
-	int		xflags = 0;
-	ssize_t		error;
-
-	namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
-	if (!namesp)
-		return -EOPNOTSUPP;
-	attr += namesp->attr_namelen;
-	error = namesp->attr_capable(vp, NULL);
-	if (error)
-		return error;
-
-	/* Convert Linux syscall to XFS internal ATTR flags */
-	if (!size) {
-		xflags |= ATTR_KERNOVAL;
-		data = NULL;
-	}
-	xflags |= namesp->attr_flag;
-	return namesp->attr_get(vp, attr, (void *)data, size, xflags);
-}
-
-STATIC ssize_t
-linvfs_listxattr(
-	struct dentry		*dentry,
-	char			*data,
-	size_t			size)
-{
-	vnode_t			*vp = LINVFS_GET_VP(dentry->d_inode);
-	int			error, xflags = ATTR_KERNAMELS;
-	ssize_t			result;
-
-	if (!size)
-		xflags |= ATTR_KERNOVAL;
-	xflags |= capable(CAP_SYS_ADMIN) ? ATTR_KERNFULLS : ATTR_KERNORMALS;
-
-	error = attr_generic_list(vp, data, size, xflags, &result);
-	if (error < 0)
-		return error;
-	return result;
-}
-
-STATIC int
-linvfs_removexattr(
-	struct dentry	*dentry,
-	const char	*name)
-{
-	vnode_t		*vp = LINVFS_GET_VP(dentry->d_inode);
-	char		*attr = (char *)name;
-	attrnames_t	*namesp;
-	int		xflags = 0;
-	int		error;
-
-	namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
-	if (!namesp)
-		return -EOPNOTSUPP;
-	attr += namesp->attr_namelen;
-	error = namesp->attr_capable(vp, NULL);
-	if (error)
-		return error;
-	xflags |= namesp->attr_flag;
-	return namesp->attr_remove(vp, attr, xflags);
-}
-
-
-struct inode_operations linvfs_file_inode_operations = {
-	.permission		= linvfs_permission,
-	.truncate		= linvfs_truncate,
-	.getattr		= linvfs_getattr,
-	.setattr		= linvfs_setattr,
-	.setxattr		= linvfs_setxattr,
-	.getxattr		= linvfs_getxattr,
-	.listxattr		= linvfs_listxattr,
-	.removexattr		= linvfs_removexattr,
-};
-
-struct inode_operations linvfs_dir_inode_operations = {
-	.create			= linvfs_create,
-	.lookup			= linvfs_lookup,
-	.link			= linvfs_link,
-	.unlink			= linvfs_unlink,
-	.symlink		= linvfs_symlink,
-	.mkdir			= linvfs_mkdir,
-	.rmdir			= linvfs_rmdir,
-	.mknod			= linvfs_mknod,
-	.rename			= linvfs_rename,
-	.permission		= linvfs_permission,
-	.getattr		= linvfs_getattr,
-	.setattr		= linvfs_setattr,
-	.setxattr		= linvfs_setxattr,
-	.getxattr		= linvfs_getxattr,
-	.listxattr		= linvfs_listxattr,
-	.removexattr		= linvfs_removexattr,
-};
-
-struct inode_operations linvfs_symlink_inode_operations = {
-	.readlink		= generic_readlink,
-	.follow_link		= linvfs_follow_link,
-	.put_link		= linvfs_put_link,
-	.permission		= linvfs_permission,
-	.getattr		= linvfs_getattr,
-	.setattr		= linvfs_setattr,
-	.setxattr		= linvfs_setxattr,
-	.getxattr		= linvfs_getxattr,
-	.listxattr		= linvfs_listxattr,
-	.removexattr		= linvfs_removexattr,
-};
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
deleted file mode 100644
index e0ab45fbfeb..00000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ /dev/null
@@ -1,1021 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_inode_item.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_iomap.h"
-
-#include <linux/capability.h>
-#include <linux/writeback.h>
-
-
-#if defined(XFS_RW_TRACE)
-void
-xfs_rw_enter_trace(
-	int			tag,
-	xfs_iocore_t		*io,
-	void			*data,
-	size_t			segs,
-	loff_t			offset,
-	int			ioflags)
-{
-	xfs_inode_t	*ip = XFS_IO_INODE(io);
-
-	if (ip->i_rwtrace == NULL)
-		return;
-	ktrace_enter(ip->i_rwtrace,
-		(void *)(unsigned long)tag,
-		(void *)ip,
-		(void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
-		(void *)data,
-		(void *)((unsigned long)segs),
-		(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(offset & 0xffffffff)),
-		(void *)((unsigned long)ioflags),
-		(void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(io->io_new_size & 0xffffffff)),
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL);
-}
-
-void
-xfs_inval_cached_trace(
-	xfs_iocore_t	*io,
-	xfs_off_t	offset,
-	xfs_off_t	len,
-	xfs_off_t	first,
-	xfs_off_t	last)
-{
-	xfs_inode_t	*ip = XFS_IO_INODE(io);
-
-	if (ip->i_rwtrace == NULL)
-		return;
-	ktrace_enter(ip->i_rwtrace,
-		(void *)(__psint_t)XFS_INVAL_CACHED,
-		(void *)ip,
-		(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(offset & 0xffffffff)),
-		(void *)((unsigned long)((len >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(len & 0xffffffff)),
-		(void *)((unsigned long)((first >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(first & 0xffffffff)),
-		(void *)((unsigned long)((last >> 32) & 0xffffffff)),
-		(void *)((unsigned long)(last & 0xffffffff)),
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL,
-		(void *)NULL);
-}
-#endif
-
-/*
- *	xfs_iozero
- *
- *	xfs_iozero clears the specified range of buffer supplied,
- *	and marks all the affected blocks as valid and modified.  If
- *	an affected block is not allocated, it will be allocated.  If
- *	an affected block is not completely overwritten, and is not
- *	valid before the operation, it will be read from disk before
- *	being partially zeroed.
- */
-STATIC int
-xfs_iozero(
-	struct inode		*ip,	/* inode			*/
-	loff_t			pos,	/* offset in file		*/
-	size_t			count,	/* size of data to zero		*/
-	loff_t			end_size)	/* max file size to set */
-{
-	unsigned		bytes;
-	struct page		*page;
-	struct address_space	*mapping;
-	char			*kaddr;
-	int			status;
-
-	mapping = ip->i_mapping;
-	do {
-		unsigned long index, offset;
-
-		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
-		index = pos >> PAGE_CACHE_SHIFT;
-		bytes = PAGE_CACHE_SIZE - offset;
-		if (bytes > count)
-			bytes = count;
-
-		status = -ENOMEM;
-		page = grab_cache_page(mapping, index);
-		if (!page)
-			break;
-
-		kaddr = kmap(page);
-		status = mapping->a_ops->prepare_write(NULL, page, offset,
-							offset + bytes);
-		if (status) {
-			goto unlock;
-		}
-
-		memset((void *) (kaddr + offset), 0, bytes);
-		flush_dcache_page(page);
-		status = mapping->a_ops->commit_write(NULL, page, offset,
-							offset + bytes);
-		if (!status) {
-			pos += bytes;
-			count -= bytes;
-			if (pos > i_size_read(ip))
-				i_size_write(ip, pos < end_size ? pos : end_size);
-		}
-
-unlock:
-		kunmap(page);
-		unlock_page(page);
-		page_cache_release(page);
-		if (status)
-			break;
-	} while (count);
-
-	return (-status);
-}
-
-ssize_t			/* bytes read, or (-)  error */
-xfs_read(
-	bhv_desc_t		*bdp,
-	struct kiocb		*iocb,
-	const struct iovec	*iovp,
-	unsigned int		segs,
-	loff_t			*offset,
-	int			ioflags,
-	cred_t			*credp)
-{
-	struct file		*file = iocb->ki_filp;
-	struct inode		*inode = file->f_mapping->host;
-	size_t			size = 0;
-	ssize_t			ret;
-	xfs_fsize_t		n;
-	xfs_inode_t		*ip;
-	xfs_mount_t		*mp;
-	vnode_t			*vp;
-	unsigned long		seg;
-
-	ip = XFS_BHVTOI(bdp);
-	vp = BHV_TO_VNODE(bdp);
-	mp = ip->i_mount;
-
-	XFS_STATS_INC(xs_read_calls);
-
-	/* START copy & waste from filemap.c */
-	for (seg = 0; seg < segs; seg++) {
-		const struct iovec *iv = &iovp[seg];
-
-		/*
-		 * If any segment has a negative length, or the cumulative
-		 * length ever wraps negative then return -EINVAL.
-		 */
-		size += iv->iov_len;
-		if (unlikely((ssize_t)(size|iv->iov_len) < 0))
-			return XFS_ERROR(-EINVAL);
-	}
-	/* END copy & waste from filemap.c */
-
-	if (unlikely(ioflags & IO_ISDIRECT)) {
-		xfs_buftarg_t	*target =
-			(ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp;
-		if ((*offset & target->bt_smask) ||
-		    (size & target->bt_smask)) {
-			if (*offset == ip->i_d.di_size) {
-				return (0);
-			}
-			return -XFS_ERROR(EINVAL);
-		}
-	}
-
-	n = XFS_MAXIOFFSET(mp) - *offset;
-	if ((n <= 0) || (size == 0))
-		return 0;
-
-	if (n < size)
-		size = n;
-
-	if (XFS_FORCED_SHUTDOWN(mp)) {
-		return -EIO;
-	}
-
-	if (unlikely(ioflags & IO_ISDIRECT))
-		mutex_lock(&inode->i_mutex);
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
-	if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
-	    !(ioflags & IO_INVIS)) {
-		vrwlock_t locktype = VRWLOCK_READ;
-		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
-
-		ret = -XFS_SEND_DATA(mp, DM_EVENT_READ,
-					BHV_TO_VNODE(bdp), *offset, size,
-					dmflags, &locktype);
-		if (ret) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			goto unlock_isem;
-		}
-	}
-
-	xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
-				(void *)iovp, segs, *offset, ioflags);
-	ret = __generic_file_aio_read(iocb, iovp, segs, offset);
-	if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
-		ret = wait_on_sync_kiocb(iocb);
-	if (ret > 0)
-		XFS_STATS_ADD(xs_read_bytes, ret);
-
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
-unlock_isem:
-	if (unlikely(ioflags & IO_ISDIRECT))
-		mutex_unlock(&inode->i_mutex);
-	return ret;
-}
-
-ssize_t
-xfs_sendfile(
-	bhv_desc_t		*bdp,
-	struct file		*filp,
-	loff_t			*offset,
-	int			ioflags,
-	size_t			count,
-	read_actor_t		actor,
-	void			*target,
-	cred_t			*credp)
-{
-	ssize_t			ret;
-	xfs_fsize_t		n;
-	xfs_inode_t		*ip;
-	xfs_mount_t		*mp;
-	vnode_t			*vp;
-
-	ip = XFS_BHVTOI(bdp);
-	vp = BHV_TO_VNODE(bdp);
-	mp = ip->i_mount;
-
-	XFS_STATS_INC(xs_read_calls);
-
-	n = XFS_MAXIOFFSET(mp) - *offset;
-	if ((n <= 0) || (count == 0))
-		return 0;
-
-	if (n < count)
-		count = n;
-
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return -EIO;
-
-	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-
-	if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
-	    (!(ioflags & IO_INVIS))) {
-		vrwlock_t locktype = VRWLOCK_READ;
-		int error;
-
-		error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, count,
-				      FILP_DELAY_FLAG(filp), &locktype);
-		if (error) {
-			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-			return -error;
-		}
-	}
-	xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore,
-		   (void *)(unsigned long)target, count, *offset, ioflags);
-	ret = generic_file_sendfile(filp, offset, count, actor, target);
-
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
-	if (ret > 0)
-		XFS_STATS_ADD(xs_read_bytes, ret);
-
-	return ret;
-}
-
-/*
- * This routine is called to handle zeroing any space in the last
- * block of the file that is beyond the EOF.  We do this since the
- * size is being increased without writing anything to that block
- * and we don't want anyone to read the garbage on the disk.
- */
-STATIC int				/* error (positive) */
-xfs_zero_last_block(
-	struct inode	*ip,
-	xfs_iocore_t	*io,
-	xfs_fsize_t	isize,
-	xfs_fsize_t	end_size)
-{
-	xfs_fileoff_t	last_fsb;
-	xfs_mount_t	*mp;
-	int		nimaps;
-	int		zero_offset;
-	int		zero_len;
-	int		error = 0;
-	xfs_bmbt_irec_t	imap;
-	loff_t		loff;
-
-	ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
-
-	mp = io->io_mount;
-
-	zero_offset = XFS_B_FSB_OFFSET(mp, isize);
-	if (zero_offset == 0) {
-		/*
-		 * There are no extra bytes in the last block on disk to
-		 * zero, so return.
-		 */
-		return 0;
-	}
-
-	last_fsb = XFS_B_TO_FSBT(mp, isize);
-	nimaps = 1;
-	error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
-			  &nimaps, NULL);
-	if (error) {
-		return error;
-	}
-	ASSERT(nimaps > 0);
-	/*
-	 * If the block underlying isize is just a hole, then there
-	 * is nothing to zero.
-	 */
-	if (imap.br_startblock == HOLESTARTBLOCK) {
-		return 0;
-	}
-	/*
-	 * Zero the part of the last block beyond the EOF, and write it
-	 * out sync.  We need to drop the ilock while we do this so we
-	 * don't deadlock when the buffer cache calls back to us.
-	 */
-	XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD);
-	loff = XFS_FSB_TO_B(mp, last_fsb);
-
-	zero_len = mp->m_sb.sb_blocksize - zero_offset;
-
-	error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size);
-
-	XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
-	ASSERT(error >= 0);
-	return error;
-}
-
-/*
- * Zero any on disk space between the current EOF and the new,
- * larger EOF.  This handles the normal case of zeroing the remainder
- * of the last block in the file and the unusual case of zeroing blocks
- * out beyond the size of the file.  This second case only happens
- * with fixed size extents and when the system crashes before the inode
- * size was updated but after blocks were allocated.  If fill is set,
- * then any holes in the range are filled and zeroed.  If not, the holes
- * are left alone as holes.
- */
-
-int					/* error (positive) */
-xfs_zero_eof(
-	vnode_t		*vp,
-	xfs_iocore_t	*io,
-	xfs_off_t	offset,		/* starting I/O offset */
-	xfs_fsize_t	isize,		/* current inode size */
-	xfs_fsize_t	end_size)	/* terminal inode size */
-{
-	struct inode	*ip = LINVFS_GET_IP(vp);
-	xfs_fileoff_t	start_zero_fsb;
-	xfs_fileoff_t	end_zero_fsb;
-	xfs_fileoff_t	zero_count_fsb;
-	xfs_fileoff_t	last_fsb;
-	xfs_extlen_t	buf_len_fsb;
-	xfs_mount_t	*mp;
-	int		nimaps;
-	int		error = 0;
-	xfs_bmbt_irec_t	imap;
-
-	ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
-	ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
-	ASSERT(offset > isize);
-
-	mp = io->io_mount;
-
-	/*
-	 * First handle zeroing the block on which isize resides.
-	 * We only zero a part of that block so it is handled specially.
-	 */
-	error = xfs_zero_last_block(ip, io, isize, end_size);
-	if (error) {
-		ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
-		ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
-		return error;
-	}
-
-	/*
-	 * Calculate the range between the new size and the old
-	 * where blocks needing to be zeroed may exist.  To get the
-	 * block where the last byte in the file currently resides,
-	 * we need to subtract one from the size and truncate back
-	 * to a block boundary.  We subtract 1 in case the size is
-	 * exactly on a block boundary.
-	 */
-	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
-	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
-	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
-	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
-	if (last_fsb == end_zero_fsb) {
-		/*
-		 * The size was only incremented on its last block.
-		 * We took care of that above, so just return.
-		 */
-		return 0;
-	}
-
-	ASSERT(start_zero_fsb <= end_zero_fsb);
-	while (start_zero_fsb <= end_zero_fsb) {
-		nimaps = 1;
-		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-		error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
-				  0, NULL, 0, &imap, &nimaps, NULL);
-		if (error) {
-			ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
-			ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
-			return error;
-		}
-		ASSERT(nimaps > 0);
-
-		if (imap.br_state == XFS_EXT_UNWRITTEN ||
-		    imap.br_startblock == HOLESTARTBLOCK) {
-			/*
-			 * This loop handles initializing pages that were
-			 * partially initialized by the code below this
-			 * loop. It basically zeroes the part of the page
-			 * that sits on a hole and sets the page as P_HOLE
-			 * and calls remapf if it is a mapped file.
-			 */
-			start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-			ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-			continue;
-		}
-
-		/*
-		 * There are blocks in the range requested.
-		 * Zero them a single write at a time.  We actually
-		 * don't zero the entire range returned if it is
-		 * too big and simply loop around to get the rest.
-		 * That is not the most efficient thing to do, but it
-		 * is simple and this path should not be exercised often.
-		 */
-		buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
-					      mp->m_writeio_blocks << 8);
-		/*
-		 * Drop the inode lock while we're doing the I/O.
-		 * We'll still have the iolock to protect us.
-		 */
-		XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
-
-		error = xfs_iozero(ip,
-				   XFS_FSB_TO_B(mp, start_zero_fsb),
-				   XFS_FSB_TO_B(mp, buf_len_fsb),
-				   end_size);
-
-		if (error) {
-			goto out_lock;
-		}
-
-		start_zero_fsb = imap.br_startoff + buf_len_fsb;
-		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-
-		XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
-	}
-
-	return 0;
-
-out_lock:
-
-	XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
-	ASSERT(error >= 0);
-	return error;
-}
-
-ssize_t				/* bytes written, or (-) error */
-xfs_write(
-	bhv_desc_t		*bdp,
-	struct kiocb		*iocb,
-	const struct iovec	*iovp,
-	unsigned int		nsegs,
-	loff_t			*offset,
-	int			ioflags,
-	cred_t			*credp)
-{
-	struct file		*file = iocb->ki_filp;
-	struct address_space	*mapping = file->f_mapping;
-	struct inode		*inode = mapping->host;
-	unsigned long		segs = nsegs;
-	xfs_inode_t		*xip;
-	xfs_mount_t		*mp;
-	ssize_t			ret = 0, error = 0;
-	xfs_fsize_t		isize, new_size;
-	xfs_iocore_t		*io;
-	vnode_t			*vp;
-	unsigned long		seg;
-	int			iolock;
-	int			eventsent = 0;
-	vrwlock_t		locktype;
-	size_t			ocount = 0, count;
-	loff_t			pos;
-	int			need_isem = 1, need_flush = 0;
-
-	XFS_STATS_INC(xs_write_calls);
-
-	vp = BHV_TO_VNODE(bdp);
-	xip = XFS_BHVTOI(bdp);
-
-	for (seg = 0; seg < segs; seg++) {
-		const struct iovec *iv = &iovp[seg];
-
-		/*
-		 * If any segment has a negative length, or the cumulative
-		 * length ever wraps negative then return -EINVAL.
-		 */
-		ocount += iv->iov_len;
-		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
-			return -EINVAL;
-		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
-			continue;
-		if (seg == 0)
-			return -EFAULT;
-		segs = seg;
-		ocount -= iv->iov_len;  /* This segment is no good */
-		break;
-	}
-
-	count = ocount;
-	pos = *offset;
-
-	if (count == 0)
-		return 0;
-
-	io = &xip->i_iocore;
-	mp = io->io_mount;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
-	fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE);
-
-	if (ioflags & IO_ISDIRECT) {
-		xfs_buftarg_t	*target =
-			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp;
-
-		if ((pos & target->bt_smask) || (count & target->bt_smask))
-			return XFS_ERROR(-EINVAL);
-
-		if (!VN_CACHED(vp) && pos < i_size_read(inode))
-			need_isem = 0;
-
-		if (VN_CACHED(vp))
-			need_flush = 1;
-	}
-
-relock:
-	if (need_isem) {
-		iolock = XFS_IOLOCK_EXCL;
-		locktype = VRWLOCK_WRITE;
-
-		mutex_lock(&inode->i_mutex);
-	} else {
-		iolock = XFS_IOLOCK_SHARED;
-		locktype = VRWLOCK_WRITE_DIRECT;
-	}
-
-	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
-
-	isize = i_size_read(inode);
-
-	if (file->f_flags & O_APPEND)
-		*offset = isize;
-
-start:
-	error = -generic_write_checks(file, &pos, &count,
-					S_ISBLK(inode->i_mode));
-	if (error) {
-		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
-		goto out_unlock_isem;
-	}
-
-	new_size = pos + count;
-	if (new_size > isize)
-		io->io_new_size = new_size;
-
-	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
-	    !(ioflags & IO_INVIS) && !eventsent)) {
-		loff_t		savedsize = pos;
-		int		dmflags = FILP_DELAY_FLAG(file);
-
-		if (need_isem)
-			dmflags |= DM_FLAGS_IMUX;
-
-		xfs_iunlock(xip, XFS_ILOCK_EXCL);
-		error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
-				      pos, count,
-				      dmflags, &locktype);
-		if (error) {
-			xfs_iunlock(xip, iolock);
-			goto out_unlock_isem;
-		}
-		xfs_ilock(xip, XFS_ILOCK_EXCL);
-		eventsent = 1;
-
-		/*
-		 * The iolock was dropped and reaquired in XFS_SEND_DATA
-		 * so we have to recheck the size when appending.
-		 * We will only "goto start;" once, since having sent the
-		 * event prevents another call to XFS_SEND_DATA, which is
-		 * what allows the size to change in the first place.
-		 */
-		if ((file->f_flags & O_APPEND) && savedsize != isize) {
-			pos = isize = xip->i_d.di_size;
-			goto start;
-		}
-	}
-
-	if (likely(!(ioflags & IO_INVIS))) {
-		file_update_time(file);
-		xfs_ichgtime_fast(xip, inode,
-				  XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	}
-
-	/*
-	 * If the offset is beyond the size of the file, we have a couple
-	 * of things to do. First, if there is already space allocated
-	 * we need to either create holes or zero the disk or ...
-	 *
-	 * If there is a page where the previous size lands, we need
-	 * to zero it out up to the new size.
-	 */
-
-	if (pos > isize) {
-		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos,
-					isize, pos + count);
-		if (error) {
-			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
-			goto out_unlock_isem;
-		}
-	}
-	xfs_iunlock(xip, XFS_ILOCK_EXCL);
-
-	/*
-	 * If we're writing the file then make sure to clear the
-	 * setuid and setgid bits if the process is not being run
-	 * by root.  This keeps people from modifying setuid and
-	 * setgid binaries.
-	 */
-
-	if (((xip->i_d.di_mode & S_ISUID) ||
-	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
-		(S_ISGID | S_IXGRP))) &&
-	     !capable(CAP_FSETID)) {
-		error = xfs_write_clear_setuid(xip);
-		if (likely(!error))
-			error = -remove_suid(file->f_dentry);
-		if (unlikely(error)) {
-			xfs_iunlock(xip, iolock);
-			goto out_unlock_isem;
-		}
-	}
-
-retry:
-	/* We can write back this queue in page reclaim */
-	current->backing_dev_info = mapping->backing_dev_info;
-
-	if ((ioflags & IO_ISDIRECT)) {
-		if (need_flush) {
-			xfs_inval_cached_trace(io, pos, -1,
-					ctooff(offtoct(pos)), -1);
-			VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)),
-					-1, FI_REMAPF_LOCKED);
-		}
-
-		if (need_isem) {
-			/* demote the lock now the cached pages are gone */
-			XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
-			mutex_unlock(&inode->i_mutex);
-
-			iolock = XFS_IOLOCK_SHARED;
-			locktype = VRWLOCK_WRITE_DIRECT;
-			need_isem = 0;
-		}
-
- 		xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs,
-				*offset, ioflags);
-		ret = generic_file_direct_write(iocb, iovp,
-				&segs, pos, offset, count, ocount);
-
-		/*
-		 * direct-io write to a hole: fall through to buffered I/O
-		 * for completing the rest of the request.
-		 */
-		if (ret >= 0 && ret != count) {
-			XFS_STATS_ADD(xs_write_bytes, ret);
-
-			pos += ret;
-			count -= ret;
-
-			need_isem = 1;
-			ioflags &= ~IO_ISDIRECT;
-			xfs_iunlock(xip, iolock);
-			goto relock;
-		}
-	} else {
-		xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs,
-				*offset, ioflags);
-		ret = generic_file_buffered_write(iocb, iovp, segs,
-				pos, offset, count, ret);
-	}
-
-	current->backing_dev_info = NULL;
-
-	if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
-		ret = wait_on_sync_kiocb(iocb);
-
-	if ((ret == -ENOSPC) &&
-	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
-	    !(ioflags & IO_INVIS)) {
-
-		xfs_rwunlock(bdp, locktype);
-		if (need_isem)
-			mutex_unlock(&inode->i_mutex);
-		error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
-				DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
-				0, 0, 0); /* Delay flag intentionally  unused */
-		if (error)
-			goto out_nounlocks;
-		if (need_isem)
-			mutex_lock(&inode->i_mutex);
-		xfs_rwlock(bdp, locktype);
-		pos = xip->i_d.di_size;
-		ret = 0;
-		goto retry;
-	}
-
-	isize = i_size_read(inode);
-	if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
-		*offset = isize;
-
-	if (*offset > xip->i_d.di_size) {
-		xfs_ilock(xip, XFS_ILOCK_EXCL);
-		if (*offset > xip->i_d.di_size) {
-			xip->i_d.di_size = *offset;
-			i_size_write(inode, *offset);
-			xip->i_update_core = 1;
-			xip->i_update_size = 1;
-		}
-		xfs_iunlock(xip, XFS_ILOCK_EXCL);
-	}
-
-	error = -ret;
-	if (ret <= 0)
-		goto out_unlock_internal;
-
-	XFS_STATS_ADD(xs_write_bytes, ret);
-
-	/* Handle various SYNC-type writes */
-	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
-		/*
-		 * If we're treating this as O_DSYNC and we have not updated the
-		 * size, force the log.
-		 */
-		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
-		    !(xip->i_update_size)) {
-			xfs_inode_log_item_t	*iip = xip->i_itemp;
-
-			/*
-			 * If an allocation transaction occurred
-			 * without extending the size, then we have to force
-			 * the log up the proper point to ensure that the
-			 * allocation is permanent.  We can't count on
-			 * the fact that buffered writes lock out direct I/O
-			 * writes - the direct I/O write could have extended
-			 * the size nontransactionally, then finished before
-			 * we started.  xfs_write_file will think that the file
-			 * didn't grow but the update isn't safe unless the
-			 * size change is logged.
-			 *
-			 * Force the log if we've committed a transaction
-			 * against the inode or if someone else has and
-			 * the commit record hasn't gone to disk (e.g.
-			 * the inode is pinned).  This guarantees that
-			 * all changes affecting the inode are permanent
-			 * when we return.
-			 */
-			if (iip && iip->ili_last_lsn) {
-				xfs_log_force(mp, iip->ili_last_lsn,
-						XFS_LOG_FORCE | XFS_LOG_SYNC);
-			} else if (xfs_ipincount(xip) > 0) {
-				xfs_log_force(mp, (xfs_lsn_t)0,
-						XFS_LOG_FORCE | XFS_LOG_SYNC);
-			}
-
-		} else {
-			xfs_trans_t	*tp;
-
-			/*
-			 * O_SYNC or O_DSYNC _with_ a size update are handled
-			 * the same way.
-			 *
-			 * If the write was synchronous then we need to make
-			 * sure that the inode modification time is permanent.
-			 * We'll have updated the timestamp above, so here
-			 * we use a synchronous transaction to log the inode.
-			 * It's not fast, but it's necessary.
-			 *
-			 * If this a dsync write and the size got changed
-			 * non-transactionally, then we need to ensure that
-			 * the size change gets logged in a synchronous
-			 * transaction.
-			 */
-
-			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
-			if ((error = xfs_trans_reserve(tp, 0,
-						      XFS_SWRITE_LOG_RES(mp),
-						      0, 0, 0))) {
-				/* Transaction reserve failed */
-				xfs_trans_cancel(tp, 0);
-			} else {
-				/* Transaction reserve successful */
-				xfs_ilock(xip, XFS_ILOCK_EXCL);
-				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(tp, xip);
-				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
-				xfs_trans_set_sync(tp);
-				error = xfs_trans_commit(tp, 0, NULL);
-				xfs_iunlock(xip, XFS_ILOCK_EXCL);
-			}
-			if (error)
-				goto out_unlock_internal;
-		}
-	
-		xfs_rwunlock(bdp, locktype);
-		if (need_isem)
-			mutex_unlock(&inode->i_mutex);
-
-		error = sync_page_range(inode, mapping, pos, ret);
-		if (!error)
-			error = ret;
-		return error;
-	}
-
- out_unlock_internal:
-	xfs_rwunlock(bdp, locktype);
- out_unlock_isem:
-	if (need_isem)
-		mutex_unlock(&inode->i_mutex);
- out_nounlocks:
-	return -error;
-}
-
-/*
- * All xfs metadata buffers except log state machine buffers
- * get this attached as their b_bdstrat callback function.
- * This is so that we can catch a buffer
- * after prematurely unpinning it to forcibly shutdown the filesystem.
- */
-int
-xfs_bdstrat_cb(struct xfs_buf *bp)
-{
-	xfs_mount_t	*mp;
-
-	mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
-	if (!XFS_FORCED_SHUTDOWN(mp)) {
-		xfs_buf_iorequest(bp);
-		return 0;
-	} else {
-		xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
-		/*
-		 * Metadata write that didn't get logged but
-		 * written delayed anyway. These aren't associated
-		 * with a transaction, and can be ignored.
-		 */
-		if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
-		    (XFS_BUF_ISREAD(bp)) == 0)
-			return (xfs_bioerror_relse(bp));
-		else
-			return (xfs_bioerror(bp));
-	}
-}
-
-
-int
-xfs_bmap(bhv_desc_t	*bdp,
-	xfs_off_t	offset,
-	ssize_t		count,
-	int		flags,
-	xfs_iomap_t	*iomapp,
-	int		*niomaps)
-{
-	xfs_inode_t	*ip = XFS_BHVTOI(bdp);
-	xfs_iocore_t	*io = &ip->i_iocore;
-
-	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
-	ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
-	       ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
-
-	return xfs_iomap(io, offset, count, flags, iomapp, niomaps);
-}
-
-/*
- * Wrapper around bdstrat so that we can stop data
- * from going to disk in case we are shutting down the filesystem.
- * Typically user data goes thru this path; one of the exceptions
- * is the superblock.
- */
-int
-xfsbdstrat(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
-{
-	ASSERT(mp);
-	if (!XFS_FORCED_SHUTDOWN(mp)) {
-		/* Grio redirection would go here
-		 * if (XFS_BUF_IS_GRIO(bp)) {
-		 */
-
-		xfs_buf_iorequest(bp);
-		return 0;
-	}
-
-	xfs_buftrace("XFSBDSTRAT IOERROR", bp);
-	return (xfs_bioerror_relse(bp));
-}
-
-/*
- * If the underlying (data/log/rt) device is readonly, there are some
- * operations that cannot proceed.
- */
-int
-xfs_dev_is_read_only(
-	xfs_mount_t		*mp,
-	char			*message)
-{
-	if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
-	    xfs_readonly_buftarg(mp->m_logdev_targp) ||
-	    (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
-		cmn_err(CE_NOTE,
-			"XFS: %s required on read-only device.", message);
-		cmn_err(CE_NOTE,
-			"XFS: write access unavailable, cannot proceed.");
-		return EROFS;
-	}
-	return 0;
-}
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
deleted file mode 100644
index 38864a88d42..00000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_LRW_H__
-#define __XFS_LRW_H__
-
-struct vnode;
-struct bhv_desc;
-struct xfs_mount;
-struct xfs_iocore;
-struct xfs_inode;
-struct xfs_bmbt_irec;
-struct xfs_buf;
-struct xfs_iomap;
-
-#if defined(XFS_RW_TRACE)
-/*
- * Defines for the trace mechanisms in xfs_lrw.c.
- */
-#define	XFS_RW_KTRACE_SIZE	128
-
-#define	XFS_READ_ENTER		1
-#define	XFS_WRITE_ENTER		2
-#define XFS_IOMAP_READ_ENTER	3
-#define	XFS_IOMAP_WRITE_ENTER	4
-#define	XFS_IOMAP_READ_MAP	5
-#define	XFS_IOMAP_WRITE_MAP	6
-#define	XFS_IOMAP_WRITE_NOSPACE	7
-#define	XFS_ITRUNC_START	8
-#define	XFS_ITRUNC_FINISH1	9
-#define	XFS_ITRUNC_FINISH2	10
-#define	XFS_CTRUNC1		11
-#define	XFS_CTRUNC2		12
-#define	XFS_CTRUNC3		13
-#define	XFS_CTRUNC4		14
-#define	XFS_CTRUNC5		15
-#define	XFS_CTRUNC6		16
-#define	XFS_BUNMAPI		17
-#define	XFS_INVAL_CACHED	18
-#define	XFS_DIORD_ENTER		19
-#define	XFS_DIOWR_ENTER		20
-#define	XFS_SENDFILE_ENTER	21
-#define	XFS_WRITEPAGE_ENTER	22
-#define	XFS_RELEASEPAGE_ENTER	23
-#define	XFS_INVALIDPAGE_ENTER	24
-#define	XFS_IOMAP_ALLOC_ENTER	25
-#define	XFS_IOMAP_ALLOC_MAP	26
-#define	XFS_IOMAP_UNWRITTEN	27
-extern void xfs_rw_enter_trace(int, struct xfs_iocore *,
-				void *, size_t, loff_t, int);
-extern void xfs_inval_cached_trace(struct xfs_iocore *,
-				xfs_off_t, xfs_off_t, xfs_off_t, xfs_off_t);
-#else
-#define xfs_rw_enter_trace(tag, io, data, size, offset, ioflags)
-#define xfs_inval_cached_trace(io, offset, len, first, last)
-#endif
-
-/*
- * Maximum count of bmaps used by read and write paths.
- */
-#define	XFS_MAX_RW_NBMAPS	4
-
-extern int xfs_bmap(struct bhv_desc *, xfs_off_t, ssize_t, int,
-			struct xfs_iomap *, int *);
-extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
-extern int xfs_bdstrat_cb(struct xfs_buf *);
-
-extern int xfs_zero_eof(struct vnode *, struct xfs_iocore *, xfs_off_t,
-				xfs_fsize_t, xfs_fsize_t);
-extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *,
-				const struct iovec *, unsigned int,
-				loff_t *, int, struct cred *);
-extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *,
-				const struct iovec *, unsigned int,
-				loff_t *, int, struct cred *);
-extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *,
-				loff_t *, int, size_t, read_actor_t,
-				void *, struct cred *);
-
-extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
-
-#endif	/* __XFS_LRW_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
deleted file mode 100644
index 8955720a2c6..00000000000
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include <linux/proc_fs.h>
-
-DEFINE_PER_CPU(struct xfsstats, xfsstats);
-
-STATIC int
-xfs_read_xfsstats(
-	char		*buffer,
-	char		**start,
-	off_t		offset,
-	int		count,
-	int		*eof,
-	void		*data)
-{
-	int		c, i, j, len, val;
-	__uint64_t	xs_xstrat_bytes = 0;
-	__uint64_t	xs_write_bytes = 0;
-	__uint64_t	xs_read_bytes = 0;
-
-	static const struct xstats_entry {
-		char	*desc;
-		int	endpoint;
-	} xstats[] = {
-		{ "extent_alloc",	XFSSTAT_END_EXTENT_ALLOC	},
-		{ "abt",		XFSSTAT_END_ALLOC_BTREE		},
-		{ "blk_map",		XFSSTAT_END_BLOCK_MAPPING	},
-		{ "bmbt",		XFSSTAT_END_BLOCK_MAP_BTREE	},
-		{ "dir",		XFSSTAT_END_DIRECTORY_OPS	},
-		{ "trans",		XFSSTAT_END_TRANSACTIONS	},
-		{ "ig",			XFSSTAT_END_INODE_OPS		},
-		{ "log",		XFSSTAT_END_LOG_OPS		},
-		{ "push_ail",		XFSSTAT_END_TAIL_PUSHING	},
-		{ "xstrat",		XFSSTAT_END_WRITE_CONVERT	},
-		{ "rw",			XFSSTAT_END_READ_WRITE_OPS	},
-		{ "attr",		XFSSTAT_END_ATTRIBUTE_OPS	},
-		{ "icluster",		XFSSTAT_END_INODE_CLUSTER	},
-		{ "vnodes",		XFSSTAT_END_VNODE_OPS		},
-		{ "buf",		XFSSTAT_END_BUF			},
-	};
-
-	/* Loop over all stats groups */
-	for (i=j=len = 0; i < sizeof(xstats)/sizeof(struct xstats_entry); i++) {
-		len += sprintf(buffer + len, xstats[i].desc);
-		/* inner loop does each group */
-		while (j < xstats[i].endpoint) {
-			val = 0;
-			/* sum over all cpus */
-			for (c = 0; c < NR_CPUS; c++) {
-				if (!cpu_possible(c)) continue;
-				val += *(((__u32*)&per_cpu(xfsstats, c) + j));
-			}
-			len += sprintf(buffer + len, " %u", val);
-			j++;
-		}
-		buffer[len++] = '\n';
-	}
-	/* extra precision counters */
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i)) continue;
-		xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
-		xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
-		xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
-	}
-
-	len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n",
-			xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
-	len += sprintf(buffer + len, "debug %u\n",
-#if defined(DEBUG)
-		1);
-#else
-		0);
-#endif
-
-	if (offset >= len) {
-		*start = buffer;
-		*eof = 1;
-		return 0;
-	}
-	*start = buffer + offset;
-	if ((len -= offset) > count)
-		return count;
-	*eof = 1;
-
-	return len;
-}
-
-void
-xfs_init_procfs(void)
-{
-	if (!proc_mkdir("fs/xfs", NULL))
-		return;
-	create_proc_read_entry("fs/xfs/stat", 0, NULL, xfs_read_xfsstats, NULL);
-}
-
-void
-xfs_cleanup_procfs(void)
-{
-	remove_proc_entry("fs/xfs/stat", NULL);
-	remove_proc_entry("fs/xfs", NULL);
-}
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
deleted file mode 100644
index f22e426d9e4..00000000000
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ /dev/null
@@ -1,1014 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_clnt.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_version.h"
-
-#include <linux/namei.h>
-#include <linux/init.h>
-#include <linux/mount.h>
-#include <linux/mempool.h>
-#include <linux/writeback.h>
-#include <linux/kthread.h>
-
-STATIC struct quotactl_ops linvfs_qops;
-STATIC struct super_operations linvfs_sops;
-STATIC kmem_zone_t *xfs_vnode_zone;
-STATIC kmem_zone_t *xfs_ioend_zone;
-mempool_t *xfs_ioend_pool;
-
-STATIC struct xfs_mount_args *
-xfs_args_allocate(
-	struct super_block	*sb)
-{
-	struct xfs_mount_args	*args;
-
-	args = kmem_zalloc(sizeof(struct xfs_mount_args), KM_SLEEP);
-	args->logbufs = args->logbufsize = -1;
-	strncpy(args->fsname, sb->s_id, MAXNAMELEN);
-
-	/* Copy the already-parsed mount(2) flags we're interested in */
-	if (sb->s_flags & MS_NOATIME)
-		args->flags |= XFSMNT_NOATIME;
-	if (sb->s_flags & MS_DIRSYNC)
-		args->flags |= XFSMNT_DIRSYNC;
-	if (sb->s_flags & MS_SYNCHRONOUS)
-		args->flags |= XFSMNT_WSYNC;
-
-	/* Default to 32 bit inodes on Linux all the time */
-	args->flags |= XFSMNT_32BITINODES;
-
-	return args;
-}
-
-__uint64_t
-xfs_max_file_offset(
-	unsigned int		blockshift)
-{
-	unsigned int		pagefactor = 1;
-	unsigned int		bitshift = BITS_PER_LONG - 1;
-
-	/* Figure out maximum filesize, on Linux this can depend on
-	 * the filesystem blocksize (on 32 bit platforms).
-	 * __block_prepare_write does this in an [unsigned] long...
-	 *      page->index << (PAGE_CACHE_SHIFT - bbits)
-	 * So, for page sized blocks (4K on 32 bit platforms),
-	 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
-	 *      (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
-	 * but for smaller blocksizes it is less (bbits = log2 bsize).
-	 * Note1: get_block_t takes a long (implicit cast from above)
-	 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
-	 * can optionally convert the [unsigned] long from above into
-	 * an [unsigned] long long.
-	 */
-
-#if BITS_PER_LONG == 32
-# if defined(CONFIG_LBD)
-	ASSERT(sizeof(sector_t) == 8);
-	pagefactor = PAGE_CACHE_SIZE;
-	bitshift = BITS_PER_LONG;
-# else
-	pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
-# endif
-#endif
-
-	return (((__uint64_t)pagefactor) << bitshift) - 1;
-}
-
-STATIC __inline__ void
-xfs_set_inodeops(
-	struct inode		*inode)
-{
-	switch (inode->i_mode & S_IFMT) {
-	case S_IFREG:
-		inode->i_op = &linvfs_file_inode_operations;
-		inode->i_fop = &linvfs_file_operations;
-		inode->i_mapping->a_ops = &linvfs_aops;
-		break;
-	case S_IFDIR:
-		inode->i_op = &linvfs_dir_inode_operations;
-		inode->i_fop = &linvfs_dir_operations;
-		break;
-	case S_IFLNK:
-		inode->i_op = &linvfs_symlink_inode_operations;
-		if (inode->i_blocks)
-			inode->i_mapping->a_ops = &linvfs_aops;
-		break;
-	default:
-		inode->i_op = &linvfs_file_inode_operations;
-		init_special_inode(inode, inode->i_mode, inode->i_rdev);
-		break;
-	}
-}
-
-STATIC __inline__ void
-xfs_revalidate_inode(
-	xfs_mount_t		*mp,
-	vnode_t			*vp,
-	xfs_inode_t		*ip)
-{
-	struct inode		*inode = LINVFS_GET_IP(vp);
-
-	inode->i_mode	= ip->i_d.di_mode;
-	inode->i_nlink	= ip->i_d.di_nlink;
-	inode->i_uid	= ip->i_d.di_uid;
-	inode->i_gid	= ip->i_d.di_gid;
-
-	switch (inode->i_mode & S_IFMT) {
-	case S_IFBLK:
-	case S_IFCHR:
-		inode->i_rdev =
-			MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
-			      sysv_minor(ip->i_df.if_u2.if_rdev));
-		break;
-	default:
-		inode->i_rdev = 0;
-		break;
-	}
-
-	inode->i_blksize = xfs_preferred_iosize(mp);
-	inode->i_generation = ip->i_d.di_gen;
-	i_size_write(inode, ip->i_d.di_size);
-	inode->i_blocks =
-		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
-	inode->i_atime.tv_sec	= ip->i_d.di_atime.t_sec;
-	inode->i_atime.tv_nsec	= ip->i_d.di_atime.t_nsec;
-	inode->i_mtime.tv_sec	= ip->i_d.di_mtime.t_sec;
-	inode->i_mtime.tv_nsec	= ip->i_d.di_mtime.t_nsec;
-	inode->i_ctime.tv_sec	= ip->i_d.di_ctime.t_sec;
-	inode->i_ctime.tv_nsec	= ip->i_d.di_ctime.t_nsec;
-	if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
-		inode->i_flags |= S_IMMUTABLE;
-	else
-		inode->i_flags &= ~S_IMMUTABLE;
-	if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
-		inode->i_flags |= S_APPEND;
-	else
-		inode->i_flags &= ~S_APPEND;
-	if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
-		inode->i_flags |= S_SYNC;
-	else
-		inode->i_flags &= ~S_SYNC;
-	if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
-		inode->i_flags |= S_NOATIME;
-	else
-		inode->i_flags &= ~S_NOATIME;
-	vp->v_flag &= ~VMODIFIED;
-}
-
-void
-xfs_initialize_vnode(
-	bhv_desc_t		*bdp,
-	vnode_t			*vp,
-	bhv_desc_t		*inode_bhv,
-	int			unlock)
-{
-	xfs_inode_t		*ip = XFS_BHVTOI(inode_bhv);
-	struct inode		*inode = LINVFS_GET_IP(vp);
-
-	if (!inode_bhv->bd_vobj) {
-		vp->v_vfsp = bhvtovfs(bdp);
-		bhv_desc_init(inode_bhv, ip, vp, &xfs_vnodeops);
-		bhv_insert(VN_BHV_HEAD(vp), inode_bhv);
-	}
-
-	/*
-	 * We need to set the ops vectors, and unlock the inode, but if
-	 * we have been called during the new inode create process, it is
-	 * too early to fill in the Linux inode.  We will get called a
-	 * second time once the inode is properly set up, and then we can
-	 * finish our work.
-	 */
-	if (ip->i_d.di_mode != 0 && unlock && (inode->i_state & I_NEW)) {
-		xfs_revalidate_inode(XFS_BHVTOM(bdp), vp, ip);
-		xfs_set_inodeops(inode);
-	
-		ip->i_flags &= ~XFS_INEW;
-		barrier();
-
-		unlock_new_inode(inode);
-	}
-}
-
-int
-xfs_blkdev_get(
-	xfs_mount_t		*mp,
-	const char		*name,
-	struct block_device	**bdevp)
-{
-	int			error = 0;
-
-	*bdevp = open_bdev_excl(name, 0, mp);
-	if (IS_ERR(*bdevp)) {
-		error = PTR_ERR(*bdevp);
-		printk("XFS: Invalid device [%s], error=%d\n", name, error);
-	}
-
-	return -error;
-}
-
-void
-xfs_blkdev_put(
-	struct block_device	*bdev)
-{
-	if (bdev)
-		close_bdev_excl(bdev);
-}
-
-/*
- * Try to write out the superblock using barriers.
- */
-STATIC int
-xfs_barrier_test(
-	xfs_mount_t	*mp)
-{
-	xfs_buf_t	*sbp = xfs_getsb(mp, 0);
-	int		error;
-
-	XFS_BUF_UNDONE(sbp);
-	XFS_BUF_UNREAD(sbp);
-	XFS_BUF_UNDELAYWRITE(sbp);
-	XFS_BUF_WRITE(sbp);
-	XFS_BUF_UNASYNC(sbp);
-	XFS_BUF_ORDERED(sbp);
-
-	xfsbdstrat(mp, sbp);
-	error = xfs_iowait(sbp);
-
-	/*
-	 * Clear all the flags we set and possible error state in the
-	 * buffer.  We only did the write to try out whether barriers
-	 * worked and shouldn't leave any traces in the superblock
-	 * buffer.
-	 */
-	XFS_BUF_DONE(sbp);
-	XFS_BUF_ERROR(sbp, 0);
-	XFS_BUF_UNORDERED(sbp);
-
-	xfs_buf_relse(sbp);
-	return error;
-}
-
-void
-xfs_mountfs_check_barriers(xfs_mount_t *mp)
-{
-	int error;
-
-	if (mp->m_logdev_targp != mp->m_ddev_targp) {
-		xfs_fs_cmn_err(CE_NOTE, mp,
-		  "Disabling barriers, not supported with external log device");
-		mp->m_flags &= ~XFS_MOUNT_BARRIER;
-		return;
-	}
-
-	if (mp->m_ddev_targp->bt_bdev->bd_disk->queue->ordered ==
-					QUEUE_ORDERED_NONE) {
-		xfs_fs_cmn_err(CE_NOTE, mp,
-		  "Disabling barriers, not supported by the underlying device");
-		mp->m_flags &= ~XFS_MOUNT_BARRIER;
-		return;
-	}
-
-	error = xfs_barrier_test(mp);
-	if (error) {
-		xfs_fs_cmn_err(CE_NOTE, mp,
-		  "Disabling barriers, trial barrier write failed");
-		mp->m_flags &= ~XFS_MOUNT_BARRIER;
-		return;
-	}
-}
-
-void
-xfs_blkdev_issue_flush(
-	xfs_buftarg_t		*buftarg)
-{
-	blkdev_issue_flush(buftarg->bt_bdev, NULL);
-}
-
-STATIC struct inode *
-linvfs_alloc_inode(
-	struct super_block	*sb)
-{
-	vnode_t			*vp;
-
-	vp = kmem_cache_alloc(xfs_vnode_zone, kmem_flags_convert(KM_SLEEP));
-	if (!vp)
-		return NULL;
-	return LINVFS_GET_IP(vp);
-}
-
-STATIC void
-linvfs_destroy_inode(
-	struct inode		*inode)
-{
-	kmem_zone_free(xfs_vnode_zone, LINVFS_GET_VP(inode));
-}
-
-STATIC void
-linvfs_inode_init_once(
-	void			*data,
-	kmem_cache_t		*cachep,
-	unsigned long		flags)
-{
-	vnode_t			*vp = (vnode_t *)data;
-
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(LINVFS_GET_IP(vp));
-}
-
-STATIC int
-linvfs_init_zones(void)
-{
-	xfs_vnode_zone = kmem_cache_create("xfs_vnode",
-				sizeof(vnode_t), 0, SLAB_RECLAIM_ACCOUNT,
-				linvfs_inode_init_once, NULL);
-	if (!xfs_vnode_zone)
-		goto out;
-
-	xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
-	if (!xfs_ioend_zone)
-		goto out_destroy_vnode_zone;
-
-	xfs_ioend_pool = mempool_create(4 * MAX_BUF_PER_PAGE,
-			mempool_alloc_slab, mempool_free_slab,
-			xfs_ioend_zone);
-	if (!xfs_ioend_pool)
-		goto out_free_ioend_zone;
-
-	return 0;
-
-
- out_free_ioend_zone:
-	kmem_zone_destroy(xfs_ioend_zone);
- out_destroy_vnode_zone:
-	kmem_zone_destroy(xfs_vnode_zone);
- out:
-	return -ENOMEM;
-}
-
-STATIC void
-linvfs_destroy_zones(void)
-{
-	mempool_destroy(xfs_ioend_pool);
-	kmem_zone_destroy(xfs_vnode_zone);
-	kmem_zone_destroy(xfs_ioend_zone);
-}
-
-/*
- * Attempt to flush the inode, this will actually fail
- * if the inode is pinned, but we dirty the inode again
- * at the point when it is unpinned after a log write,
- * since this is when the inode itself becomes flushable. 
- */
-STATIC int
-linvfs_write_inode(
-	struct inode		*inode,
-	int			sync)
-{
-	vnode_t			*vp = LINVFS_GET_VP(inode);
-	int			error = 0, flags = FLUSH_INODE;
-
-	if (vp) {
-		vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
-		if (sync)
-			flags |= FLUSH_SYNC;
-		VOP_IFLUSH(vp, flags, error);
-		if (error == EAGAIN) {
-			if (sync)
-				VOP_IFLUSH(vp, flags | FLUSH_LOG, error);
-			else
-				error = 0;
-		}
-	}
-
-	return -error;
-}
-
-STATIC void
-linvfs_clear_inode(
-	struct inode		*inode)
-{
-	vnode_t			*vp = LINVFS_GET_VP(inode);
-	int			error, cache;
-
-	vn_trace_entry(vp, "clear_inode", (inst_t *)__return_address);
-
-	XFS_STATS_INC(vn_rele);
-	XFS_STATS_INC(vn_remove);
-	XFS_STATS_INC(vn_reclaim);
-	XFS_STATS_DEC(vn_active);
-
-	/*
-	 * This can happen because xfs_iget_core calls xfs_idestroy if we
-	 * find an inode with di_mode == 0 but without IGET_CREATE set.
-	 */
-	if (vp->v_fbhv)
-		VOP_INACTIVE(vp, NULL, cache);
-
-	VN_LOCK(vp);
-	vp->v_flag &= ~VMODIFIED;
-	VN_UNLOCK(vp, 0);
-
-	if (vp->v_fbhv) {
-		VOP_RECLAIM(vp, error);
-		if (error)
-			panic("vn_purge: cannot reclaim");
-	}
-
-	ASSERT(vp->v_fbhv == NULL);
-
-#ifdef XFS_VNODE_TRACE
-	ktrace_free(vp->v_trace);
-#endif
-}
-
-/*
- * Enqueue a work item to be picked up by the vfs xfssyncd thread.
- * Doing this has two advantages:
- * - It saves on stack space, which is tight in certain situations
- * - It can be used (with care) as a mechanism to avoid deadlocks.
- * Flushing while allocating in a full filesystem requires both.
- */
-STATIC void
-xfs_syncd_queue_work(
-	struct vfs	*vfs,
-	void		*data,
-	void		(*syncer)(vfs_t *, void *))
-{
-	vfs_sync_work_t	*work;
-
-	work = kmem_alloc(sizeof(struct vfs_sync_work), KM_SLEEP);
-	INIT_LIST_HEAD(&work->w_list);
-	work->w_syncer = syncer;
-	work->w_data = data;
-	work->w_vfs = vfs;
-	spin_lock(&vfs->vfs_sync_lock);
-	list_add_tail(&work->w_list, &vfs->vfs_sync_list);
-	spin_unlock(&vfs->vfs_sync_lock);
-	wake_up_process(vfs->vfs_sync_task);
-}
-
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations.  At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room...
- */
-STATIC void
-xfs_flush_inode_work(
-	vfs_t		*vfs,
-	void		*inode)
-{
-	filemap_flush(((struct inode *)inode)->i_mapping);
-	iput((struct inode *)inode);
-}
-
-void
-xfs_flush_inode(
-	xfs_inode_t	*ip)
-{
-	struct inode	*inode = LINVFS_GET_IP(XFS_ITOV(ip));
-	struct vfs	*vfs = XFS_MTOVFS(ip->i_mount);
-
-	igrab(inode);
-	xfs_syncd_queue_work(vfs, inode, xfs_flush_inode_work);
-	delay(msecs_to_jiffies(500));
-}
-
-/*
- * This is the "bigger hammer" version of xfs_flush_inode_work...
- * (IOW, "If at first you don't succeed, use a Bigger Hammer").
- */
-STATIC void
-xfs_flush_device_work(
-	vfs_t		*vfs,
-	void		*inode)
-{
-	sync_blockdev(vfs->vfs_super->s_bdev);
-	iput((struct inode *)inode);
-}
-
-void
-xfs_flush_device(
-	xfs_inode_t	*ip)
-{
-	struct inode	*inode = LINVFS_GET_IP(XFS_ITOV(ip));
-	struct vfs	*vfs = XFS_MTOVFS(ip->i_mount);
-
-	igrab(inode);
-	xfs_syncd_queue_work(vfs, inode, xfs_flush_device_work);
-	delay(msecs_to_jiffies(500));
-	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
-}
-
-#define SYNCD_FLAGS	(SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR)
-STATIC void
-vfs_sync_worker(
-	vfs_t		*vfsp,
-	void		*unused)
-{
-	int		error;
-
-	if (!(vfsp->vfs_flag & VFS_RDONLY))
-		VFS_SYNC(vfsp, SYNCD_FLAGS, NULL, error);
-	vfsp->vfs_sync_seq++;
-	wmb();
-	wake_up(&vfsp->vfs_wait_single_sync_task);
-}
-
-STATIC int
-xfssyncd(
-	void			*arg)
-{
-	long			timeleft;
-	vfs_t			*vfsp = (vfs_t *) arg;
-	struct vfs_sync_work	*work, *n;
-	LIST_HEAD		(tmp);
-
-	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
-	for (;;) {
-		timeleft = schedule_timeout_interruptible(timeleft);
-		/* swsusp */
-		try_to_freeze();
-		if (kthread_should_stop() && list_empty(&vfsp->vfs_sync_list))
-			break;
-
-		spin_lock(&vfsp->vfs_sync_lock);
-		/*
-		 * We can get woken by laptop mode, to do a sync -
-		 * that's the (only!) case where the list would be
-		 * empty with time remaining.
-		 */
-		if (!timeleft || list_empty(&vfsp->vfs_sync_list)) {
-			if (!timeleft)
-				timeleft = xfs_syncd_centisecs *
-							msecs_to_jiffies(10);
-			INIT_LIST_HEAD(&vfsp->vfs_sync_work.w_list);
-			list_add_tail(&vfsp->vfs_sync_work.w_list,
-					&vfsp->vfs_sync_list);
-		}
-		list_for_each_entry_safe(work, n, &vfsp->vfs_sync_list, w_list)
-			list_move(&work->w_list, &tmp);
-		spin_unlock(&vfsp->vfs_sync_lock);
-
-		list_for_each_entry_safe(work, n, &tmp, w_list) {
-			(*work->w_syncer)(vfsp, work->w_data);
-			list_del(&work->w_list);
-			if (work == &vfsp->vfs_sync_work)
-				continue;
-			kmem_free(work, sizeof(struct vfs_sync_work));
-		}
-	}
-
-	return 0;
-}
-
-STATIC int
-linvfs_start_syncd(
-	vfs_t			*vfsp)
-{
-	vfsp->vfs_sync_work.w_syncer = vfs_sync_worker;
-	vfsp->vfs_sync_work.w_vfs = vfsp;
-	vfsp->vfs_sync_task = kthread_run(xfssyncd, vfsp, "xfssyncd");
-	if (IS_ERR(vfsp->vfs_sync_task))
-		return -PTR_ERR(vfsp->vfs_sync_task);
-	return 0;
-}
-
-STATIC void
-linvfs_stop_syncd(
-	vfs_t			*vfsp)
-{
-	kthread_stop(vfsp->vfs_sync_task);
-}
-
-STATIC void
-linvfs_put_super(
-	struct super_block	*sb)
-{
-	vfs_t			*vfsp = LINVFS_GET_VFS(sb);
-	int			error;
-
-	linvfs_stop_syncd(vfsp);
-	VFS_SYNC(vfsp, SYNC_ATTR|SYNC_DELWRI, NULL, error);
-	if (!error)
-		VFS_UNMOUNT(vfsp, 0, NULL, error);
-	if (error) {
-		printk("XFS unmount got error %d\n", error);
-		printk("%s: vfsp/0x%p left dangling!\n", __FUNCTION__, vfsp);
-		return;
-	}
-
-	vfs_deallocate(vfsp);
-}
-
-STATIC void
-linvfs_write_super(
-	struct super_block	*sb)
-{
-	vfs_t			*vfsp = LINVFS_GET_VFS(sb);
-	int			error;
-
-	if (sb->s_flags & MS_RDONLY) {
-		sb->s_dirt = 0; /* paranoia */
-		return;
-	}
-	/* Push the log and superblock a little */
-	VFS_SYNC(vfsp, SYNC_FSDATA, NULL, error);
-	sb->s_dirt = 0;
-}
-
-STATIC int
-linvfs_sync_super(
-	struct super_block	*sb,
-	int			wait)
-{
-	vfs_t		*vfsp = LINVFS_GET_VFS(sb);
-	int		error;
-	int		flags = SYNC_FSDATA;
-
-	if (unlikely(sb->s_frozen == SB_FREEZE_WRITE))
-		flags = SYNC_QUIESCE;
-	else
-		flags = SYNC_FSDATA | (wait ? SYNC_WAIT : 0);
-
-	VFS_SYNC(vfsp, flags, NULL, error);
-	sb->s_dirt = 0;
-
-	if (unlikely(laptop_mode)) {
-		int	prev_sync_seq = vfsp->vfs_sync_seq;
-
-		/*
-		 * The disk must be active because we're syncing.
-		 * We schedule xfssyncd now (now that the disk is
-		 * active) instead of later (when it might not be).
-		 */
-		wake_up_process(vfsp->vfs_sync_task);
-		/*
-		 * We have to wait for the sync iteration to complete.
-		 * If we don't, the disk activity caused by the sync
-		 * will come after the sync is completed, and that
-		 * triggers another sync from laptop mode.
-		 */
-		wait_event(vfsp->vfs_wait_single_sync_task,
-				vfsp->vfs_sync_seq != prev_sync_seq);
-	}
-
-	return -error;
-}
-
-STATIC int
-linvfs_statfs(
-	struct super_block	*sb,
-	struct kstatfs		*statp)
-{
-	vfs_t			*vfsp = LINVFS_GET_VFS(sb);
-	int			error;
-
-	VFS_STATVFS(vfsp, statp, NULL, error);
-	return -error;
-}
-
-STATIC int
-linvfs_remount(
-	struct super_block	*sb,
-	int			*flags,
-	char			*options)
-{
-	vfs_t			*vfsp = LINVFS_GET_VFS(sb);
-	struct xfs_mount_args	*args = xfs_args_allocate(sb);
-	int			error;
-
-	VFS_PARSEARGS(vfsp, options, args, 1, error);
-	if (!error)
-		VFS_MNTUPDATE(vfsp, flags, args, error);
-	kmem_free(args, sizeof(*args));
-	return -error;
-}
-
-STATIC void
-linvfs_freeze_fs(
-	struct super_block	*sb)
-{
-	VFS_FREEZE(LINVFS_GET_VFS(sb));
-}
-
-STATIC int
-linvfs_show_options(
-	struct seq_file		*m,
-	struct vfsmount		*mnt)
-{
-	struct vfs		*vfsp = LINVFS_GET_VFS(mnt->mnt_sb);
-	int			error;
-
-	VFS_SHOWARGS(vfsp, m, error);
-	return error;
-}
-
-STATIC int
-linvfs_quotasync(
-	struct super_block	*sb,
-	int			type)
-{
-	struct vfs		*vfsp = LINVFS_GET_VFS(sb);
-	int			error;
-
-	VFS_QUOTACTL(vfsp, Q_XQUOTASYNC, 0, (caddr_t)NULL, error);
-	return -error;
-}
-
-STATIC int
-linvfs_getxstate(
-	struct super_block	*sb,
-	struct fs_quota_stat	*fqs)
-{
-	struct vfs		*vfsp = LINVFS_GET_VFS(sb);
-	int			error;
-
-	VFS_QUOTACTL(vfsp, Q_XGETQSTAT, 0, (caddr_t)fqs, error);
-	return -error;
-}
-
-STATIC int
-linvfs_setxstate(
-	struct super_block	*sb,
-	unsigned int		flags,
-	int			op)
-{
-	struct vfs		*vfsp = LINVFS_GET_VFS(sb);
-	int			error;
-
-	VFS_QUOTACTL(vfsp, op, 0, (caddr_t)&flags, error);
-	return -error;
-}
-
-STATIC int
-linvfs_getxquota(
-	struct super_block	*sb,
-	int			type,
-	qid_t			id,
-	struct fs_disk_quota	*fdq)
-{
-	struct vfs		*vfsp = LINVFS_GET_VFS(sb);
-	int			error, getmode;
-
-	getmode = (type == USRQUOTA) ? Q_XGETQUOTA :
-		 ((type == GRPQUOTA) ? Q_XGETGQUOTA : Q_XGETPQUOTA);
-	VFS_QUOTACTL(vfsp, getmode, id, (caddr_t)fdq, error);
-	return -error;
-}
-
-STATIC int
-linvfs_setxquota(
-	struct super_block	*sb,
-	int			type,
-	qid_t			id,
-	struct fs_disk_quota	*fdq)
-{
-	struct vfs		*vfsp = LINVFS_GET_VFS(sb);
-	int			error, setmode;
-
-	setmode = (type == USRQUOTA) ? Q_XSETQLIM :
-		 ((type == GRPQUOTA) ? Q_XSETGQLIM : Q_XSETPQLIM);
-	VFS_QUOTACTL(vfsp, setmode, id, (caddr_t)fdq, error);
-	return -error;
-}
-
-STATIC int
-linvfs_fill_super(
-	struct super_block	*sb,
-	void			*data,
-	int			silent)
-{
-	vnode_t			*rootvp;
-	struct vfs		*vfsp = vfs_allocate();
-	struct xfs_mount_args	*args = xfs_args_allocate(sb);
-	struct kstatfs		statvfs;
-	int			error, error2;
-
-	vfsp->vfs_super = sb;
-	LINVFS_SET_VFS(sb, vfsp);
-	if (sb->s_flags & MS_RDONLY)
-		vfsp->vfs_flag |= VFS_RDONLY;
-	bhv_insert_all_vfsops(vfsp);
-
-	VFS_PARSEARGS(vfsp, (char *)data, args, 0, error);
-	if (error) {
-		bhv_remove_all_vfsops(vfsp, 1);
-		goto fail_vfsop;
-	}
-
-	sb_min_blocksize(sb, BBSIZE);
-#ifdef CONFIG_XFS_EXPORT
-	sb->s_export_op = &linvfs_export_ops;
-#endif
-	sb->s_qcop = &linvfs_qops;
-	sb->s_op = &linvfs_sops;
-
-	VFS_MOUNT(vfsp, args, NULL, error);
-	if (error) {
-		bhv_remove_all_vfsops(vfsp, 1);
-		goto fail_vfsop;
-	}
-
-	VFS_STATVFS(vfsp, &statvfs, NULL, error);
-	if (error)
-		goto fail_unmount;
-
-	sb->s_dirt = 1;
-	sb->s_magic = statvfs.f_type;
-	sb->s_blocksize = statvfs.f_bsize;
-	sb->s_blocksize_bits = ffs(statvfs.f_bsize) - 1;
-	sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
-	sb->s_time_gran = 1;
-	set_posix_acl_flag(sb);
-
-	VFS_ROOT(vfsp, &rootvp, error);
-	if (error)
-		goto fail_unmount;
-
-	sb->s_root = d_alloc_root(LINVFS_GET_IP(rootvp));
-	if (!sb->s_root) {
-		error = ENOMEM;
-		goto fail_vnrele;
-	}
-	if (is_bad_inode(sb->s_root->d_inode)) {
-		error = EINVAL;
-		goto fail_vnrele;
-	}
-	if ((error = linvfs_start_syncd(vfsp)))
-		goto fail_vnrele;
-	vn_trace_exit(rootvp, __FUNCTION__, (inst_t *)__return_address);
-
-	kmem_free(args, sizeof(*args));
-	return 0;
-
-fail_vnrele:
-	if (sb->s_root) {
-		dput(sb->s_root);
-		sb->s_root = NULL;
-	} else {
-		VN_RELE(rootvp);
-	}
-
-fail_unmount:
-	VFS_UNMOUNT(vfsp, 0, NULL, error2);
-
-fail_vfsop:
-	vfs_deallocate(vfsp);
-	kmem_free(args, sizeof(*args));
-	return -error;
-}
-
-STATIC struct super_block *
-linvfs_get_sb(
-	struct file_system_type	*fs_type,
-	int			flags,
-	const char		*dev_name,
-	void			*data)
-{
-	return get_sb_bdev(fs_type, flags, dev_name, data, linvfs_fill_super);
-}
-
-STATIC struct super_operations linvfs_sops = {
-	.alloc_inode		= linvfs_alloc_inode,
-	.destroy_inode		= linvfs_destroy_inode,
-	.write_inode		= linvfs_write_inode,
-	.clear_inode		= linvfs_clear_inode,
-	.put_super		= linvfs_put_super,
-	.write_super		= linvfs_write_super,
-	.sync_fs		= linvfs_sync_super,
-	.write_super_lockfs	= linvfs_freeze_fs,
-	.statfs			= linvfs_statfs,
-	.remount_fs		= linvfs_remount,
-	.show_options		= linvfs_show_options,
-};
-
-STATIC struct quotactl_ops linvfs_qops = {
-	.quota_sync		= linvfs_quotasync,
-	.get_xstate		= linvfs_getxstate,
-	.set_xstate		= linvfs_setxstate,
-	.get_xquota		= linvfs_getxquota,
-	.set_xquota		= linvfs_setxquota,
-};
-
-STATIC struct file_system_type xfs_fs_type = {
-	.owner			= THIS_MODULE,
-	.name			= "xfs",
-	.get_sb			= linvfs_get_sb,
-	.kill_sb		= kill_block_super,
-	.fs_flags		= FS_REQUIRES_DEV,
-};
-
-
-STATIC int __init
-init_xfs_fs( void )
-{
-	int			error;
-	struct sysinfo		si;
-	static char		message[] __initdata = KERN_INFO \
-		XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n";
-
-	printk(message);
-
-	si_meminfo(&si);
-	xfs_physmem = si.totalram;
-
-	ktrace_init(64);
-
-	error = linvfs_init_zones();
-	if (error < 0)
-		goto undo_zones;
-
-	error = xfs_buf_init();
-	if (error < 0)
-		goto undo_buffers;
-
-	vn_init();
-	xfs_init();
-	uuid_init();
-	vfs_initquota();
-
-	error = register_filesystem(&xfs_fs_type);
-	if (error)
-		goto undo_register;
-	XFS_DM_INIT(&xfs_fs_type);
-	return 0;
-
-undo_register:
-	xfs_buf_terminate();
-
-undo_buffers:
-	linvfs_destroy_zones();
-
-undo_zones:
-	return error;
-}
-
-STATIC void __exit
-exit_xfs_fs( void )
-{
-	vfs_exitquota();
-	XFS_DM_EXIT(&xfs_fs_type);
-	unregister_filesystem(&xfs_fs_type);
-	xfs_cleanup();
-	xfs_buf_terminate();
-	linvfs_destroy_zones();
-	ktrace_uninit();
-}
-
-module_init(init_xfs_fs);
-module_exit(exit_xfs_fs);
-
-MODULE_AUTHOR("Silicon Graphics, Inc.");
-MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled");
-MODULE_LICENSE("GPL");
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
deleted file mode 100644
index a0256497242..00000000000
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2001-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-
-static struct ctl_table_header *xfs_table_header;
-
-#ifdef CONFIG_PROC_FS
-STATIC int
-xfs_stats_clear_proc_handler(
-	ctl_table	*ctl,
-	int		write,
-	struct file	*filp,
-	void		__user *buffer,
-	size_t		*lenp,
-	loff_t		*ppos)
-{
-	int		c, ret, *valp = ctl->data;
-	__uint32_t	vn_active;
-
-	ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos);
-
-	if (!ret && write && *valp) {
-		printk("XFS Clearing xfsstats\n");
-		for (c = 0; c < NR_CPUS; c++) {
-			if (!cpu_possible(c)) continue;
-			preempt_disable();
-			/* save vn_active, it's a universal truth! */
-			vn_active = per_cpu(xfsstats, c).vn_active;
-			memset(&per_cpu(xfsstats, c), 0,
-			       sizeof(struct xfsstats));
-			per_cpu(xfsstats, c).vn_active = vn_active;
-			preempt_enable();
-		}
-		xfs_stats_clear = 0;
-	}
-
-	return ret;
-}
-#endif /* CONFIG_PROC_FS */
-
-STATIC ctl_table xfs_table[] = {
-	{XFS_RESTRICT_CHOWN, "restrict_chown", &xfs_params.restrict_chown.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL,
-	&xfs_params.restrict_chown.min, &xfs_params.restrict_chown.max},
-
-	{XFS_SGID_INHERIT, "irix_sgid_inherit", &xfs_params.sgid_inherit.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL,
-	&xfs_params.sgid_inherit.min, &xfs_params.sgid_inherit.max},
-
-	{XFS_SYMLINK_MODE, "irix_symlink_mode", &xfs_params.symlink_mode.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL,
-	&xfs_params.symlink_mode.min, &xfs_params.symlink_mode.max},
-
-	{XFS_PANIC_MASK, "panic_mask", &xfs_params.panic_mask.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL,
-	&xfs_params.panic_mask.min, &xfs_params.panic_mask.max},
-
-	{XFS_ERRLEVEL, "error_level", &xfs_params.error_level.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL,
-	&xfs_params.error_level.min, &xfs_params.error_level.max},
-
-	{XFS_SYNCD_TIMER, "xfssyncd_centisecs", &xfs_params.syncd_timer.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL,
-	&xfs_params.syncd_timer.min, &xfs_params.syncd_timer.max},
-
-	{XFS_INHERIT_SYNC, "inherit_sync", &xfs_params.inherit_sync.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL,
-	&xfs_params.inherit_sync.min, &xfs_params.inherit_sync.max},
-
-	{XFS_INHERIT_NODUMP, "inherit_nodump", &xfs_params.inherit_nodump.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL,
-	&xfs_params.inherit_nodump.min, &xfs_params.inherit_nodump.max},
-
-	{XFS_INHERIT_NOATIME, "inherit_noatime", &xfs_params.inherit_noatim.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL,
-	&xfs_params.inherit_noatim.min, &xfs_params.inherit_noatim.max},
-
-	{XFS_BUF_TIMER, "xfsbufd_centisecs", &xfs_params.xfs_buf_timer.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL,
-	&xfs_params.xfs_buf_timer.min, &xfs_params.xfs_buf_timer.max},
-
-	{XFS_BUF_AGE, "age_buffer_centisecs", &xfs_params.xfs_buf_age.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL,
-	&xfs_params.xfs_buf_age.min, &xfs_params.xfs_buf_age.max},
-
-	{XFS_INHERIT_NOSYM, "inherit_nosymlinks", &xfs_params.inherit_nosym.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL,
-	&xfs_params.inherit_nosym.min, &xfs_params.inherit_nosym.max},
-
-	{XFS_ROTORSTEP, "rotorstep", &xfs_params.rotorstep.val,
-	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
-	&sysctl_intvec, NULL,
-	&xfs_params.rotorstep.min, &xfs_params.rotorstep.max},
-
-	/* please keep this the last entry */
-#ifdef CONFIG_PROC_FS
-	{XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear.val,
-	sizeof(int), 0644, NULL, &xfs_stats_clear_proc_handler,
-	&sysctl_intvec, NULL,
-	&xfs_params.stats_clear.min, &xfs_params.stats_clear.max},
-#endif /* CONFIG_PROC_FS */
-
-	{0}
-};
-
-STATIC ctl_table xfs_dir_table[] = {
-	{FS_XFS, "xfs", NULL, 0, 0555, xfs_table},
-	{0}
-};
-
-STATIC ctl_table xfs_root_table[] = {
-	{CTL_FS, "fs",  NULL, 0, 0555, xfs_dir_table},
-	{0}
-};
-
-void
-xfs_sysctl_register(void)
-{
-	xfs_table_header = register_sysctl_table(xfs_root_table, 1);
-}
-
-void
-xfs_sysctl_unregister(void)
-{
-	if (xfs_table_header)
-		unregister_sysctl_table(xfs_table_header);
-}
diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c
deleted file mode 100644
index c855d62e534..00000000000
--- a/fs/xfs/linux-2.6/xfs_vfs.c
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_inum.h"
-#include "xfs_log.h"
-#include "xfs_clnt.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_imap.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_quota.h"
-
-int
-vfs_mount(
-	struct bhv_desc		*bdp,
-	struct xfs_mount_args	*args,
-	struct cred		*cr)
-{
-	struct bhv_desc		*next = bdp;
-
-	ASSERT(next);
-	while (! (bhvtovfsops(next))->vfs_mount)
-		next = BHV_NEXT(next);
-	return ((*bhvtovfsops(next)->vfs_mount)(next, args, cr));
-}
-
-int
-vfs_parseargs(
-	struct bhv_desc		*bdp,
-	char			*s,
-	struct xfs_mount_args	*args,
-	int			f)
-{
-	struct bhv_desc		*next = bdp;
-
-	ASSERT(next);
-	while (! (bhvtovfsops(next))->vfs_parseargs)
-		next = BHV_NEXT(next);
-	return ((*bhvtovfsops(next)->vfs_parseargs)(next, s, args, f));
-}
-
-int
-vfs_showargs(
-	struct bhv_desc		*bdp,
-	struct seq_file		*m)
-{
-	struct bhv_desc		*next = bdp;
-
-	ASSERT(next);
-	while (! (bhvtovfsops(next))->vfs_showargs)
-		next = BHV_NEXT(next);
-	return ((*bhvtovfsops(next)->vfs_showargs)(next, m));
-}
-
-int
-vfs_unmount(
-	struct bhv_desc		*bdp,
-	int			fl,
-	struct cred		*cr)
-{
-	struct bhv_desc		*next = bdp;
-
-	ASSERT(next);
-	while (! (bhvtovfsops(next))->vfs_unmount)
-		next = BHV_NEXT(next);
-	return ((*bhvtovfsops(next)->vfs_unmount)(next, fl, cr));
-}
-
-int
-vfs_mntupdate(
-	struct bhv_desc		*bdp,
-	int			*fl,
-	struct xfs_mount_args	*args)
-{
-	struct bhv_desc		*next = bdp;
-
-	ASSERT(next);
-	while (! (bhvtovfsops(next))->vfs_mntupdate)
-		next = BHV_NEXT(next);
-	return ((*bhvtovfsops(next)->vfs_mntupdate)(next, fl, args));
-}
-
-int
-vfs_root(
-	struct bhv_desc		*bdp,
-	struct vnode		**vpp)
-{
-	struct bhv_desc		*next = bdp;
-
-	ASSERT(next);
-	while (! (bhvtovfsops(next))->vfs_root)
-		next = BHV_NEXT(next);
-	return ((*bhvtovfsops(next)->vfs_root)(next, vpp));
-}
-
-int
-vfs_statvfs(
-	struct bhv_desc		*bdp,
-	xfs_statfs_t		*sp,
-	struct vnode		*vp)
-{
-	struct bhv_desc		*next = bdp;
-
-	ASSERT(next);
-	while (! (bhvtovfsops(next))->vfs_statvfs)
-		next = BHV_NEXT(next);
-	return ((*bhvtovfsops(next)->vfs_statvfs)(next, sp, vp));
-}
-
-int
-vfs_sync(
-	struct bhv_desc		*bdp,
-	int			fl,
-	struct cred		*cr)
-{
-	struct bhv_desc		*next = bdp;
-
-	ASSERT(next);
-	while (! (bhvtovfsops(next))->vfs_sync)
-		next = BHV_NEXT(next);
-	return ((*bhvtovfsops(next)->vfs_sync)(next, fl, cr));
-}
-
-int
-vfs_vget(
-	struct bhv_desc		*bdp,
-	struct vnode		**vpp,
-	struct fid		*fidp)
-{
-	struct bhv_desc		*next = bdp;
-
-	ASSERT(next);
-	while (! (bhvtovfsops(next))->vfs_vget)
-		next = BHV_NEXT(next);
-	return ((*bhvtovfsops(next)->vfs_vget)(next, vpp, fidp));
-}
-
-int
-vfs_dmapiops(
-	struct bhv_desc		*bdp,
-	caddr_t			addr)
-{
-	struct bhv_desc		*next = bdp;
-
-	ASSERT(next);
-	while (! (bhvtovfsops(next))->vfs_dmapiops)
-		next = BHV_NEXT(next);
-	return ((*bhvtovfsops(next)->vfs_dmapiops)(next, addr));
-}
-
-int
-vfs_quotactl(
-	struct bhv_desc		*bdp,
-	int			cmd,
-	int			id,
-	caddr_t			addr)
-{
-	struct bhv_desc		*next = bdp;
-
-	ASSERT(next);
-	while (! (bhvtovfsops(next))->vfs_quotactl)
-		next = BHV_NEXT(next);
-	return ((*bhvtovfsops(next)->vfs_quotactl)(next, cmd, id, addr));
-}
-
-void
-vfs_init_vnode(
-	struct bhv_desc		*bdp,
-	struct vnode		*vp,
-	struct bhv_desc		*bp,
-	int			unlock)
-{
-	struct bhv_desc		*next = bdp;
-
-	ASSERT(next);
-	while (! (bhvtovfsops(next))->vfs_init_vnode)
-		next = BHV_NEXT(next);
-	((*bhvtovfsops(next)->vfs_init_vnode)(next, vp, bp, unlock));
-}
-
-void
-vfs_force_shutdown(
-	struct bhv_desc		*bdp,
-	int			fl,
-	char			*file,
-	int			line)
-{
-	struct bhv_desc		*next = bdp;
-
-	ASSERT(next);
-	while (! (bhvtovfsops(next))->vfs_force_shutdown)
-		next = BHV_NEXT(next);
-	((*bhvtovfsops(next)->vfs_force_shutdown)(next, fl, file, line));
-}
-
-void
-vfs_freeze(
-	struct bhv_desc		*bdp)
-{
-	struct bhv_desc		*next = bdp;
-
-	ASSERT(next);
-	while (! (bhvtovfsops(next))->vfs_freeze)
-		next = BHV_NEXT(next);
-	((*bhvtovfsops(next)->vfs_freeze)(next));
-}
-
-vfs_t *
-vfs_allocate( void )
-{
-	struct vfs		*vfsp;
-
-	vfsp = kmem_zalloc(sizeof(vfs_t), KM_SLEEP);
-	bhv_head_init(VFS_BHVHEAD(vfsp), "vfs");
-	INIT_LIST_HEAD(&vfsp->vfs_sync_list);
-	spin_lock_init(&vfsp->vfs_sync_lock);
-	init_waitqueue_head(&vfsp->vfs_wait_single_sync_task);
-	return vfsp;
-}
-
-void
-vfs_deallocate(
-	struct vfs		*vfsp)
-{
-	bhv_head_destroy(VFS_BHVHEAD(vfsp));
-	kmem_free(vfsp, sizeof(vfs_t));
-}
-
-void
-vfs_insertops(
-	struct vfs		*vfsp,
-	struct bhv_vfsops	*vfsops)
-{
-	struct bhv_desc		*bdp;
-
-	bdp = kmem_alloc(sizeof(struct bhv_desc), KM_SLEEP);
-	bhv_desc_init(bdp, NULL, vfsp, vfsops);
-	bhv_insert(&vfsp->vfs_bh, bdp);
-}
-
-void
-vfs_insertbhv(
-	struct vfs		*vfsp,
-	struct bhv_desc		*bdp,
-	struct vfsops		*vfsops,
-	void			*mount)
-{
-	bhv_desc_init(bdp, mount, vfsp, vfsops);
-	bhv_insert_initial(&vfsp->vfs_bh, bdp);
-}
-
-void
-bhv_remove_vfsops(
-	struct vfs		*vfsp,
-	int			pos)
-{
-	struct bhv_desc		*bhv;
-
-	bhv = bhv_lookup_range(&vfsp->vfs_bh, pos, pos);
-	if (!bhv)
-		return;
-	bhv_remove(&vfsp->vfs_bh, bhv);
-	kmem_free(bhv, sizeof(*bhv));
-}
-
-void
-bhv_remove_all_vfsops(
-	struct vfs		*vfsp,
-	int			freebase)
-{
-	struct xfs_mount	*mp;
-
-	bhv_remove_vfsops(vfsp, VFS_POSITION_QM);
-	bhv_remove_vfsops(vfsp, VFS_POSITION_DM);
-	if (!freebase)
-		return;
-	mp = XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfsp), &xfs_vfsops));
-	VFS_REMOVEBHV(vfsp, &mp->m_bhv);
-	xfs_mount_free(mp, 0);
-}
-
-void
-bhv_insert_all_vfsops(
-	struct vfs		*vfsp)
-{
-	struct xfs_mount	*mp;
-
-	mp = xfs_mount_init();
-	vfs_insertbhv(vfsp, &mp->m_bhv, &xfs_vfsops, mp);
-	vfs_insertdmapi(vfsp);
-	vfs_insertquota(vfsp);
-}
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
deleted file mode 100644
index 57caf9eddee..00000000000
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_VFS_H__
-#define __XFS_VFS_H__
-
-#include <linux/vfs.h>
-#include "xfs_fs.h"
-
-struct fid;
-struct vfs;
-struct cred;
-struct vnode;
-struct kstatfs;
-struct seq_file;
-struct super_block;
-struct xfs_mount_args;
-
-typedef struct kstatfs xfs_statfs_t;
-
-typedef struct vfs_sync_work {
-	struct list_head	w_list;
-	struct vfs		*w_vfs;
-	void			*w_data;	/* syncer routine argument */
-	void			(*w_syncer)(struct vfs *, void *);
-} vfs_sync_work_t;
-
-typedef struct vfs {
-	u_int			vfs_flag;	/* flags */
-	xfs_fsid_t		vfs_fsid;	/* file system ID */
-	xfs_fsid_t		*vfs_altfsid;	/* An ID fixed for life of FS */
-	bhv_head_t		vfs_bh;		/* head of vfs behavior chain */
-	struct super_block	*vfs_super;	/* generic superblock pointer */
-	struct task_struct	*vfs_sync_task;	/* generalised sync thread */
-	vfs_sync_work_t		vfs_sync_work;	/* work item for VFS_SYNC */
-	struct list_head	vfs_sync_list;	/* sync thread work item list */
-	spinlock_t		vfs_sync_lock;	/* work item list lock */
-	int 			vfs_sync_seq;	/* sync thread generation no. */
-	wait_queue_head_t	vfs_wait_single_sync_task;
-} vfs_t;
-
-#define vfs_fbhv		vfs_bh.bh_first	/* 1st on vfs behavior chain */
-
-#define bhvtovfs(bdp)		( (struct vfs *)BHV_VOBJ(bdp) )
-#define bhvtovfsops(bdp)	( (struct vfsops *)BHV_OPS(bdp) )
-#define VFS_BHVHEAD(vfs)	( &(vfs)->vfs_bh )
-#define VFS_REMOVEBHV(vfs, bdp)	( bhv_remove(VFS_BHVHEAD(vfs), bdp) )
-
-#define VFS_POSITION_BASE	BHV_POSITION_BASE	/* chain bottom */
-#define VFS_POSITION_TOP	BHV_POSITION_TOP	/* chain top */
-#define VFS_POSITION_INVALID	BHV_POSITION_INVALID	/* invalid pos. num */
-
-typedef enum {
-	VFS_BHV_UNKNOWN,	/* not specified */
-	VFS_BHV_XFS,		/* xfs */
-	VFS_BHV_DM,		/* data migration */
-	VFS_BHV_QM,		/* quota manager */
-	VFS_BHV_IO,		/* IO path */
-	VFS_BHV_END		/* housekeeping end-of-range */
-} vfs_bhv_t;
-
-#define VFS_POSITION_XFS	(BHV_POSITION_BASE)
-#define VFS_POSITION_DM		(VFS_POSITION_BASE+10)
-#define VFS_POSITION_QM		(VFS_POSITION_BASE+20)
-#define VFS_POSITION_IO		(VFS_POSITION_BASE+30)
-
-#define VFS_RDONLY		0x0001	/* read-only vfs */
-#define VFS_GRPID		0x0002	/* group-ID assigned from directory */
-#define VFS_DMI			0x0004	/* filesystem has the DMI enabled */
-#define VFS_32BITINODES		0x0008	/* do not use inums above 32 bits */
-#define VFS_END			0x0008	/* max flag */
-
-#define SYNC_ATTR		0x0001	/* sync attributes */
-#define SYNC_CLOSE		0x0002	/* close file system down */
-#define SYNC_DELWRI		0x0004	/* look at delayed writes */
-#define SYNC_WAIT		0x0008	/* wait for i/o to complete */
-#define SYNC_BDFLUSH		0x0010	/* BDFLUSH is calling -- don't block */
-#define SYNC_FSDATA		0x0020	/* flush fs data (e.g. superblocks) */
-#define SYNC_REFCACHE		0x0040  /* prune some of the nfs ref cache */
-#define SYNC_REMOUNT		0x0080  /* remount readonly, no dummy LRs */
-#define SYNC_QUIESCE		0x0100  /* quiesce fileystem for a snapshot */
-
-typedef int	(*vfs_mount_t)(bhv_desc_t *,
-				struct xfs_mount_args *, struct cred *);
-typedef int	(*vfs_parseargs_t)(bhv_desc_t *, char *,
-				struct xfs_mount_args *, int);
-typedef	int	(*vfs_showargs_t)(bhv_desc_t *, struct seq_file *);
-typedef int	(*vfs_unmount_t)(bhv_desc_t *, int, struct cred *);
-typedef int	(*vfs_mntupdate_t)(bhv_desc_t *, int *,
-				struct xfs_mount_args *);
-typedef int	(*vfs_root_t)(bhv_desc_t *, struct vnode **);
-typedef int	(*vfs_statvfs_t)(bhv_desc_t *, xfs_statfs_t *, struct vnode *);
-typedef int	(*vfs_sync_t)(bhv_desc_t *, int, struct cred *);
-typedef int	(*vfs_vget_t)(bhv_desc_t *, struct vnode **, struct fid *);
-typedef int	(*vfs_dmapiops_t)(bhv_desc_t *, caddr_t);
-typedef int	(*vfs_quotactl_t)(bhv_desc_t *, int, int, caddr_t);
-typedef void	(*vfs_init_vnode_t)(bhv_desc_t *,
-				struct vnode *, bhv_desc_t *, int);
-typedef void	(*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int);
-typedef void	(*vfs_freeze_t)(bhv_desc_t *);
-
-typedef struct vfsops {
-	bhv_position_t		vf_position;	/* behavior chain position */
-	vfs_mount_t		vfs_mount;	/* mount file system */
-	vfs_parseargs_t		vfs_parseargs;	/* parse mount options */
-	vfs_showargs_t		vfs_showargs;	/* unparse mount options */
-	vfs_unmount_t		vfs_unmount;	/* unmount file system */
-	vfs_mntupdate_t		vfs_mntupdate;	/* update file system options */
-	vfs_root_t		vfs_root;	/* get root vnode */
-	vfs_statvfs_t		vfs_statvfs;	/* file system statistics */
-	vfs_sync_t		vfs_sync;	/* flush files */
-	vfs_vget_t		vfs_vget;	/* get vnode from fid */
-	vfs_dmapiops_t		vfs_dmapiops;	/* data migration */
-	vfs_quotactl_t		vfs_quotactl;	/* disk quota */
-	vfs_init_vnode_t	vfs_init_vnode;	/* initialize a new vnode */
-	vfs_force_shutdown_t	vfs_force_shutdown;	/* crash and burn */
-	vfs_freeze_t		vfs_freeze;	/* freeze fs for snapshot */
-} vfsops_t;
-
-/*
- * VFS's.  Operates on vfs structure pointers (starts at bhv head).
- */
-#define VHEAD(v)			((v)->vfs_fbhv)
-#define VFS_MOUNT(v, ma,cr, rv)		((rv) = vfs_mount(VHEAD(v), ma,cr))
-#define VFS_PARSEARGS(v, o,ma,f, rv)	((rv) = vfs_parseargs(VHEAD(v), o,ma,f))
-#define VFS_SHOWARGS(v, m, rv)		((rv) = vfs_showargs(VHEAD(v), m))
-#define VFS_UNMOUNT(v, f, cr, rv)	((rv) = vfs_unmount(VHEAD(v), f,cr))
-#define VFS_MNTUPDATE(v, fl, args, rv)	((rv) = vfs_mntupdate(VHEAD(v), fl, args))
-#define VFS_ROOT(v, vpp, rv)		((rv) = vfs_root(VHEAD(v), vpp))
-#define VFS_STATVFS(v, sp,vp, rv)	((rv) = vfs_statvfs(VHEAD(v), sp,vp))
-#define VFS_SYNC(v, flag,cr, rv)	((rv) = vfs_sync(VHEAD(v), flag,cr))
-#define VFS_VGET(v, vpp,fidp, rv)	((rv) = vfs_vget(VHEAD(v), vpp,fidp))
-#define VFS_DMAPIOPS(v, p, rv)		((rv) = vfs_dmapiops(VHEAD(v), p))
-#define VFS_QUOTACTL(v, c,id,p, rv)	((rv) = vfs_quotactl(VHEAD(v), c,id,p))
-#define VFS_INIT_VNODE(v, vp,b,ul)	( vfs_init_vnode(VHEAD(v), vp,b,ul) )
-#define VFS_FORCE_SHUTDOWN(v, fl,f,l)	( vfs_force_shutdown(VHEAD(v), fl,f,l) )
-#define VFS_FREEZE(v)			( vfs_freeze(VHEAD(v)) )
-
-/*
- * PVFS's.  Operates on behavior descriptor pointers.
- */
-#define PVFS_MOUNT(b, ma,cr, rv)	((rv) = vfs_mount(b, ma,cr))
-#define PVFS_PARSEARGS(b, o,ma,f, rv)	((rv) = vfs_parseargs(b, o,ma,f))
-#define PVFS_SHOWARGS(b, m, rv)		((rv) = vfs_showargs(b, m))
-#define PVFS_UNMOUNT(b, f,cr, rv)	((rv) = vfs_unmount(b, f,cr))
-#define PVFS_MNTUPDATE(b, fl, args, rv)	((rv) = vfs_mntupdate(b, fl, args))
-#define PVFS_ROOT(b, vpp, rv)		((rv) = vfs_root(b, vpp))
-#define PVFS_STATVFS(b, sp,vp, rv)	((rv) = vfs_statvfs(b, sp,vp))
-#define PVFS_SYNC(b, flag,cr, rv)	((rv) = vfs_sync(b, flag,cr))
-#define PVFS_VGET(b, vpp,fidp, rv)	((rv) = vfs_vget(b, vpp,fidp))
-#define PVFS_DMAPIOPS(b, p, rv)		((rv) = vfs_dmapiops(b, p))
-#define PVFS_QUOTACTL(b, c,id,p, rv)	((rv) = vfs_quotactl(b, c,id,p))
-#define PVFS_INIT_VNODE(b, vp,b2,ul)	( vfs_init_vnode(b, vp,b2,ul) )
-#define PVFS_FORCE_SHUTDOWN(b, fl,f,l)	( vfs_force_shutdown(b, fl,f,l) )
-#define PVFS_FREEZE(b)			( vfs_freeze(b) )
-
-extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *);
-extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int);
-extern int vfs_showargs(bhv_desc_t *, struct seq_file *);
-extern int vfs_unmount(bhv_desc_t *, int, struct cred *);
-extern int vfs_mntupdate(bhv_desc_t *, int *, struct xfs_mount_args *);
-extern int vfs_root(bhv_desc_t *, struct vnode **);
-extern int vfs_statvfs(bhv_desc_t *, xfs_statfs_t *, struct vnode *);
-extern int vfs_sync(bhv_desc_t *, int, struct cred *);
-extern int vfs_vget(bhv_desc_t *, struct vnode **, struct fid *);
-extern int vfs_dmapiops(bhv_desc_t *, caddr_t);
-extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t);
-extern void vfs_init_vnode(bhv_desc_t *, struct vnode *, bhv_desc_t *, int);
-extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int);
-extern void vfs_freeze(bhv_desc_t *);
-
-typedef struct bhv_vfsops {
-	struct vfsops		bhv_common;
-	void *			bhv_custom;
-} bhv_vfsops_t;
-
-#define vfs_bhv_lookup(v, id)	( bhv_lookup_range(&(v)->vfs_bh, (id), (id)) )
-#define vfs_bhv_custom(b)	( ((bhv_vfsops_t *)BHV_OPS(b))->bhv_custom )
-#define vfs_bhv_set_custom(b,o)	( (b)->bhv_custom = (void *)(o))
-#define vfs_bhv_clr_custom(b)	( (b)->bhv_custom = NULL )
-
-extern vfs_t *vfs_allocate(void);
-extern void vfs_deallocate(vfs_t *);
-extern void vfs_insertops(vfs_t *, bhv_vfsops_t *);
-extern void vfs_insertbhv(vfs_t *, bhv_desc_t *, vfsops_t *, void *);
-
-extern void bhv_insert_all_vfsops(struct vfs *);
-extern void bhv_remove_all_vfsops(struct vfs *, int);
-extern void bhv_remove_vfsops(struct vfs *, int);
-
-#define fs_frozen(vfsp)		((vfsp)->vfs_super->s_frozen)
-#define fs_check_frozen(vfsp, level) \
-	vfs_check_frozen(vfsp->vfs_super, level);
-
-#endif	/* __XFS_VFS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
deleted file mode 100644
index 260dd8415dd..00000000000
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-
-uint64_t vn_generation;		/* vnode generation number */
-DEFINE_SPINLOCK(vnumber_lock);
-
-/*
- * Dedicated vnode inactive/reclaim sync semaphores.
- * Prime number of hash buckets since address is used as the key.
- */
-#define NVSYNC                  37
-#define vptosync(v)             (&vsync[((unsigned long)v) % NVSYNC])
-STATIC wait_queue_head_t vsync[NVSYNC];
-
-void
-vn_init(void)
-{
-	int i;
-
-	for (i = 0; i < NVSYNC; i++)
-		init_waitqueue_head(&vsync[i]);
-}
-
-void
-vn_iowait(
-	struct vnode	*vp)
-{
-	wait_queue_head_t *wq = vptosync(vp);
-
-	wait_event(*wq, (atomic_read(&vp->v_iocount) == 0));
-}
-
-void
-vn_iowake(
-	struct vnode	*vp)
-{
-	if (atomic_dec_and_test(&vp->v_iocount))
-		wake_up(vptosync(vp));
-}
-
-struct vnode *
-vn_initialize(
-	struct inode	*inode)
-{
-	struct vnode	*vp = LINVFS_GET_VP(inode);
-
-	XFS_STATS_INC(vn_active);
-	XFS_STATS_INC(vn_alloc);
-
-	vp->v_flag = VMODIFIED;
-	spinlock_init(&vp->v_lock, "v_lock");
-
-	spin_lock(&vnumber_lock);
-	if (!++vn_generation)	/* v_number shouldn't be zero */
-		vn_generation++;
-	vp->v_number = vn_generation;
-	spin_unlock(&vnumber_lock);
-
-	ASSERT(VN_CACHED(vp) == 0);
-
-	/* Initialize the first behavior and the behavior chain head. */
-	vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode");
-
-	atomic_set(&vp->v_iocount, 0);
-
-#ifdef	XFS_VNODE_TRACE
-	vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP);
-#endif	/* XFS_VNODE_TRACE */
-
-	vn_trace_exit(vp, "vn_initialize", (inst_t *)__return_address);
-	return vp;
-}
-
-/*
- * Revalidate the Linux inode from the vattr.
- * Note: i_size _not_ updated; we must hold the inode
- * semaphore when doing that - callers responsibility.
- */
-void
-vn_revalidate_core(
-	struct vnode	*vp,
-	vattr_t		*vap)
-{
-	struct inode	*inode = LINVFS_GET_IP(vp);
-
-	inode->i_mode	    = vap->va_mode;
-	inode->i_nlink	    = vap->va_nlink;
-	inode->i_uid	    = vap->va_uid;
-	inode->i_gid	    = vap->va_gid;
-	inode->i_blocks	    = vap->va_nblocks;
-	inode->i_mtime	    = vap->va_mtime;
-	inode->i_ctime	    = vap->va_ctime;
-	inode->i_blksize    = vap->va_blocksize;
-	if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
-		inode->i_flags |= S_IMMUTABLE;
-	else
-		inode->i_flags &= ~S_IMMUTABLE;
-	if (vap->va_xflags & XFS_XFLAG_APPEND)
-		inode->i_flags |= S_APPEND;
-	else
-		inode->i_flags &= ~S_APPEND;
-	if (vap->va_xflags & XFS_XFLAG_SYNC)
-		inode->i_flags |= S_SYNC;
-	else
-		inode->i_flags &= ~S_SYNC;
-	if (vap->va_xflags & XFS_XFLAG_NOATIME)
-		inode->i_flags |= S_NOATIME;
-	else
-		inode->i_flags &= ~S_NOATIME;
-}
-
-/*
- * Revalidate the Linux inode from the vnode.
- */
-int
-vn_revalidate(
-	struct vnode	*vp)
-{
-	vattr_t		va;
-	int		error;
-
-	vn_trace_entry(vp, "vn_revalidate", (inst_t *)__return_address);
-	ASSERT(vp->v_fbhv != NULL);
-
-	va.va_mask = XFS_AT_STAT|XFS_AT_XFLAGS;
-	VOP_GETATTR(vp, &va, 0, NULL, error);
-	if (!error) {
-		vn_revalidate_core(vp, &va);
-		VUNMODIFY(vp);
-	}
-	return -error;
-}
-
-/*
- * Add a reference to a referenced vnode.
- */
-struct vnode *
-vn_hold(
-	struct vnode	*vp)
-{
-	struct inode	*inode;
-
-	XFS_STATS_INC(vn_hold);
-
-	VN_LOCK(vp);
-	inode = igrab(LINVFS_GET_IP(vp));
-	ASSERT(inode);
-	VN_UNLOCK(vp, 0);
-
-	return vp;
-}
-
-#ifdef	XFS_VNODE_TRACE
-
-#define KTRACE_ENTER(vp, vk, s, line, ra)			\
-	ktrace_enter(	(vp)->v_trace,				\
-/*  0 */		(void *)(__psint_t)(vk),		\
-/*  1 */		(void *)(s),				\
-/*  2 */		(void *)(__psint_t) line,		\
-/*  3 */		(void *)(__psint_t)(vn_count(vp)),	\
-/*  4 */		(void *)(ra),				\
-/*  5 */		(void *)(__psunsigned_t)(vp)->v_flag,	\
-/*  6 */		(void *)(__psint_t)current_cpu(),	\
-/*  7 */		(void *)(__psint_t)current_pid(),	\
-/*  8 */		(void *)__return_address,		\
-/*  9 */		NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-
-/*
- * Vnode tracing code.
- */
-void
-vn_trace_entry(vnode_t *vp, const char *func, inst_t *ra)
-{
-	KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra);
-}
-
-void
-vn_trace_exit(vnode_t *vp, const char *func, inst_t *ra)
-{
-	KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra);
-}
-
-void
-vn_trace_hold(vnode_t *vp, char *file, int line, inst_t *ra)
-{
-	KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra);
-}
-
-void
-vn_trace_ref(vnode_t *vp, char *file, int line, inst_t *ra)
-{
-	KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra);
-}
-
-void
-vn_trace_rele(vnode_t *vp, char *file, int line, inst_t *ra)
-{
-	KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra);
-}
-#endif	/* XFS_VNODE_TRACE */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
deleted file mode 100644
index 0fe2419461d..00000000000
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ /dev/null
@@ -1,644 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Portions Copyright (c) 1989, 1993
- *	The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-#ifndef __XFS_VNODE_H__
-#define __XFS_VNODE_H__
-
-struct uio;
-struct file;
-struct vattr;
-struct xfs_iomap;
-struct attrlist_cursor_kern;
-
-
-typedef xfs_ino_t vnumber_t;
-typedef struct dentry vname_t;
-typedef bhv_head_t vn_bhv_head_t;
-
-/*
- * MP locking protocols:
- *	v_flag, v_vfsp				VN_LOCK/VN_UNLOCK
- */
-typedef struct vnode {
-	__u32		v_flag;			/* vnode flags (see below) */
-	struct vfs	*v_vfsp;		/* ptr to containing VFS */
-	vnumber_t	v_number;		/* in-core vnode number */
-	vn_bhv_head_t	v_bh;			/* behavior head */
-	spinlock_t	v_lock;			/* VN_LOCK/VN_UNLOCK */
-	atomic_t	v_iocount;		/* outstanding I/O count */
-#ifdef XFS_VNODE_TRACE
-	struct ktrace	*v_trace;		/* trace header structure    */
-#endif
-	struct inode	v_inode;		/* Linux inode */
-	/* inode MUST be last */
-} vnode_t;
-
-#define VN_ISLNK(vp)	S_ISLNK((vp)->v_inode.i_mode)
-#define VN_ISREG(vp)	S_ISREG((vp)->v_inode.i_mode)
-#define VN_ISDIR(vp)	S_ISDIR((vp)->v_inode.i_mode)
-#define VN_ISCHR(vp)	S_ISCHR((vp)->v_inode.i_mode)
-#define VN_ISBLK(vp)	S_ISBLK((vp)->v_inode.i_mode)
-
-#define v_fbhv			v_bh.bh_first	       /* first behavior */
-#define v_fops			v_bh.bh_first->bd_ops  /* first behavior ops */
-
-#define VNODE_POSITION_BASE	BHV_POSITION_BASE	/* chain bottom */
-#define VNODE_POSITION_TOP	BHV_POSITION_TOP	/* chain top */
-#define VNODE_POSITION_INVALID	BHV_POSITION_INVALID	/* invalid pos. num */
-
-typedef enum {
-	VN_BHV_UNKNOWN,		/* not specified */
-	VN_BHV_XFS,		/* xfs */
-	VN_BHV_DM,		/* data migration */
-	VN_BHV_QM,		/* quota manager */
-	VN_BHV_IO,		/* IO path */
-	VN_BHV_END		/* housekeeping end-of-range */
-} vn_bhv_t;
-
-#define VNODE_POSITION_XFS	(VNODE_POSITION_BASE)
-#define VNODE_POSITION_DM	(VNODE_POSITION_BASE+10)
-#define VNODE_POSITION_QM	(VNODE_POSITION_BASE+20)
-#define VNODE_POSITION_IO	(VNODE_POSITION_BASE+30)
-
-/*
- * Macros for dealing with the behavior descriptor inside of the vnode.
- */
-#define BHV_TO_VNODE(bdp)	((vnode_t *)BHV_VOBJ(bdp))
-#define BHV_TO_VNODE_NULL(bdp)	((vnode_t *)BHV_VOBJNULL(bdp))
-
-#define VN_BHV_HEAD(vp)			((bhv_head_t *)(&((vp)->v_bh)))
-#define vn_bhv_head_init(bhp,name)	bhv_head_init(bhp,name)
-#define vn_bhv_remove(bhp,bdp)		bhv_remove(bhp,bdp)
-#define vn_bhv_lookup(bhp,ops)		bhv_lookup(bhp,ops)
-#define vn_bhv_lookup_unlocked(bhp,ops) bhv_lookup_unlocked(bhp,ops)
-
-/*
- * Vnode to Linux inode mapping.
- */
-#define LINVFS_GET_VP(inode)	((vnode_t *)list_entry(inode, vnode_t, v_inode))
-#define LINVFS_GET_IP(vp)	(&(vp)->v_inode)
-
-/*
- * Vnode flags.
- */
-#define VMODIFIED	       0x8	/* XFS inode state possibly differs */
-					/* to the Linux inode state.	*/
-
-/*
- * Values for the VOP_RWLOCK and VOP_RWUNLOCK flags parameter.
- */
-typedef enum vrwlock {
-	VRWLOCK_NONE,
-	VRWLOCK_READ,
-	VRWLOCK_WRITE,
-	VRWLOCK_WRITE_DIRECT,
-	VRWLOCK_TRY_READ,
-	VRWLOCK_TRY_WRITE
-} vrwlock_t;
-
-/*
- * Return values for VOP_INACTIVE.  A return value of
- * VN_INACTIVE_NOCACHE implies that the file system behavior
- * has disassociated its state and bhv_desc_t from the vnode.
- */
-#define	VN_INACTIVE_CACHE	0
-#define	VN_INACTIVE_NOCACHE	1
-
-/*
- * Values for the cmd code given to VOP_VNODE_CHANGE.
- */
-typedef enum vchange {
-	VCHANGE_FLAGS_FRLOCKS		= 0,
-	VCHANGE_FLAGS_ENF_LOCKING	= 1,
-	VCHANGE_FLAGS_TRUNCATED		= 2,
-	VCHANGE_FLAGS_PAGE_DIRTY	= 3,
-	VCHANGE_FLAGS_IOEXCL_COUNT	= 4
-} vchange_t;
-
-
-typedef int	(*vop_open_t)(bhv_desc_t *, struct cred *);
-typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *,
-				const struct iovec *, unsigned int,
-				loff_t *, int, struct cred *);
-typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *,
-				const struct iovec *, unsigned int,
-				loff_t *, int, struct cred *);
-typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *,
-				loff_t *, int, size_t, read_actor_t,
-				void *, struct cred *);
-typedef int	(*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *,
-				int, unsigned int, void __user *);
-typedef int	(*vop_getattr_t)(bhv_desc_t *, struct vattr *, int,
-				struct cred *);
-typedef int	(*vop_setattr_t)(bhv_desc_t *, struct vattr *, int,
-				struct cred *);
-typedef int	(*vop_access_t)(bhv_desc_t *, int, struct cred *);
-typedef int	(*vop_lookup_t)(bhv_desc_t *, vname_t *, vnode_t **,
-				int, vnode_t *, struct cred *);
-typedef int	(*vop_create_t)(bhv_desc_t *, vname_t *, struct vattr *,
-				vnode_t **, struct cred *);
-typedef int	(*vop_remove_t)(bhv_desc_t *, vname_t *, struct cred *);
-typedef int	(*vop_link_t)(bhv_desc_t *, vnode_t *, vname_t *,
-				struct cred *);
-typedef int	(*vop_rename_t)(bhv_desc_t *, vname_t *, vnode_t *, vname_t *,
-				struct cred *);
-typedef int	(*vop_mkdir_t)(bhv_desc_t *, vname_t *, struct vattr *,
-				vnode_t **, struct cred *);
-typedef int	(*vop_rmdir_t)(bhv_desc_t *, vname_t *, struct cred *);
-typedef int	(*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *,
-				int *);
-typedef int	(*vop_symlink_t)(bhv_desc_t *, vname_t *, struct vattr *,
-				char *, vnode_t **, struct cred *);
-typedef int	(*vop_readlink_t)(bhv_desc_t *, struct uio *, int,
-				struct cred *);
-typedef int	(*vop_fsync_t)(bhv_desc_t *, int, struct cred *,
-				xfs_off_t, xfs_off_t);
-typedef int	(*vop_inactive_t)(bhv_desc_t *, struct cred *);
-typedef int	(*vop_fid2_t)(bhv_desc_t *, struct fid *);
-typedef int	(*vop_release_t)(bhv_desc_t *);
-typedef int	(*vop_rwlock_t)(bhv_desc_t *, vrwlock_t);
-typedef void	(*vop_rwunlock_t)(bhv_desc_t *, vrwlock_t);
-typedef int	(*vop_bmap_t)(bhv_desc_t *, xfs_off_t, ssize_t, int,
-				struct xfs_iomap *, int *);
-typedef int	(*vop_reclaim_t)(bhv_desc_t *);
-typedef int	(*vop_attr_get_t)(bhv_desc_t *, const char *, char *, int *,
-				int, struct cred *);
-typedef	int	(*vop_attr_set_t)(bhv_desc_t *, const char *, char *, int,
-				int, struct cred *);
-typedef	int	(*vop_attr_remove_t)(bhv_desc_t *, const char *,
-				int, struct cred *);
-typedef	int	(*vop_attr_list_t)(bhv_desc_t *, char *, int, int,
-				struct attrlist_cursor_kern *, struct cred *);
-typedef void	(*vop_link_removed_t)(bhv_desc_t *, vnode_t *, int);
-typedef void	(*vop_vnode_change_t)(bhv_desc_t *, vchange_t, __psint_t);
-typedef void	(*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
-typedef void	(*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
-typedef int	(*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t,
-				uint64_t, int);
-typedef int	(*vop_iflush_t)(bhv_desc_t *, int);
-
-
-typedef struct vnodeops {
-	bhv_position_t  vn_position;    /* position within behavior chain */
-	vop_open_t		vop_open;
-	vop_read_t		vop_read;
-	vop_write_t		vop_write;
-	vop_sendfile_t		vop_sendfile;
-	vop_ioctl_t		vop_ioctl;
-	vop_getattr_t		vop_getattr;
-	vop_setattr_t		vop_setattr;
-	vop_access_t		vop_access;
-	vop_lookup_t		vop_lookup;
-	vop_create_t		vop_create;
-	vop_remove_t		vop_remove;
-	vop_link_t		vop_link;
-	vop_rename_t		vop_rename;
-	vop_mkdir_t		vop_mkdir;
-	vop_rmdir_t		vop_rmdir;
-	vop_readdir_t		vop_readdir;
-	vop_symlink_t		vop_symlink;
-	vop_readlink_t		vop_readlink;
-	vop_fsync_t		vop_fsync;
-	vop_inactive_t		vop_inactive;
-	vop_fid2_t		vop_fid2;
-	vop_rwlock_t		vop_rwlock;
-	vop_rwunlock_t		vop_rwunlock;
-	vop_bmap_t		vop_bmap;
-	vop_reclaim_t		vop_reclaim;
-	vop_attr_get_t		vop_attr_get;
-	vop_attr_set_t		vop_attr_set;
-	vop_attr_remove_t	vop_attr_remove;
-	vop_attr_list_t		vop_attr_list;
-	vop_link_removed_t	vop_link_removed;
-	vop_vnode_change_t	vop_vnode_change;
-	vop_ptossvp_t		vop_tosspages;
-	vop_pflushinvalvp_t	vop_flushinval_pages;
-	vop_pflushvp_t		vop_flush_pages;
-	vop_release_t		vop_release;
-	vop_iflush_t		vop_iflush;
-} vnodeops_t;
-
-/*
- * VOP's.
- */
-#define _VOP_(op, vp)	(*((vnodeops_t *)(vp)->v_fops)->op)
-
-#define VOP_READ(vp,file,iov,segs,offset,ioflags,cr,rv)			\
-	rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
-#define VOP_WRITE(vp,file,iov,segs,offset,ioflags,cr,rv)		\
-	rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
-#define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv)		\
-	rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr)
-#define VOP_BMAP(vp,of,sz,rw,b,n,rv)					\
-	rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n)
-#define VOP_OPEN(vp, cr, rv)						\
-	rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr)
-#define VOP_GETATTR(vp, vap, f, cr, rv)					\
-	rv = _VOP_(vop_getattr, vp)((vp)->v_fbhv, vap, f, cr)
-#define	VOP_SETATTR(vp, vap, f, cr, rv)					\
-	rv = _VOP_(vop_setattr, vp)((vp)->v_fbhv, vap, f, cr)
-#define	VOP_ACCESS(vp, mode, cr, rv)					\
-	rv = _VOP_(vop_access, vp)((vp)->v_fbhv, mode, cr)
-#define	VOP_LOOKUP(vp,d,vpp,f,rdir,cr,rv)				\
-	rv = _VOP_(vop_lookup, vp)((vp)->v_fbhv,d,vpp,f,rdir,cr)
-#define VOP_CREATE(dvp,d,vap,vpp,cr,rv)					\
-	rv = _VOP_(vop_create, dvp)((dvp)->v_fbhv,d,vap,vpp,cr)
-#define VOP_REMOVE(dvp,d,cr,rv)						\
-	rv = _VOP_(vop_remove, dvp)((dvp)->v_fbhv,d,cr)
-#define	VOP_LINK(tdvp,fvp,d,cr,rv)					\
-	rv = _VOP_(vop_link, tdvp)((tdvp)->v_fbhv,fvp,d,cr)
-#define	VOP_RENAME(fvp,fnm,tdvp,tnm,cr,rv)				\
-	rv = _VOP_(vop_rename, fvp)((fvp)->v_fbhv,fnm,tdvp,tnm,cr)
-#define	VOP_MKDIR(dp,d,vap,vpp,cr,rv)					\
-	rv = _VOP_(vop_mkdir, dp)((dp)->v_fbhv,d,vap,vpp,cr)
-#define	VOP_RMDIR(dp,d,cr,rv)	 					\
-	rv = _VOP_(vop_rmdir, dp)((dp)->v_fbhv,d,cr)
-#define	VOP_READDIR(vp,uiop,cr,eofp,rv)					\
-	rv = _VOP_(vop_readdir, vp)((vp)->v_fbhv,uiop,cr,eofp)
-#define	VOP_SYMLINK(dvp,d,vap,tnm,vpp,cr,rv)				\
-	rv = _VOP_(vop_symlink, dvp) ((dvp)->v_fbhv,d,vap,tnm,vpp,cr)
-#define	VOP_READLINK(vp,uiop,fl,cr,rv)					\
-	rv = _VOP_(vop_readlink, vp)((vp)->v_fbhv,uiop,fl,cr)
-#define	VOP_FSYNC(vp,f,cr,b,e,rv)					\
-	rv = _VOP_(vop_fsync, vp)((vp)->v_fbhv,f,cr,b,e)
-#define VOP_INACTIVE(vp, cr, rv)					\
-	rv = _VOP_(vop_inactive, vp)((vp)->v_fbhv, cr)
-#define VOP_RELEASE(vp, rv)						\
-	rv = _VOP_(vop_release, vp)((vp)->v_fbhv)
-#define VOP_FID2(vp, fidp, rv)						\
-	rv = _VOP_(vop_fid2, vp)((vp)->v_fbhv, fidp)
-#define VOP_RWLOCK(vp,i)						\
-	(void)_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
-#define VOP_RWLOCK_TRY(vp,i)						\
-	_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
-#define VOP_RWUNLOCK(vp,i)						\
-	(void)_VOP_(vop_rwunlock, vp)((vp)->v_fbhv, i)
-#define VOP_FRLOCK(vp,c,fl,flags,offset,fr,rv)				\
-	rv = _VOP_(vop_frlock, vp)((vp)->v_fbhv,c,fl,flags,offset,fr)
-#define VOP_RECLAIM(vp, rv)						\
-	rv = _VOP_(vop_reclaim, vp)((vp)->v_fbhv)
-#define VOP_ATTR_GET(vp, name, val, vallenp, fl, cred, rv)		\
-	rv = _VOP_(vop_attr_get, vp)((vp)->v_fbhv,name,val,vallenp,fl,cred)
-#define	VOP_ATTR_SET(vp, name, val, vallen, fl, cred, rv)		\
-	rv = _VOP_(vop_attr_set, vp)((vp)->v_fbhv,name,val,vallen,fl,cred)
-#define	VOP_ATTR_REMOVE(vp, name, flags, cred, rv)			\
-	rv = _VOP_(vop_attr_remove, vp)((vp)->v_fbhv,name,flags,cred)
-#define	VOP_ATTR_LIST(vp, buf, buflen, fl, cursor, cred, rv)		\
-	rv = _VOP_(vop_attr_list, vp)((vp)->v_fbhv,buf,buflen,fl,cursor,cred)
-#define VOP_LINK_REMOVED(vp, dvp, linkzero)				\
-	(void)_VOP_(vop_link_removed, vp)((vp)->v_fbhv, dvp, linkzero)
-#define VOP_VNODE_CHANGE(vp, cmd, val)					\
-	(void)_VOP_(vop_vnode_change, vp)((vp)->v_fbhv,cmd,val)
-/*
- * These are page cache functions that now go thru VOPs.
- * 'last' parameter is unused and left in for IRIX compatibility
- */
-#define VOP_TOSS_PAGES(vp, first, last, fiopt)				\
-	_VOP_(vop_tosspages, vp)((vp)->v_fbhv,first, last, fiopt)
-/*
- * 'last' parameter is unused and left in for IRIX compatibility
- */
-#define VOP_FLUSHINVAL_PAGES(vp, first, last, fiopt)			\
-	_VOP_(vop_flushinval_pages, vp)((vp)->v_fbhv,first,last,fiopt)
-/*
- * 'last' parameter is unused and left in for IRIX compatibility
- */
-#define VOP_FLUSH_PAGES(vp, first, last, flags, fiopt, rv)		\
-	rv = _VOP_(vop_flush_pages, vp)((vp)->v_fbhv,first,last,flags,fiopt)
-#define VOP_IOCTL(vp, inode, filp, fl, cmd, arg, rv)			\
-	rv = _VOP_(vop_ioctl, vp)((vp)->v_fbhv,inode,filp,fl,cmd,arg)
-#define VOP_IFLUSH(vp, flags, rv)					\
-	rv = _VOP_(vop_iflush, vp)((vp)->v_fbhv, flags)
-
-/*
- * Flags for read/write calls - same values as IRIX
- */
-#define IO_ISAIO	0x00001		/* don't wait for completion */
-#define IO_ISDIRECT	0x00004		/* bypass page cache */
-#define IO_INVIS	0x00020		/* don't update inode timestamps */
-
-/*
- * Flags for VOP_IFLUSH call
- */
-#define FLUSH_SYNC		1	/* wait for flush to complete	*/
-#define FLUSH_INODE		2	/* flush the inode itself	*/
-#define FLUSH_LOG		4	/* force the last log entry for
-					 * this inode out to disk	*/
-
-/*
- * Flush/Invalidate options for VOP_TOSS_PAGES, VOP_FLUSHINVAL_PAGES and
- *	VOP_FLUSH_PAGES.
- */
-#define FI_NONE			0	/* none */
-#define FI_REMAPF		1	/* Do a remapf prior to the operation */
-#define FI_REMAPF_LOCKED	2	/* Do a remapf prior to the operation.
-					   Prevent VM access to the pages until
-					   the operation completes. */
-
-/*
- * Vnode attributes.  va_mask indicates those attributes the caller
- * wants to set or extract.
- */
-typedef struct vattr {
-	int		va_mask;	/* bit-mask of attributes present */
-	mode_t		va_mode;	/* file access mode and type */
-	xfs_nlink_t	va_nlink;	/* number of references to file */
-	uid_t		va_uid;		/* owner user id */
-	gid_t		va_gid;		/* owner group id */
-	xfs_ino_t	va_nodeid;	/* file id */
-	xfs_off_t	va_size;	/* file size in bytes */
-	u_long		va_blocksize;	/* blocksize preferred for i/o */
-	struct timespec	va_atime;	/* time of last access */
-	struct timespec	va_mtime;	/* time of last modification */
-	struct timespec	va_ctime;	/* time file changed */
-	u_int		va_gen;		/* generation number of file */
-	xfs_dev_t	va_rdev;	/* device the special file represents */
-	__int64_t	va_nblocks;	/* number of blocks allocated */
-	u_long		va_xflags;	/* random extended file flags */
-	u_long		va_extsize;	/* file extent size */
-	u_long		va_nextents;	/* number of extents in file */
-	u_long		va_anextents;	/* number of attr extents in file */
-	prid_t		va_projid;	/* project id */
-} vattr_t;
-
-/*
- * setattr or getattr attributes
- */
-#define XFS_AT_TYPE		0x00000001
-#define XFS_AT_MODE		0x00000002
-#define XFS_AT_UID		0x00000004
-#define XFS_AT_GID		0x00000008
-#define XFS_AT_FSID		0x00000010
-#define XFS_AT_NODEID		0x00000020
-#define XFS_AT_NLINK		0x00000040
-#define XFS_AT_SIZE		0x00000080
-#define XFS_AT_ATIME		0x00000100
-#define XFS_AT_MTIME		0x00000200
-#define XFS_AT_CTIME		0x00000400
-#define XFS_AT_RDEV		0x00000800
-#define XFS_AT_BLKSIZE		0x00001000
-#define XFS_AT_NBLOCKS		0x00002000
-#define XFS_AT_VCODE		0x00004000
-#define XFS_AT_MAC		0x00008000
-#define XFS_AT_UPDATIME		0x00010000
-#define XFS_AT_UPDMTIME		0x00020000
-#define XFS_AT_UPDCTIME		0x00040000
-#define XFS_AT_ACL		0x00080000
-#define XFS_AT_CAP		0x00100000
-#define XFS_AT_INF		0x00200000
-#define XFS_AT_XFLAGS		0x00400000
-#define XFS_AT_EXTSIZE		0x00800000
-#define XFS_AT_NEXTENTS		0x01000000
-#define XFS_AT_ANEXTENTS	0x02000000
-#define XFS_AT_PROJID		0x04000000
-#define XFS_AT_SIZE_NOPERM	0x08000000
-#define XFS_AT_GENCOUNT		0x10000000
-
-#define XFS_AT_ALL	(XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\
-		XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\
-		XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\
-		XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|XFS_AT_MAC|\
-		XFS_AT_ACL|XFS_AT_CAP|XFS_AT_INF|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|\
-		XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_PROJID|XFS_AT_GENCOUNT)
-
-#define XFS_AT_STAT	(XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\
-		XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\
-		XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\
-		XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_PROJID)
-
-#define XFS_AT_TIMES	(XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME)
-
-#define XFS_AT_UPDTIMES	(XFS_AT_UPDATIME|XFS_AT_UPDMTIME|XFS_AT_UPDCTIME)
-
-#define XFS_AT_NOSET	(XFS_AT_NLINK|XFS_AT_RDEV|XFS_AT_FSID|XFS_AT_NODEID|\
-		XFS_AT_TYPE|XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|\
-		XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_GENCOUNT)
-
-/*
- *  Modes.
- */
-#define VSUID	S_ISUID		/* set user id on execution */
-#define VSGID	S_ISGID		/* set group id on execution */
-#define VSVTX	S_ISVTX		/* save swapped text even after use */
-#define VREAD	S_IRUSR		/* read, write, execute permissions */
-#define VWRITE	S_IWUSR
-#define VEXEC	S_IXUSR
-
-#define MODEMASK S_IALLUGO	/* mode bits plus permission bits */
-
-/*
- * Check whether mandatory file locking is enabled.
- */
-#define MANDLOCK(vp, mode)	\
-	(VN_ISREG(vp) && ((mode) & (VSGID|(VEXEC>>3))) == VSGID)
-
-extern void	vn_init(void);
-extern vnode_t	*vn_initialize(struct inode *);
-
-/*
- * vnode_map structures _must_ match vn_epoch and vnode structure sizes.
- */
-typedef struct vnode_map {
-	vfs_t		*v_vfsp;
-	vnumber_t	v_number;		/* in-core vnode number */
-	xfs_ino_t	v_ino;			/* inode #	*/
-} vmap_t;
-
-#define VMAP(vp, vmap)	{(vmap).v_vfsp	 = (vp)->v_vfsp,	\
-			 (vmap).v_number = (vp)->v_number,	\
-			 (vmap).v_ino	 = (vp)->v_inode.i_ino; }
-
-extern int	vn_revalidate(struct vnode *);
-extern void	vn_revalidate_core(struct vnode *, vattr_t *);
-
-extern void	vn_iowait(struct vnode *vp);
-extern void	vn_iowake(struct vnode *vp);
-
-static inline int vn_count(struct vnode *vp)
-{
-	return atomic_read(&LINVFS_GET_IP(vp)->i_count);
-}
-
-/*
- * Vnode reference counting functions (and macros for compatibility).
- */
-extern vnode_t	*vn_hold(struct vnode *);
-
-#if defined(XFS_VNODE_TRACE)
-#define VN_HOLD(vp)		\
-	((void)vn_hold(vp),	\
-	  vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address))
-#define VN_RELE(vp)		\
-	  (vn_trace_rele(vp, __FILE__, __LINE__, (inst_t *)__return_address), \
-	   iput(LINVFS_GET_IP(vp)))
-#else
-#define VN_HOLD(vp)		((void)vn_hold(vp))
-#define VN_RELE(vp)		(iput(LINVFS_GET_IP(vp)))
-#endif
-
-static inline struct vnode *vn_grab(struct vnode *vp)
-{
-	struct inode *inode = igrab(LINVFS_GET_IP(vp));
-	return inode ? LINVFS_GET_VP(inode) : NULL;
-}
-
-/*
- * Vname handling macros.
- */
-#define VNAME(dentry)		((char *) (dentry)->d_name.name)
-#define VNAMELEN(dentry)	((dentry)->d_name.len)
-#define VNAME_TO_VNODE(dentry)	(LINVFS_GET_VP((dentry)->d_inode))
-
-/*
- * Vnode spinlock manipulation.
- */
-#define VN_LOCK(vp)		mutex_spinlock(&(vp)->v_lock)
-#define VN_UNLOCK(vp, s)	mutex_spinunlock(&(vp)->v_lock, s)
-#define VN_FLAGSET(vp,b)	vn_flagset(vp,b)
-#define VN_FLAGCLR(vp,b)	vn_flagclr(vp,b)
-
-static __inline__ void vn_flagset(struct vnode *vp, uint flag)
-{
-	spin_lock(&vp->v_lock);
-	vp->v_flag |= flag;
-	spin_unlock(&vp->v_lock);
-}
-
-static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
-{
-	spin_lock(&vp->v_lock);
-	vp->v_flag &= ~flag;
-	spin_unlock(&vp->v_lock);
-}
-
-/*
- * Dealing with bad inodes
- */
-static inline void vn_mark_bad(struct vnode *vp)
-{
-	make_bad_inode(LINVFS_GET_IP(vp));
-}
-
-static inline int VN_BAD(struct vnode *vp)
-{
-	return is_bad_inode(LINVFS_GET_IP(vp));
-}
-
-/*
- * Extracting atime values in various formats
- */
-static inline void vn_atime_to_bstime(struct vnode *vp, xfs_bstime_t *bs_atime)
-{
-	bs_atime->tv_sec = vp->v_inode.i_atime.tv_sec;
-	bs_atime->tv_nsec = vp->v_inode.i_atime.tv_nsec;
-}
-
-static inline void vn_atime_to_timespec(struct vnode *vp, struct timespec *ts)
-{
-	*ts = vp->v_inode.i_atime;
-}
-
-static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
-{
-	*tt = vp->v_inode.i_atime.tv_sec;
-}
-
-/*
- * Some useful predicates.
- */
-#define VN_MAPPED(vp)	mapping_mapped(LINVFS_GET_IP(vp)->i_mapping)
-#define VN_CACHED(vp)	(LINVFS_GET_IP(vp)->i_mapping->nrpages)
-#define VN_DIRTY(vp)	mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \
-					PAGECACHE_TAG_DIRTY)
-#define VMODIFY(vp)	VN_FLAGSET(vp, VMODIFIED)
-#define VUNMODIFY(vp)	VN_FLAGCLR(vp, VMODIFIED)
-
-/*
- * Flags to VOP_SETATTR/VOP_GETATTR.
- */
-#define	ATTR_UTIME	0x01	/* non-default utime(2) request */
-#define	ATTR_DMI	0x08	/* invocation from a DMI function */
-#define	ATTR_LAZY	0x80	/* set/get attributes lazily */
-#define	ATTR_NONBLOCK	0x100	/* return EAGAIN if operation would block */
-#define ATTR_NOLOCK	0x200	/* Don't grab any conflicting locks */
-#define ATTR_NOSIZETOK	0x400	/* Don't get the SIZE token */
-
-/*
- * Flags to VOP_FSYNC and VOP_RECLAIM.
- */
-#define FSYNC_NOWAIT	0	/* asynchronous flush */
-#define FSYNC_WAIT	0x1	/* synchronous fsync or forced reclaim */
-#define FSYNC_INVAL	0x2	/* flush and invalidate cached data */
-#define FSYNC_DATA	0x4	/* synchronous fsync of data only */
-
-/*
- * Tracking vnode activity.
- */
-#if defined(XFS_VNODE_TRACE)
-
-#define	VNODE_TRACE_SIZE	16		/* number of trace entries */
-#define	VNODE_KTRACE_ENTRY	1
-#define	VNODE_KTRACE_EXIT	2
-#define	VNODE_KTRACE_HOLD	3
-#define	VNODE_KTRACE_REF	4
-#define	VNODE_KTRACE_RELE	5
-
-extern void vn_trace_entry(struct vnode *, const char *, inst_t *);
-extern void vn_trace_exit(struct vnode *, const char *, inst_t *);
-extern void vn_trace_hold(struct vnode *, char *, int, inst_t *);
-extern void vn_trace_ref(struct vnode *, char *, int, inst_t *);
-extern void vn_trace_rele(struct vnode *, char *, int, inst_t *);
-
-#define	VN_TRACE(vp)		\
-	vn_trace_ref(vp, __FILE__, __LINE__, (inst_t *)__return_address)
-#else
-#define	vn_trace_entry(a,b,c)
-#define	vn_trace_exit(a,b,c)
-#define	vn_trace_hold(a,b,c,d)
-#define	vn_trace_ref(a,b,c,d)
-#define	vn_trace_rele(a,b,c,d)
-#define	VN_TRACE(vp)
-#endif
-
-#endif	/* __XFS_VNODE_H__ */
diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/mrlock.h
index 16b44c3c236..e3c92d19e54 100644
--- a/fs/xfs/linux-2.6/mrlock.h
+++ b/fs/xfs/mrlock.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -20,29 +20,35 @@
 
 #include <linux/rwsem.h>
 
-enum { MR_NONE, MR_ACCESS, MR_UPDATE };
-
 typedef struct {
 	struct rw_semaphore	mr_lock;
+#if defined(DEBUG) || defined(XFS_WARN)
 	int			mr_writer;
+#endif
 } mrlock_t;
 
+#if defined(DEBUG) || defined(XFS_WARN)
+#define mrinit(mrp, name)	\
+	do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
+#else
 #define mrinit(mrp, name)	\
-	( (mrp)->mr_writer = 0, init_rwsem(&(mrp)->mr_lock) )
+	do { init_rwsem(&(mrp)->mr_lock); } while (0)
+#endif
+
 #define mrlock_init(mrp, t,n,s)	mrinit(mrp, n)
 #define mrfree(mrp)		do { } while (0)
-#define mraccess(mrp)		mraccessf(mrp, 0)
-#define mrupdate(mrp)		mrupdatef(mrp, 0)
 
-static inline void mraccessf(mrlock_t *mrp, int flags)
+static inline void mraccess_nested(mrlock_t *mrp, int subclass)
 {
-	down_read(&mrp->mr_lock);
+	down_read_nested(&mrp->mr_lock, subclass);
 }
 
-static inline void mrupdatef(mrlock_t *mrp, int flags)
+static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
 {
-	down_write(&mrp->mr_lock);
+	down_write_nested(&mrp->mr_lock, subclass);
+#if defined(DEBUG) || defined(XFS_WARN)
 	mrp->mr_writer = 1;
+#endif
 }
 
 static inline int mrtryaccess(mrlock_t *mrp)
@@ -54,39 +60,31 @@ static inline int mrtryupdate(mrlock_t *mrp)
 {
 	if (!down_write_trylock(&mrp->mr_lock))
 		return 0;
+#if defined(DEBUG) || defined(XFS_WARN)
 	mrp->mr_writer = 1;
+#endif
 	return 1;
 }
 
-static inline void mrunlock(mrlock_t *mrp)
+static inline void mrunlock_excl(mrlock_t *mrp)
 {
-	if (mrp->mr_writer) {
-		mrp->mr_writer = 0;
-		up_write(&mrp->mr_lock);
-	} else {
-		up_read(&mrp->mr_lock);
-	}
+#if defined(DEBUG) || defined(XFS_WARN)
+	mrp->mr_writer = 0;
+#endif
+	up_write(&mrp->mr_lock);
 }
 
-static inline void mrdemote(mrlock_t *mrp)
+static inline void mrunlock_shared(mrlock_t *mrp)
 {
-	mrp->mr_writer = 0;
-	downgrade_write(&mrp->mr_lock);
+	up_read(&mrp->mr_lock);
 }
 
-#ifdef DEBUG
-/*
- * Debug-only routine, without some platform-specific asm code, we can
- * now only answer requests regarding whether we hold the lock for write
- * (reader state is outside our visibility, we only track writer state).
- * Note: means !ismrlocked would give false positivies, so don't do that.
- */
-static inline int ismrlocked(mrlock_t *mrp, int type)
+static inline void mrdemote(mrlock_t *mrp)
 {
-	if (mrp && type == MR_UPDATE)
-		return mrp->mr_writer;
-	return 1;
-}
+#if defined(DEBUG) || defined(XFS_WARN)
+	mrp->mr_writer = 0;
 #endif
+	downgrade_write(&mrp->mr_lock);
+}
 
 #endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
deleted file mode 100644
index 772ac48329e..00000000000
--- a/fs/xfs/quota/xfs_dquot.c
+++ /dev/null
@@ -1,1601 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_trans_space.h"
-#include "xfs_trans_priv.h"
-#include "xfs_qm.h"
-
-
-/*
-   LOCK ORDER
-
-   inode lock		    (ilock)
-   dquot hash-chain lock    (hashlock)
-   xqm dquot freelist lock  (freelistlock
-   mount's dquot list lock  (mplistlock)
-   user dquot lock - lock ordering among dquots is based on the uid or gid
-   group dquot lock - similar to udquots. Between the two dquots, the udquot
-		      has to be locked first.
-   pin lock - the dquot lock must be held to take this lock.
-   flush lock - ditto.
-*/
-
-STATIC void		xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *);
-
-#ifdef DEBUG
-xfs_buftarg_t *xfs_dqerror_target;
-int xfs_do_dqerror;
-int xfs_dqreq_num;
-int xfs_dqerror_mod = 33;
-#endif
-
-/*
- * Allocate and initialize a dquot. We don't always allocate fresh memory;
- * we try to reclaim a free dquot if the number of incore dquots are above
- * a threshold.
- * The only field inside the core that gets initialized at this point
- * is the d_id field. The idea is to fill in the entire q_core
- * when we read in the on disk dquot.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqinit(
-	xfs_mount_t  *mp,
-	xfs_dqid_t   id,
-	uint	     type)
-{
-	xfs_dquot_t	*dqp;
-	boolean_t	brandnewdquot;
-
-	brandnewdquot = xfs_qm_dqalloc_incore(&dqp);
-	dqp->dq_flags = type;
-	dqp->q_core.d_id = cpu_to_be32(id);
-	dqp->q_mount = mp;
-
-	/*
-	 * No need to re-initialize these if this is a reclaimed dquot.
-	 */
-	if (brandnewdquot) {
-		dqp->dq_flnext = dqp->dq_flprev = dqp;
-		mutex_init(&dqp->q_qlock);
-		initnsema(&dqp->q_flock, 1, "fdq");
-		sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
-
-#ifdef XFS_DQUOT_TRACE
-		dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP);
-		xfs_dqtrace_entry(dqp, "DQINIT");
-#endif
-	} else {
-		/*
-		 * Only the q_core portion was zeroed in dqreclaim_one().
-		 * So, we need to reset others.
-		 */
-		 dqp->q_nrefs = 0;
-		 dqp->q_blkno = 0;
-		 dqp->MPL_NEXT = dqp->HL_NEXT = NULL;
-		 dqp->HL_PREVP = dqp->MPL_PREVP = NULL;
-		 dqp->q_bufoffset = 0;
-		 dqp->q_fileoffset = 0;
-		 dqp->q_transp = NULL;
-		 dqp->q_gdquot = NULL;
-		 dqp->q_res_bcount = 0;
-		 dqp->q_res_icount = 0;
-		 dqp->q_res_rtbcount = 0;
-		 dqp->q_pincount = 0;
-		 dqp->q_hash = NULL;
-		 ASSERT(dqp->dq_flnext == dqp->dq_flprev);
-
-#ifdef XFS_DQUOT_TRACE
-		 ASSERT(dqp->q_trace);
-		 xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT");
-#endif
-	 }
-
-	/*
-	 * log item gets initialized later
-	 */
-	return (dqp);
-}
-
-/*
- * This is called to free all the memory associated with a dquot
- */
-void
-xfs_qm_dqdestroy(
-	xfs_dquot_t	*dqp)
-{
-	ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp));
-
-	mutex_destroy(&dqp->q_qlock);
-	freesema(&dqp->q_flock);
-	sv_destroy(&dqp->q_pinwait);
-
-#ifdef XFS_DQUOT_TRACE
-	if (dqp->q_trace)
-	     ktrace_free(dqp->q_trace);
-	dqp->q_trace = NULL;
-#endif
-	kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
-	atomic_dec(&xfs_Gqm->qm_totaldquots);
-}
-
-/*
- * This is what a 'fresh' dquot inside a dquot chunk looks like on disk.
- */
-STATIC void
-xfs_qm_dqinit_core(
-	xfs_dqid_t	id,
-	uint		type,
-	xfs_dqblk_t	*d)
-{
-	/*
-	 * Caller has zero'd the entire dquot 'chunk' already.
-	 */
-	d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
-	d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
-	d->dd_diskdq.d_id = cpu_to_be32(id);
-	d->dd_diskdq.d_flags = type;
-}
-
-
-#ifdef XFS_DQUOT_TRACE
-/*
- * Dquot tracing for debugging.
- */
-/* ARGSUSED */
-void
-__xfs_dqtrace_entry(
-	xfs_dquot_t	*dqp,
-	char		*func,
-	void		*retaddr,
-	xfs_inode_t	*ip)
-{
-	xfs_dquot_t	*udqp = NULL;
-	xfs_ino_t	ino = 0;
-
-	ASSERT(dqp->q_trace);
-	if (ip) {
-		ino = ip->i_ino;
-		udqp = ip->i_udquot;
-	}
-	ktrace_enter(dqp->q_trace,
-		     (void *)(__psint_t)DQUOT_KTRACE_ENTRY,
-		     (void *)func,
-		     (void *)(__psint_t)dqp->q_nrefs,
-		     (void *)(__psint_t)dqp->dq_flags,
-		     (void *)(__psint_t)dqp->q_res_bcount,
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_bcount),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_icount),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_blk_hardlimit),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_blk_softlimit),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_ino_hardlimit),
-		     (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_ino_softlimit),
-		     (void *)(__psint_t)be32_to_cpu(dqp->q_core.d_id),
-		     (void *)(__psint_t)current_pid(),
-		     (void *)(__psint_t)ino,
-		     (void *)(__psint_t)retaddr,
-		     (void *)(__psint_t)udqp);
-	return;
-}
-#endif
-
-
-/*
- * If default limits are in force, push them into the dquot now.
- * We overwrite the dquot limits only if they are zero and this
- * is not the root dquot.
- */
-void
-xfs_qm_adjust_dqlimits(
-	xfs_mount_t		*mp,
-	xfs_disk_dquot_t	*d)
-{
-	xfs_quotainfo_t		*q = mp->m_quotainfo;
-
-	ASSERT(d->d_id);
-
-	if (q->qi_bsoftlimit && !d->d_blk_softlimit)
-		d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit);
-	if (q->qi_bhardlimit && !d->d_blk_hardlimit)
-		d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit);
-	if (q->qi_isoftlimit && !d->d_ino_softlimit)
-		d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit);
-	if (q->qi_ihardlimit && !d->d_ino_hardlimit)
-		d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit);
-	if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit)
-		d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit);
-	if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
-		d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit);
-}
-
-/*
- * Check the limits and timers of a dquot and start or reset timers
- * if necessary.
- * This gets called even when quota enforcement is OFF, which makes our
- * life a little less complicated. (We just don't reject any quota
- * reservations in that case, when enforcement is off).
- * We also return 0 as the values of the timers in Q_GETQUOTA calls, when
- * enforcement's off.
- * In contrast, warnings are a little different in that they don't
- * 'automatically' get started when limits get exceeded.  They do
- * get reset to zero, however, when we find the count to be under
- * the soft limit (they are only ever set non-zero via userspace).
- */
-void
-xfs_qm_adjust_dqtimers(
-	xfs_mount_t		*mp,
-	xfs_disk_dquot_t	*d)
-{
-	ASSERT(d->d_id);
-
-#ifdef QUOTADEBUG
-	if (d->d_blk_hardlimit)
-		ASSERT(be64_to_cpu(d->d_blk_softlimit) <=
-		       be64_to_cpu(d->d_blk_hardlimit));
-	if (d->d_ino_hardlimit)
-		ASSERT(be64_to_cpu(d->d_ino_softlimit) <=
-		       be64_to_cpu(d->d_ino_hardlimit));
-	if (d->d_rtb_hardlimit)
-		ASSERT(be64_to_cpu(d->d_rtb_softlimit) <=
-		       be64_to_cpu(d->d_rtb_hardlimit));
-#endif
-	if (!d->d_btimer) {
-		if ((d->d_blk_softlimit &&
-		     (be64_to_cpu(d->d_bcount) >=
-		      be64_to_cpu(d->d_blk_softlimit))) ||
-		    (d->d_blk_hardlimit &&
-		     (be64_to_cpu(d->d_bcount) >=
-		      be64_to_cpu(d->d_blk_hardlimit)))) {
-			d->d_btimer = cpu_to_be32(get_seconds() +
-					XFS_QI_BTIMELIMIT(mp));
-		} else {
-			d->d_bwarns = 0;
-		}
-	} else {
-		if ((!d->d_blk_softlimit ||
-		     (be64_to_cpu(d->d_bcount) <
-		      be64_to_cpu(d->d_blk_softlimit))) &&
-		    (!d->d_blk_hardlimit ||
-		    (be64_to_cpu(d->d_bcount) <
-		     be64_to_cpu(d->d_blk_hardlimit)))) {
-			d->d_btimer = 0;
-		}
-	}
-
-	if (!d->d_itimer) {
-		if ((d->d_ino_softlimit &&
-		     (be64_to_cpu(d->d_icount) >=
-		      be64_to_cpu(d->d_ino_softlimit))) ||
-		    (d->d_ino_hardlimit &&
-		     (be64_to_cpu(d->d_icount) >=
-		      be64_to_cpu(d->d_ino_hardlimit)))) {
-			d->d_itimer = cpu_to_be32(get_seconds() +
-					XFS_QI_ITIMELIMIT(mp));
-		} else {
-			d->d_iwarns = 0;
-		}
-	} else {
-		if ((!d->d_ino_softlimit ||
-		     (be64_to_cpu(d->d_icount) <
-		      be64_to_cpu(d->d_ino_softlimit)))  &&
-		    (!d->d_ino_hardlimit ||
-		     (be64_to_cpu(d->d_icount) <
-		      be64_to_cpu(d->d_ino_hardlimit)))) {
-			d->d_itimer = 0;
-		}
-	}
-
-	if (!d->d_rtbtimer) {
-		if ((d->d_rtb_softlimit &&
-		     (be64_to_cpu(d->d_rtbcount) >=
-		      be64_to_cpu(d->d_rtb_softlimit))) ||
-		    (d->d_rtb_hardlimit &&
-		     (be64_to_cpu(d->d_rtbcount) >=
-		      be64_to_cpu(d->d_rtb_hardlimit)))) {
-			d->d_rtbtimer = cpu_to_be32(get_seconds() +
-					XFS_QI_RTBTIMELIMIT(mp));
-		} else {
-			d->d_rtbwarns = 0;
-		}
-	} else {
-		if ((!d->d_rtb_softlimit ||
-		     (be64_to_cpu(d->d_rtbcount) <
-		      be64_to_cpu(d->d_rtb_softlimit))) &&
-		    (!d->d_rtb_hardlimit ||
-		     (be64_to_cpu(d->d_rtbcount) <
-		      be64_to_cpu(d->d_rtb_hardlimit)))) {
-			d->d_rtbtimer = 0;
-		}
-	}
-}
-
-/*
- * initialize a buffer full of dquots and log the whole thing
- */
-STATIC void
-xfs_qm_init_dquot_blk(
-	xfs_trans_t	*tp,
-	xfs_mount_t	*mp,
-	xfs_dqid_t	id,
-	uint		type,
-	xfs_buf_t	*bp)
-{
-	xfs_dqblk_t	*d;
-	int		curid, i;
-
-	ASSERT(tp);
-	ASSERT(XFS_BUF_ISBUSY(bp));
-	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
-
-	d = (xfs_dqblk_t *)XFS_BUF_PTR(bp);
-
-	/*
-	 * ID of the first dquot in the block - id's are zero based.
-	 */
-	curid = id - (id % XFS_QM_DQPERBLK(mp));
-	ASSERT(curid >= 0);
-	memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)));
-	for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++)
-		xfs_qm_dqinit_core(curid, type, d);
-	xfs_trans_dquot_buf(tp, bp,
-			    (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF :
-			    ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF :
-			     XFS_BLI_GDQUOT_BUF)));
-	xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1);
-}
-
-
-
-/*
- * Allocate a block and fill it with dquots.
- * This is called when the bmapi finds a hole.
- */
-STATIC int
-xfs_qm_dqalloc(
-	xfs_trans_t	**tpp,
-	xfs_mount_t	*mp,
-	xfs_dquot_t	*dqp,
-	xfs_inode_t	*quotip,
-	xfs_fileoff_t	offset_fsb,
-	xfs_buf_t	**O_bpp)
-{
-	xfs_fsblock_t	firstblock;
-	xfs_bmap_free_t flist;
-	xfs_bmbt_irec_t map;
-	int		nmaps, error, committed;
-	xfs_buf_t	*bp;
-	xfs_trans_t	*tp = *tpp;
-
-	ASSERT(tp != NULL);
-	xfs_dqtrace_entry(dqp, "DQALLOC");
-
-	/*
-	 * Initialize the bmap freelist prior to calling bmapi code.
-	 */
-	XFS_BMAP_INIT(&flist, &firstblock);
-	xfs_ilock(quotip, XFS_ILOCK_EXCL);
-	/*
-	 * Return if this type of quotas is turned off while we didn't
-	 * have an inode lock
-	 */
-	if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
-		xfs_iunlock(quotip, XFS_ILOCK_EXCL);
-		return (ESRCH);
-	}
-
-	/*
-	 * xfs_trans_commit normally decrements the vnode ref count
-	 * when it unlocks the inode. Since we want to keep the quota
-	 * inode around, we bump the vnode ref count now.
-	 */
-	VN_HOLD(XFS_ITOV(quotip));
-
-	xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
-	nmaps = 1;
-	if ((error = xfs_bmapi(tp, quotip,
-			      offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
-			      XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
-			      &firstblock,
-			      XFS_QM_DQALLOC_SPACE_RES(mp),
-			      &map, &nmaps, &flist))) {
-		goto error0;
-	}
-	ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
-	ASSERT(nmaps == 1);
-	ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
-	       (map.br_startblock != HOLESTARTBLOCK));
-
-	/*
-	 * Keep track of the blkno to save a lookup later
-	 */
-	dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
-
-	/* now we can just get the buffer (there's nothing to read yet) */
-	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
-			       dqp->q_blkno,
-			       XFS_QI_DQCHUNKLEN(mp),
-			       0);
-	if (!bp || (error = XFS_BUF_GETERROR(bp)))
-		goto error1;
-	/*
-	 * Make a chunk of dquots out of this buffer and log
-	 * the entire thing.
-	 */
-	xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id),
-			      dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
-
-	/*
-	 * xfs_bmap_finish() may commit the current transaction and
-	 * start a second transaction if the freelist is not empty.
-	 *
-	 * Since we still want to modify this buffer, we need to
-	 * ensure that the buffer is not released on commit of
-	 * the first transaction and ensure the buffer is added to the
-	 * second transaction.
-	 *
-	 * If there is only one transaction then don't stop the buffer
-	 * from being released when it commits later on.
-	 */
-
-	xfs_trans_bhold(tp, bp);
-
-	if ((error = xfs_bmap_finish(tpp, &flist, firstblock, &committed))) {
-		goto error1;
-	}
-
-	if (committed) {
-		tp = *tpp;
-		xfs_trans_bjoin(tp, bp);
-	} else {
-		xfs_trans_bhold_release(tp, bp);
-	}
-
-	*O_bpp = bp;
-	return 0;
-
-      error1:
-	xfs_bmap_cancel(&flist);
-      error0:
-	xfs_iunlock(quotip, XFS_ILOCK_EXCL);
-
-	return (error);
-}
-
-/*
- * Maps a dquot to the buffer containing its on-disk version.
- * This returns a ptr to the buffer containing the on-disk dquot
- * in the bpp param, and a ptr to the on-disk dquot within that buffer
- */
-STATIC int
-xfs_qm_dqtobp(
-	xfs_trans_t		**tpp,
-	xfs_dquot_t		*dqp,
-	xfs_disk_dquot_t	**O_ddpp,
-	xfs_buf_t		**O_bpp,
-	uint			flags)
-{
-	xfs_bmbt_irec_t map;
-	int		nmaps, error;
-	xfs_buf_t	*bp;
-	xfs_inode_t	*quotip;
-	xfs_mount_t	*mp;
-	xfs_disk_dquot_t *ddq;
-	xfs_dqid_t	id;
-	boolean_t	newdquot;
-	xfs_trans_t	*tp = (tpp ? *tpp : NULL);
-
-	mp = dqp->q_mount;
-	id = be32_to_cpu(dqp->q_core.d_id);
-	nmaps = 1;
-	newdquot = B_FALSE;
-
-	/*
-	 * If we don't know where the dquot lives, find out.
-	 */
-	if (dqp->q_blkno == (xfs_daddr_t) 0) {
-		/* We use the id as an index */
-		dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp);
-		nmaps = 1;
-		quotip = XFS_DQ_TO_QIP(dqp);
-		xfs_ilock(quotip, XFS_ILOCK_SHARED);
-		/*
-		 * Return if this type of quotas is turned off while we didn't
-		 * have an inode lock
-		 */
-		if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
-			xfs_iunlock(quotip, XFS_ILOCK_SHARED);
-			return (ESRCH);
-		}
-		/*
-		 * Find the block map; no allocations yet
-		 */
-		error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
-				  XFS_DQUOT_CLUSTER_SIZE_FSB,
-				  XFS_BMAPI_METADATA,
-				  NULL, 0, &map, &nmaps, NULL);
-
-		xfs_iunlock(quotip, XFS_ILOCK_SHARED);
-		if (error)
-			return (error);
-		ASSERT(nmaps == 1);
-		ASSERT(map.br_blockcount == 1);
-
-		/*
-		 * offset of dquot in the (fixed sized) dquot chunk.
-		 */
-		dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) *
-			sizeof(xfs_dqblk_t);
-		if (map.br_startblock == HOLESTARTBLOCK) {
-			/*
-			 * We don't allocate unless we're asked to
-			 */
-			if (!(flags & XFS_QMOPT_DQALLOC))
-				return (ENOENT);
-
-			ASSERT(tp);
-			if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
-						dqp->q_fileoffset, &bp)))
-				return (error);
-			tp = *tpp;
-			newdquot = B_TRUE;
-		} else {
-			/*
-			 * store the blkno etc so that we don't have to do the
-			 * mapping all the time
-			 */
-			dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
-		}
-	}
-	ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
-	ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
-
-	/*
-	 * Read in the buffer, unless we've just done the allocation
-	 * (in which case we already have the buf).
-	 */
-	if (! newdquot) {
-		xfs_dqtrace_entry(dqp, "DQTOBP READBUF");
-		if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-					       dqp->q_blkno,
-					       XFS_QI_DQCHUNKLEN(mp),
-					       0, &bp))) {
-			return (error);
-		}
-		if (error || !bp)
-			return XFS_ERROR(error);
-	}
-	ASSERT(XFS_BUF_ISBUSY(bp));
-	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
-
-	/*
-	 * calculate the location of the dquot inside the buffer.
-	 */
-	ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset);
-
-	/*
-	 * A simple sanity check in case we got a corrupted dquot...
-	 */
-	if (xfs_qm_dqcheck(ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
-			   flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
-			   "dqtobp")) {
-		if (!(flags & XFS_QMOPT_DQREPAIR)) {
-			xfs_trans_brelse(tp, bp);
-			return XFS_ERROR(EIO);
-		}
-		XFS_BUF_BUSY(bp); /* We dirtied this */
-	}
-
-	*O_bpp = bp;
-	*O_ddpp = ddq;
-
-	return (0);
-}
-
-
-/*
- * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
- * and release the buffer immediately.
- *
- */
-/* ARGSUSED */
-STATIC int
-xfs_qm_dqread(
-	xfs_trans_t	**tpp,
-	xfs_dqid_t	id,
-	xfs_dquot_t	*dqp,	/* dquot to get filled in */
-	uint		flags)
-{
-	xfs_disk_dquot_t *ddqp;
-	xfs_buf_t	 *bp;
-	int		 error;
-	xfs_trans_t	 *tp;
-
-	ASSERT(tpp);
-
-	/*
-	 * get a pointer to the on-disk dquot and the buffer containing it
-	 * dqp already knows its own type (GROUP/USER).
-	 */
-	xfs_dqtrace_entry(dqp, "DQREAD");
-	if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) {
-		return (error);
-	}
-	tp = *tpp;
-
-	/* copy everything from disk dquot to the incore dquot */
-	memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
-	ASSERT(be32_to_cpu(dqp->q_core.d_id) == id);
-	xfs_qm_dquot_logitem_init(dqp);
-
-	/*
-	 * Reservation counters are defined as reservation plus current usage
-	 * to avoid having to add everytime.
-	 */
-	dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
-	dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
-	dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
-
-	/* Mark the buf so that this will stay incore a little longer */
-	XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF);
-
-	/*
-	 * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
-	 * So we need to release with xfs_trans_brelse().
-	 * The strategy here is identical to that of inodes; we lock
-	 * the dquot in xfs_qm_dqget() before making it accessible to
-	 * others. This is because dquots, like inodes, need a good level of
-	 * concurrency, and we don't want to take locks on the entire buffers
-	 * for dquot accesses.
-	 * Note also that the dquot buffer may even be dirty at this point, if
-	 * this particular dquot was repaired. We still aren't afraid to
-	 * brelse it because we have the changes incore.
-	 */
-	ASSERT(XFS_BUF_ISBUSY(bp));
-	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
-	xfs_trans_brelse(tp, bp);
-
-	return (error);
-}
-
-
-/*
- * allocate an incore dquot from the kernel heap,
- * and fill its core with quota information kept on disk.
- * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk
- * if it wasn't already allocated.
- */
-STATIC int
-xfs_qm_idtodq(
-	xfs_mount_t	*mp,
-	xfs_dqid_t	id,	 /* gid or uid, depending on type */
-	uint		type,	 /* UDQUOT or GDQUOT */
-	uint		flags,	 /* DQALLOC, DQREPAIR */
-	xfs_dquot_t	**O_dqpp)/* OUT : incore dquot, not locked */
-{
-	xfs_dquot_t	*dqp;
-	int		error;
-	xfs_trans_t	*tp;
-	int		cancelflags=0;
-
-	dqp = xfs_qm_dqinit(mp, id, type);
-	tp = NULL;
-	if (flags & XFS_QMOPT_DQALLOC) {
-		tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
-		if ((error = xfs_trans_reserve(tp,
-				       XFS_QM_DQALLOC_SPACE_RES(mp),
-				       XFS_WRITE_LOG_RES(mp) +
-					      BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 +
-					      128,
-				       0,
-				       XFS_TRANS_PERM_LOG_RES,
-				       XFS_WRITE_LOG_COUNT))) {
-			cancelflags = 0;
-			goto error0;
-		}
-		cancelflags = XFS_TRANS_RELEASE_LOG_RES;
-	}
-
-	/*
-	 * Read it from disk; xfs_dqread() takes care of
-	 * all the necessary initialization of dquot's fields (locks, etc)
-	 */
-	if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) {
-		/*
-		 * This can happen if quotas got turned off (ESRCH),
-		 * or if the dquot didn't exist on disk and we ask to
-		 * allocate (ENOENT).
-		 */
-		xfs_dqtrace_entry(dqp, "DQREAD FAIL");
-		cancelflags |= XFS_TRANS_ABORT;
-		goto error0;
-	}
-	if (tp) {
-		if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
-					     NULL)))
-			goto error1;
-	}
-
-	*O_dqpp = dqp;
-	return (0);
-
- error0:
-	ASSERT(error);
-	if (tp)
-		xfs_trans_cancel(tp, cancelflags);
- error1:
-	xfs_qm_dqdestroy(dqp);
-	*O_dqpp = NULL;
-	return (error);
-}
-
-/*
- * Lookup a dquot in the incore dquot hashtable. We keep two separate
- * hashtables for user and group dquots; and, these are global tables
- * inside the XQM, not per-filesystem tables.
- * The hash chain must be locked by caller, and it is left locked
- * on return. Returning dquot is locked.
- */
-STATIC int
-xfs_qm_dqlookup(
-	xfs_mount_t		*mp,
-	xfs_dqid_t		id,
-	xfs_dqhash_t		*qh,
-	xfs_dquot_t		**O_dqpp)
-{
-	xfs_dquot_t		*dqp;
-	uint			flist_locked;
-	xfs_dquot_t		*d;
-
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
-
-	flist_locked = B_FALSE;
-
-	/*
-	 * Traverse the hashchain looking for a match
-	 */
-	for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) {
-		/*
-		 * We already have the hashlock. We don't need the
-		 * dqlock to look at the id field of the dquot, since the
-		 * id can't be modified without the hashlock anyway.
-		 */
-		if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) {
-			xfs_dqtrace_entry(dqp, "DQFOUND BY LOOKUP");
-			/*
-			 * All in core dquots must be on the dqlist of mp
-			 */
-			ASSERT(dqp->MPL_PREVP != NULL);
-
-			xfs_dqlock(dqp);
-			if (dqp->q_nrefs == 0) {
-				ASSERT (XFS_DQ_IS_ON_FREELIST(dqp));
-				if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
-					xfs_dqtrace_entry(dqp, "DQLOOKUP: WANT");
-
-					/*
-					 * We may have raced with dqreclaim_one()
-					 * (and lost). So, flag that we don't
-					 * want the dquot to be reclaimed.
-					 */
-					dqp->dq_flags |= XFS_DQ_WANT;
-					xfs_dqunlock(dqp);
-					xfs_qm_freelist_lock(xfs_Gqm);
-					xfs_dqlock(dqp);
-					dqp->dq_flags &= ~(XFS_DQ_WANT);
-				}
-				flist_locked = B_TRUE;
-			}
-
-			/*
-			 * id couldn't have changed; we had the hashlock all
-			 * along
-			 */
-			ASSERT(be32_to_cpu(dqp->q_core.d_id) == id);
-
-			if (flist_locked) {
-				if (dqp->q_nrefs != 0) {
-					xfs_qm_freelist_unlock(xfs_Gqm);
-					flist_locked = B_FALSE;
-				} else {
-					/*
-					 * take it off the freelist
-					 */
-					xfs_dqtrace_entry(dqp,
-							"DQLOOKUP: TAKEOFF FL");
-					XQM_FREELIST_REMOVE(dqp);
-					/* xfs_qm_freelist_print(&(xfs_Gqm->
-							qm_dqfreelist),
-							"after removal"); */
-				}
-			}
-
-			/*
-			 * grab a reference
-			 */
-			XFS_DQHOLD(dqp);
-
-			if (flist_locked)
-				xfs_qm_freelist_unlock(xfs_Gqm);
-			/*
-			 * move the dquot to the front of the hashchain
-			 */
-			ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
-			if (dqp->HL_PREVP != &qh->qh_next) {
-				xfs_dqtrace_entry(dqp,
-						  "DQLOOKUP: HASH MOVETOFRONT");
-				if ((d = dqp->HL_NEXT))
-					d->HL_PREVP = dqp->HL_PREVP;
-				*(dqp->HL_PREVP) = d;
-				d = qh->qh_next;
-				d->HL_PREVP = &dqp->HL_NEXT;
-				dqp->HL_NEXT = d;
-				dqp->HL_PREVP = &qh->qh_next;
-				qh->qh_next = dqp;
-			}
-			xfs_dqtrace_entry(dqp, "LOOKUP END");
-			*O_dqpp = dqp;
-			ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
-			return (0);
-		}
-	}
-
-	*O_dqpp = NULL;
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
-	return (1);
-}
-
-/*
- * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
- * a locked dquot, doing an allocation (if requested) as needed.
- * When both an inode and an id are given, the inode's id takes precedence.
- * That is, if the id changes while we don't hold the ilock inside this
- * function, the new dquot is returned, not necessarily the one requested
- * in the id argument.
- */
-int
-xfs_qm_dqget(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip,	  /* locked inode (optional) */
-	xfs_dqid_t	id,	  /* uid/projid/gid depending on type */
-	uint		type,	  /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */
-	uint		flags,	  /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
-	xfs_dquot_t	**O_dqpp) /* OUT : locked incore dquot */
-{
-	xfs_dquot_t	*dqp;
-	xfs_dqhash_t	*h;
-	uint		version;
-	int		error;
-
-	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-	if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
-	    (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
-	    (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
-		return (ESRCH);
-	}
-	h = XFS_DQ_HASH(mp, id, type);
-
-#ifdef DEBUG
-	if (xfs_do_dqerror) {
-		if ((xfs_dqerror_target == mp->m_ddev_targp) &&
-		    (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
-			cmn_err(CE_DEBUG, "Returning error in dqget");
-			return (EIO);
-		}
-	}
-#endif
-
- again:
-
-#ifdef DEBUG
-	ASSERT(type == XFS_DQ_USER ||
-	       type == XFS_DQ_PROJ ||
-	       type == XFS_DQ_GROUP);
-	if (ip) {
-		ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
-		if (type == XFS_DQ_USER)
-			ASSERT(ip->i_udquot == NULL);
-		else
-			ASSERT(ip->i_gdquot == NULL);
-	}
-#endif
-	XFS_DQ_HASH_LOCK(h);
-
-	/*
-	 * Look in the cache (hashtable).
-	 * The chain is kept locked during lookup.
-	 */
-	if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) {
-		XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
-		/*
-		 * The dquot was found, moved to the front of the chain,
-		 * taken off the freelist if it was on it, and locked
-		 * at this point. Just unlock the hashchain and return.
-		 */
-		ASSERT(*O_dqpp);
-		ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
-		XFS_DQ_HASH_UNLOCK(h);
-		xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)");
-		return (0);	/* success */
-	}
-	XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
-
-	/*
-	 * Dquot cache miss. We don't want to keep the inode lock across
-	 * a (potential) disk read. Also we don't want to deal with the lock
-	 * ordering between quotainode and this inode. OTOH, dropping the inode
-	 * lock here means dealing with a chown that can happen before
-	 * we re-acquire the lock.
-	 */
-	if (ip)
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	/*
-	 * Save the hashchain version stamp, and unlock the chain, so that
-	 * we don't keep the lock across a disk read
-	 */
-	version = h->qh_version;
-	XFS_DQ_HASH_UNLOCK(h);
-
-	/*
-	 * Allocate the dquot on the kernel heap, and read the ondisk
-	 * portion off the disk. Also, do all the necessary initialization
-	 * This can return ENOENT if dquot didn't exist on disk and we didn't
-	 * ask it to allocate; ESRCH if quotas got turned off suddenly.
-	 */
-	if ((error = xfs_qm_idtodq(mp, id, type,
-				  flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR|
-					   XFS_QMOPT_DOWARN),
-				  &dqp))) {
-		if (ip)
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-		return (error);
-	}
-
-	/*
-	 * See if this is mount code calling to look at the overall quota limits
-	 * which are stored in the id == 0 user or group's dquot.
-	 * Since we may not have done a quotacheck by this point, just return
-	 * the dquot without attaching it to any hashtables, lists, etc, or even
-	 * taking a reference.
-	 * The caller must dqdestroy this once done.
-	 */
-	if (flags & XFS_QMOPT_DQSUSER) {
-		ASSERT(id == 0);
-		ASSERT(! ip);
-		goto dqret;
-	}
-
-	/*
-	 * Dquot lock comes after hashlock in the lock ordering
-	 */
-	if (ip) {
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		if (! XFS_IS_DQTYPE_ON(mp, type)) {
-			/* inode stays locked on return */
-			xfs_qm_dqdestroy(dqp);
-			return XFS_ERROR(ESRCH);
-		}
-		/*
-		 * A dquot could be attached to this inode by now, since
-		 * we had dropped the ilock.
-		 */
-		if (type == XFS_DQ_USER) {
-			if (ip->i_udquot) {
-				xfs_qm_dqdestroy(dqp);
-				dqp = ip->i_udquot;
-				xfs_dqlock(dqp);
-				goto dqret;
-			}
-		} else {
-			if (ip->i_gdquot) {
-				xfs_qm_dqdestroy(dqp);
-				dqp = ip->i_gdquot;
-				xfs_dqlock(dqp);
-				goto dqret;
-			}
-		}
-	}
-
-	/*
-	 * Hashlock comes after ilock in lock order
-	 */
-	XFS_DQ_HASH_LOCK(h);
-	if (version != h->qh_version) {
-		xfs_dquot_t *tmpdqp;
-		/*
-		 * Now, see if somebody else put the dquot in the
-		 * hashtable before us. This can happen because we didn't
-		 * keep the hashchain lock. We don't have to worry about
-		 * lock order between the two dquots here since dqp isn't
-		 * on any findable lists yet.
-		 */
-		if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) {
-			/*
-			 * Duplicate found. Just throw away the new dquot
-			 * and start over.
-			 */
-			xfs_qm_dqput(tmpdqp);
-			XFS_DQ_HASH_UNLOCK(h);
-			xfs_qm_dqdestroy(dqp);
-			XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
-			goto again;
-		}
-	}
-
-	/*
-	 * Put the dquot at the beginning of the hash-chain and mp's list
-	 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
-	 */
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(h));
-	dqp->q_hash = h;
-	XQM_HASHLIST_INSERT(h, dqp);
-
-	/*
-	 * Attach this dquot to this filesystem's list of all dquots,
-	 * kept inside the mount structure in m_quotainfo field
-	 */
-	xfs_qm_mplist_lock(mp);
-
-	/*
-	 * We return a locked dquot to the caller, with a reference taken
-	 */
-	xfs_dqlock(dqp);
-	dqp->q_nrefs = 1;
-
-	XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
-
-	xfs_qm_mplist_unlock(mp);
-	XFS_DQ_HASH_UNLOCK(h);
- dqret:
-	ASSERT((ip == NULL) || XFS_ISLOCKED_INODE_EXCL(ip));
-	xfs_dqtrace_entry(dqp, "DQGET DONE");
-	*O_dqpp = dqp;
-	return (0);
-}
-
-
-/*
- * Release a reference to the dquot (decrement ref-count)
- * and unlock it. If there is a group quota attached to this
- * dquot, carefully release that too without tripping over
- * deadlocks'n'stuff.
- */
-void
-xfs_qm_dqput(
-	xfs_dquot_t	*dqp)
-{
-	xfs_dquot_t	*gdqp;
-
-	ASSERT(dqp->q_nrefs > 0);
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	xfs_dqtrace_entry(dqp, "DQPUT");
-
-	if (dqp->q_nrefs != 1) {
-		dqp->q_nrefs--;
-		xfs_dqunlock(dqp);
-		return;
-	}
-
-	/*
-	 * drop the dqlock and acquire the freelist and dqlock
-	 * in the right order; but try to get it out-of-order first
-	 */
-	if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
-		xfs_dqtrace_entry(dqp, "DQPUT: FLLOCK-WAIT");
-		xfs_dqunlock(dqp);
-		xfs_qm_freelist_lock(xfs_Gqm);
-		xfs_dqlock(dqp);
-	}
-
-	while (1) {
-		gdqp = NULL;
-
-		/* We can't depend on nrefs being == 1 here */
-		if (--dqp->q_nrefs == 0) {
-			xfs_dqtrace_entry(dqp, "DQPUT: ON FREELIST");
-			/*
-			 * insert at end of the freelist.
-			 */
-			XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
-
-			/*
-			 * If we just added a udquot to the freelist, then
-			 * we want to release the gdquot reference that
-			 * it (probably) has. Otherwise it'll keep the
-			 * gdquot from getting reclaimed.
-			 */
-			if ((gdqp = dqp->q_gdquot)) {
-				/*
-				 * Avoid a recursive dqput call
-				 */
-				xfs_dqlock(gdqp);
-				dqp->q_gdquot = NULL;
-			}
-
-			/* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
-			   "@@@@@++ Free list (after append) @@@@@+");
-			   */
-		}
-		xfs_dqunlock(dqp);
-
-		/*
-		 * If we had a group quota inside the user quota as a hint,
-		 * release it now.
-		 */
-		if (! gdqp)
-			break;
-		dqp = gdqp;
-	}
-	xfs_qm_freelist_unlock(xfs_Gqm);
-}
-
-/*
- * Release a dquot. Flush it if dirty, then dqput() it.
- * dquot must not be locked.
- */
-void
-xfs_qm_dqrele(
-	xfs_dquot_t	*dqp)
-{
-	ASSERT(dqp);
-	xfs_dqtrace_entry(dqp, "DQRELE");
-
-	xfs_dqlock(dqp);
-	/*
-	 * We don't care to flush it if the dquot is dirty here.
-	 * That will create stutters that we want to avoid.
-	 * Instead we do a delayed write when we try to reclaim
-	 * a dirty dquot. Also xfs_sync will take part of the burden...
-	 */
-	xfs_qm_dqput(dqp);
-}
-
-
-/*
- * Write a modified dquot to disk.
- * The dquot must be locked and the flush lock too taken by caller.
- * The flush lock will not be unlocked until the dquot reaches the disk,
- * but the dquot is free to be unlocked and modified by the caller
- * in the interim. Dquot is still locked on return. This behavior is
- * identical to that of inodes.
- */
-int
-xfs_qm_dqflush(
-	xfs_dquot_t		*dqp,
-	uint			flags)
-{
-	xfs_mount_t		*mp;
-	xfs_buf_t		*bp;
-	xfs_disk_dquot_t	*ddqp;
-	int			error;
-	SPLDECL(s);
-
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
-	xfs_dqtrace_entry(dqp, "DQFLUSH");
-
-	/*
-	 * If not dirty, nada.
-	 */
-	if (!XFS_DQ_IS_DIRTY(dqp)) {
-		xfs_dqfunlock(dqp);
-		return (0);
-	}
-
-	/*
-	 * Cant flush a pinned dquot. Wait for it.
-	 */
-	xfs_qm_dqunpin_wait(dqp);
-
-	/*
-	 * This may have been unpinned because the filesystem is shutting
-	 * down forcibly. If that's the case we must not write this dquot
-	 * to disk, because the log record didn't make it to disk!
-	 */
-	if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) {
-		dqp->dq_flags &= ~(XFS_DQ_DIRTY);
-		xfs_dqfunlock(dqp);
-		return XFS_ERROR(EIO);
-	}
-
-	/*
-	 * Get the buffer containing the on-disk dquot
-	 * We don't need a transaction envelope because we know that the
-	 * the ondisk-dquot has already been allocated for.
-	 */
-	if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) {
-		xfs_dqtrace_entry(dqp, "DQTOBP FAIL");
-		ASSERT(error != ENOENT);
-		/*
-		 * Quotas could have gotten turned off (ESRCH)
-		 */
-		xfs_dqfunlock(dqp);
-		return (error);
-	}
-
-	if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id),
-			   0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
-		xfs_force_shutdown(dqp->q_mount, XFS_CORRUPT_INCORE);
-		return XFS_ERROR(EIO);
-	}
-
-	/* This is the only portion of data that needs to persist */
-	memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t));
-
-	/*
-	 * Clear the dirty field and remember the flush lsn for later use.
-	 */
-	dqp->dq_flags &= ~(XFS_DQ_DIRTY);
-	mp = dqp->q_mount;
-
-	/* lsn is 64 bits */
-	AIL_LOCK(mp, s);
-	dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn;
-	AIL_UNLOCK(mp, s);
-
-	/*
-	 * Attach an iodone routine so that we can remove this dquot from the
-	 * AIL and release the flush lock once the dquot is synced to disk.
-	 */
-	xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *))
-			      xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item));
-	/*
-	 * If the buffer is pinned then push on the log so we won't
-	 * get stuck waiting in the write for too long.
-	 */
-	if (XFS_BUF_ISPINNED(bp)) {
-		xfs_dqtrace_entry(dqp, "DQFLUSH LOG FORCE");
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
-	}
-
-	if (flags & XFS_QMOPT_DELWRI) {
-		xfs_bdwrite(mp, bp);
-	} else if (flags & XFS_QMOPT_ASYNC) {
-		xfs_bawrite(mp, bp);
-	} else {
-		error = xfs_bwrite(mp, bp);
-	}
-	xfs_dqtrace_entry(dqp, "DQFLUSH END");
-	/*
-	 * dqp is still locked, but caller is free to unlock it now.
-	 */
-	return (error);
-
-}
-
-/*
- * This is the dquot flushing I/O completion routine.  It is called
- * from interrupt level when the buffer containing the dquot is
- * flushed to disk.  It is responsible for removing the dquot logitem
- * from the AIL if it has not been re-logged, and unlocking the dquot's
- * flush lock. This behavior is very similar to that of inodes..
- */
-/*ARGSUSED*/
-STATIC void
-xfs_qm_dqflush_done(
-	xfs_buf_t		*bp,
-	xfs_dq_logitem_t	*qip)
-{
-	xfs_dquot_t		*dqp;
-	SPLDECL(s);
-
-	dqp = qip->qli_dquot;
-
-	/*
-	 * We only want to pull the item from the AIL if its
-	 * location in the log has not changed since we started the flush.
-	 * Thus, we only bother if the dquot's lsn has
-	 * not changed. First we check the lsn outside the lock
-	 * since it's cheaper, and then we recheck while
-	 * holding the lock before removing the dquot from the AIL.
-	 */
-	if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
-	    qip->qli_item.li_lsn == qip->qli_flush_lsn) {
-
-		AIL_LOCK(dqp->q_mount, s);
-		/*
-		 * xfs_trans_delete_ail() drops the AIL lock.
-		 */
-		if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
-			xfs_trans_delete_ail(dqp->q_mount,
-					     (xfs_log_item_t*)qip, s);
-		else
-			AIL_UNLOCK(dqp->q_mount, s);
-	}
-
-	/*
-	 * Release the dq's flush lock since we're done with it.
-	 */
-	xfs_dqfunlock(dqp);
-}
-
-
-int
-xfs_qm_dqflock_nowait(
-	xfs_dquot_t *dqp)
-{
-	int locked;
-
-	locked = cpsema(&((dqp)->q_flock));
-
-	/* XXX ifdef these out */
-	if (locked)
-		(dqp)->dq_flags |= XFS_DQ_FLOCKED;
-	return (locked);
-}
-
-
-int
-xfs_qm_dqlock_nowait(
-	xfs_dquot_t *dqp)
-{
-	return (mutex_trylock(&((dqp)->q_qlock)));
-}
-
-void
-xfs_dqlock(
-	xfs_dquot_t *dqp)
-{
-	mutex_lock(&(dqp->q_qlock));
-}
-
-void
-xfs_dqunlock(
-	xfs_dquot_t *dqp)
-{
-	mutex_unlock(&(dqp->q_qlock));
-	if (dqp->q_logitem.qli_dquot == dqp) {
-		/* Once was dqp->q_mount, but might just have been cleared */
-		xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp,
-					(xfs_log_item_t*)&(dqp->q_logitem));
-	}
-}
-
-
-void
-xfs_dqunlock_nonotify(
-	xfs_dquot_t *dqp)
-{
-	mutex_unlock(&(dqp->q_qlock));
-}
-
-void
-xfs_dqlock2(
-	xfs_dquot_t	*d1,
-	xfs_dquot_t	*d2)
-{
-	if (d1 && d2) {
-		ASSERT(d1 != d2);
-		if (be32_to_cpu(d1->q_core.d_id) >
-		    be32_to_cpu(d2->q_core.d_id)) {
-			xfs_dqlock(d2);
-			xfs_dqlock(d1);
-		} else {
-			xfs_dqlock(d1);
-			xfs_dqlock(d2);
-		}
-	} else {
-		if (d1) {
-			xfs_dqlock(d1);
-		} else if (d2) {
-			xfs_dqlock(d2);
-		}
-	}
-}
-
-
-/*
- * Take a dquot out of the mount's dqlist as well as the hashlist.
- * This is called via unmount as well as quotaoff, and the purge
- * will always succeed unless there are soft (temp) references
- * outstanding.
- *
- * This returns 0 if it was purged, 1 if it wasn't. It's not an error code
- * that we're returning! XXXsup - not cool.
- */
-/* ARGSUSED */
-int
-xfs_qm_dqpurge(
-	xfs_dquot_t	*dqp,
-	uint		flags)
-{
-	xfs_dqhash_t	*thishash;
-	xfs_mount_t	*mp;
-
-	mp = dqp->q_mount;
-
-	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
-	ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
-
-	xfs_dqlock(dqp);
-	/*
-	 * We really can't afford to purge a dquot that is
-	 * referenced, because these are hard refs.
-	 * It shouldn't happen in general because we went thru _all_ inodes in
-	 * dqrele_all_inodes before calling this and didn't let the mountlock go.
-	 * However it is possible that we have dquots with temporary
-	 * references that are not attached to an inode. e.g. see xfs_setattr().
-	 */
-	if (dqp->q_nrefs != 0) {
-		xfs_dqunlock(dqp);
-		XFS_DQ_HASH_UNLOCK(dqp->q_hash);
-		return (1);
-	}
-
-	ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
-
-	/*
-	 * If we're turning off quotas, we have to make sure that, for
-	 * example, we don't delete quota disk blocks while dquots are
-	 * in the process of getting written to those disk blocks.
-	 * This dquot might well be on AIL, and we can't leave it there
-	 * if we're turning off quotas. Basically, we need this flush
-	 * lock, and are willing to block on it.
-	 */
-	if (! xfs_qm_dqflock_nowait(dqp)) {
-		/*
-		 * Block on the flush lock after nudging dquot buffer,
-		 * if it is incore.
-		 */
-		xfs_qm_dqflock_pushbuf_wait(dqp);
-	}
-
-	/*
-	 * XXXIf we're turning this type of quotas off, we don't care
-	 * about the dirty metadata sitting in this dquot. OTOH, if
-	 * we're unmounting, we do care, so we flush it and wait.
-	 */
-	if (XFS_DQ_IS_DIRTY(dqp)) {
-		xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY");
-		/* dqflush unlocks dqflock */
-		/*
-		 * Given that dqpurge is a very rare occurrence, it is OK
-		 * that we're holding the hashlist and mplist locks
-		 * across the disk write. But, ... XXXsup
-		 *
-		 * We don't care about getting disk errors here. We need
-		 * to purge this dquot anyway, so we go ahead regardless.
-		 */
-		(void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
-		xfs_dqflock(dqp);
-	}
-	ASSERT(dqp->q_pincount == 0);
-	ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
-	       !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
-
-	thishash = dqp->q_hash;
-	XQM_HASHLIST_REMOVE(thishash, dqp);
-	XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp);
-	/*
-	 * XXX Move this to the front of the freelist, if we can get the
-	 * freelist lock.
-	 */
-	ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
-
-	dqp->q_mount = NULL;
-	dqp->q_hash = NULL;
-	dqp->dq_flags = XFS_DQ_INACTIVE;
-	memset(&dqp->q_core, 0, sizeof(dqp->q_core));
-	xfs_dqfunlock(dqp);
-	xfs_dqunlock(dqp);
-	XFS_DQ_HASH_UNLOCK(thishash);
-	return (0);
-}
-
-
-#ifdef QUOTADEBUG
-void
-xfs_qm_dqprint(xfs_dquot_t *dqp)
-{
-	cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------");
-	cmn_err(CE_DEBUG, "---- dquotID =  %d",
-		(int)be32_to_cpu(dqp->q_core.d_id));
-	cmn_err(CE_DEBUG, "---- type    =  %s", DQFLAGTO_TYPESTR(dqp));
-	cmn_err(CE_DEBUG, "---- fs      =  0x%p", dqp->q_mount);
-	cmn_err(CE_DEBUG, "---- blkno   =  0x%x", (int) dqp->q_blkno);
-	cmn_err(CE_DEBUG, "---- boffset =  0x%x", (int) dqp->q_bufoffset);
-	cmn_err(CE_DEBUG, "---- blkhlimit =  %Lu (0x%x)",
-		be64_to_cpu(dqp->q_core.d_blk_hardlimit),
-		(int)be64_to_cpu(dqp->q_core.d_blk_hardlimit));
-	cmn_err(CE_DEBUG, "---- blkslimit =  %Lu (0x%x)",
-		be64_to_cpu(dqp->q_core.d_blk_softlimit),
-		(int)be64_to_cpu(dqp->q_core.d_blk_softlimit));
-	cmn_err(CE_DEBUG, "---- inohlimit =  %Lu (0x%x)",
-		be64_to_cpu(dqp->q_core.d_ino_hardlimit),
-		(int)be64_to_cpu(dqp->q_core.d_ino_hardlimit));
-	cmn_err(CE_DEBUG, "---- inoslimit =  %Lu (0x%x)",
-		be64_to_cpu(dqp->q_core.d_ino_softlimit),
-		(int)be64_to_cpu(dqp->q_core.d_ino_softlimit));
-	cmn_err(CE_DEBUG, "---- bcount  =  %Lu (0x%x)",
-		be64_to_cpu(dqp->q_core.d_bcount),
-		(int)be64_to_cpu(dqp->q_core.d_bcount));
-	cmn_err(CE_DEBUG, "---- icount  =  %Lu (0x%x)",
-		be64_to_cpu(dqp->q_core.d_icount),
-		(int)be64_to_cpu(dqp->q_core.d_icount));
-	cmn_err(CE_DEBUG, "---- btimer  =  %d",
-		(int)be32_to_cpu(dqp->q_core.d_btimer));
-	cmn_err(CE_DEBUG, "---- itimer  =  %d",
-		(int)be32_to_cpu(dqp->q_core.d_itimer));
-	cmn_err(CE_DEBUG, "---------------------------");
-}
-#endif
-
-/*
- * Give the buffer a little push if it is incore and
- * wait on the flush lock.
- */
-void
-xfs_qm_dqflock_pushbuf_wait(
-	xfs_dquot_t	*dqp)
-{
-	xfs_buf_t	*bp;
-
-	/*
-	 * Check to see if the dquot has been flushed delayed
-	 * write.  If so, grab its buffer and send it
-	 * out immediately.  We'll be able to acquire
-	 * the flush lock when the I/O completes.
-	 */
-	bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
-		    XFS_QI_DQCHUNKLEN(dqp->q_mount),
-		    XFS_INCORE_TRYLOCK);
-	if (bp != NULL) {
-		if (XFS_BUF_ISDELAYWRITE(bp)) {
-			if (XFS_BUF_ISPINNED(bp)) {
-				xfs_log_force(dqp->q_mount,
-					      (xfs_lsn_t)0,
-					      XFS_LOG_FORCE);
-			}
-			xfs_bawrite(dqp->q_mount, bp);
-		} else {
-			xfs_buf_relse(bp);
-		}
-	}
-	xfs_dqflock(dqp);
-}
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
deleted file mode 100644
index c0c629663a5..00000000000
--- a/fs/xfs/quota/xfs_dquot.h
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DQUOT_H__
-#define __XFS_DQUOT_H__
-
-/*
- * Dquots are structures that hold quota information about a user or a group,
- * much like inodes are for files. In fact, dquots share many characteristics
- * with inodes. However, dquots can also be a centralized resource, relative
- * to a collection of inodes. In this respect, dquots share some characteristics
- * of the superblock.
- * XFS dquots exploit both those in its algorithms. They make every attempt
- * to not be a bottleneck when quotas are on and have minimal impact, if any,
- * when quotas are off.
- */
-
-/*
- * The hash chain headers (hash buckets)
- */
-typedef struct xfs_dqhash {
-	struct xfs_dquot *qh_next;
-	mutex_t		  qh_lock;
-	uint		  qh_version;	/* ever increasing version */
-	uint		  qh_nelems;	/* number of dquots on the list */
-} xfs_dqhash_t;
-
-typedef struct xfs_dqlink {
-	struct xfs_dquot  *ql_next;	/* forward link */
-	struct xfs_dquot **ql_prevp;	/* pointer to prev ql_next */
-} xfs_dqlink_t;
-
-struct xfs_mount;
-struct xfs_trans;
-
-/*
- * This is the marker which is designed to occupy the first few
- * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
- * must come first.
- * This serves as the marker ("sentinel") when we have to restart list
- * iterations because of locking considerations.
- */
-typedef struct xfs_dqmarker {
-	struct xfs_dquot*dqm_flnext;	/* link to freelist: must be first */
-	struct xfs_dquot*dqm_flprev;
-	xfs_dqlink_t	 dqm_mplist;	/* link to mount's list of dquots */
-	xfs_dqlink_t	 dqm_hashlist;	/* link to the hash chain */
-	uint		 dqm_flags;	/* various flags (XFS_DQ_*) */
-} xfs_dqmarker_t;
-
-/*
- * The incore dquot structure
- */
-typedef struct xfs_dquot {
-	xfs_dqmarker_t	 q_lists;	/* list ptrs, q_flags (marker) */
-	xfs_dqhash_t	*q_hash;	/* the hashchain header */
-	struct xfs_mount*q_mount;	/* filesystem this relates to */
-	struct xfs_trans*q_transp;	/* trans this belongs to currently */
-	uint		 q_nrefs;	/* # active refs from inodes */
-	xfs_daddr_t	 q_blkno;	/* blkno of dquot buffer */
-	int		 q_bufoffset;	/* off of dq in buffer (# dquots) */
-	xfs_fileoff_t	 q_fileoffset;	/* offset in quotas file */
-
-	struct xfs_dquot*q_gdquot;	/* group dquot, hint only */
-	xfs_disk_dquot_t q_core;	/* actual usage & quotas */
-	xfs_dq_logitem_t q_logitem;	/* dquot log item */
-	xfs_qcnt_t	 q_res_bcount;	/* total regular nblks used+reserved */
-	xfs_qcnt_t	 q_res_icount;	/* total inos allocd+reserved */
-	xfs_qcnt_t	 q_res_rtbcount;/* total realtime blks used+reserved */
-	mutex_t		 q_qlock;	/* quota lock */
-	sema_t		 q_flock;	/* flush lock */
-	uint		 q_pincount;	/* pin count for this dquot */
-	sv_t		 q_pinwait;	/* sync var for pinning */
-#ifdef XFS_DQUOT_TRACE
-	struct ktrace	*q_trace;	/* trace header structure */
-#endif
-} xfs_dquot_t;
-
-
-#define dq_flnext	q_lists.dqm_flnext
-#define dq_flprev	q_lists.dqm_flprev
-#define dq_mplist	q_lists.dqm_mplist
-#define dq_hashlist	q_lists.dqm_hashlist
-#define dq_flags	q_lists.dqm_flags
-
-#define XFS_DQHOLD(dqp)		((dqp)->q_nrefs++)
-
-#ifdef DEBUG
-static inline int
-XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
-{
-	if (mutex_trylock(&dqp->q_qlock)) {
-		mutex_unlock(&dqp->q_qlock);
-		return 0;
-	}
-	return 1;
-}
-#endif
-
-
-/*
- * The following three routines simply manage the q_flock
- * semaphore embedded in the dquot.  This semaphore synchronizes
- * processes attempting to flush the in-core dquot back to disk.
- */
-#define xfs_dqflock(dqp)	 { psema(&((dqp)->q_flock), PINOD | PRECALC);\
-				   (dqp)->dq_flags |= XFS_DQ_FLOCKED; }
-#define xfs_dqfunlock(dqp)	 { ASSERT(valusema(&((dqp)->q_flock)) <= 0); \
-				   vsema(&((dqp)->q_flock)); \
-				   (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); }
-
-#define XFS_DQ_PINLOCK(dqp)	   mutex_spinlock( \
-				     &(XFS_DQ_TO_QINF(dqp)->qi_pinlock))
-#define XFS_DQ_PINUNLOCK(dqp, s)   mutex_spinunlock( \
-				     &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s)
-
-#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (valusema(&((dqp)->q_flock)) <= 0)
-#define XFS_DQ_IS_ON_FREELIST(dqp)  ((dqp)->dq_flnext != (dqp))
-#define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
-#define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
-#define XFS_QM_ISPDQ(dqp)	((dqp)->dq_flags & XFS_DQ_PROJ)
-#define XFS_QM_ISGDQ(dqp)	((dqp)->dq_flags & XFS_DQ_GROUP)
-#define XFS_DQ_TO_QINF(dqp)	((dqp)->q_mount->m_quotainfo)
-#define XFS_DQ_TO_QIP(dqp)	(XFS_QM_ISUDQ(dqp) ? \
-				 XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
-				 XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
-
-#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \
-				     (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
-				     (XFS_IS_OQUOTA_ON((d)->q_mount))))
-
-#ifdef XFS_DQUOT_TRACE
-/*
- * Dquot Tracing stuff.
- */
-#define DQUOT_TRACE_SIZE	64
-#define DQUOT_KTRACE_ENTRY	1
-
-extern void		__xfs_dqtrace_entry(xfs_dquot_t *dqp, char *func,
-					    void *, xfs_inode_t *);
-#define xfs_dqtrace_entry_ino(a,b,ip) \
-		__xfs_dqtrace_entry((a), (b), (void*)__return_address, (ip))
-#define xfs_dqtrace_entry(a,b) \
-		__xfs_dqtrace_entry((a), (b), (void*)__return_address, NULL)
-#else
-#define xfs_dqtrace_entry(a,b)
-#define xfs_dqtrace_entry_ino(a,b,ip)
-#endif
-
-#ifdef QUOTADEBUG
-extern void		xfs_qm_dqprint(xfs_dquot_t *);
-#else
-#define xfs_qm_dqprint(a)
-#endif
-
-extern void		xfs_qm_dqdestroy(xfs_dquot_t *);
-extern int		xfs_qm_dqflush(xfs_dquot_t *, uint);
-extern int		xfs_qm_dqpurge(xfs_dquot_t *, uint);
-extern void		xfs_qm_dqunpin_wait(xfs_dquot_t *);
-extern int		xfs_qm_dqlock_nowait(xfs_dquot_t *);
-extern int		xfs_qm_dqflock_nowait(xfs_dquot_t *);
-extern void		xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
-extern void		xfs_qm_adjust_dqtimers(xfs_mount_t *,
-					xfs_disk_dquot_t *);
-extern void		xfs_qm_adjust_dqlimits(xfs_mount_t *,
-					xfs_disk_dquot_t *);
-extern int		xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
-					xfs_dqid_t, uint, uint, xfs_dquot_t **);
-extern void		xfs_qm_dqput(xfs_dquot_t *);
-extern void		xfs_qm_dqrele(xfs_dquot_t *);
-extern void		xfs_dqlock(xfs_dquot_t *);
-extern void		xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *);
-extern void		xfs_dqunlock(xfs_dquot_t *);
-extern void		xfs_dqunlock_nonotify(xfs_dquot_t *);
-
-#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
deleted file mode 100644
index 2ec6b441849..00000000000
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ /dev/null
@@ -1,699 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_trans_priv.h"
-#include "xfs_qm.h"
-
-/*
- * returns the number of iovecs needed to log the given dquot item.
- */
-/* ARGSUSED */
-STATIC uint
-xfs_qm_dquot_logitem_size(
-	xfs_dq_logitem_t	*logitem)
-{
-	/*
-	 * we need only two iovecs, one for the format, one for the real thing
-	 */
-	return (2);
-}
-
-/*
- * fills in the vector of log iovecs for the given dquot log item.
- */
-STATIC void
-xfs_qm_dquot_logitem_format(
-	xfs_dq_logitem_t	*logitem,
-	xfs_log_iovec_t		*logvec)
-{
-	ASSERT(logitem);
-	ASSERT(logitem->qli_dquot);
-
-	logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
-	logvec->i_len  = sizeof(xfs_dq_logformat_t);
-	logvec++;
-	logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
-	logvec->i_len  = sizeof(xfs_disk_dquot_t);
-
-	ASSERT(2 == logitem->qli_item.li_desc->lid_size);
-	logitem->qli_format.qlf_size = 2;
-
-}
-
-/*
- * Increment the pin count of the given dquot.
- * This value is protected by pinlock spinlock in the xQM structure.
- */
-STATIC void
-xfs_qm_dquot_logitem_pin(
-	xfs_dq_logitem_t *logitem)
-{
-	unsigned long	s;
-	xfs_dquot_t *dqp;
-
-	dqp = logitem->qli_dquot;
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	s = XFS_DQ_PINLOCK(dqp);
-	dqp->q_pincount++;
-	XFS_DQ_PINUNLOCK(dqp, s);
-}
-
-/*
- * Decrement the pin count of the given dquot, and wake up
- * anyone in xfs_dqwait_unpin() if the count goes to 0.	 The
- * dquot must have been previously pinned with a call to xfs_dqpin().
- */
-/* ARGSUSED */
-STATIC void
-xfs_qm_dquot_logitem_unpin(
-	xfs_dq_logitem_t *logitem,
-	int		  stale)
-{
-	unsigned long	s;
-	xfs_dquot_t *dqp;
-
-	dqp = logitem->qli_dquot;
-	ASSERT(dqp->q_pincount > 0);
-	s = XFS_DQ_PINLOCK(dqp);
-	dqp->q_pincount--;
-	if (dqp->q_pincount == 0) {
-		sv_broadcast(&dqp->q_pinwait);
-	}
-	XFS_DQ_PINUNLOCK(dqp, s);
-}
-
-/* ARGSUSED */
-STATIC void
-xfs_qm_dquot_logitem_unpin_remove(
-	xfs_dq_logitem_t *logitem,
-	xfs_trans_t	 *tp)
-{
-	xfs_qm_dquot_logitem_unpin(logitem, 0);
-}
-
-/*
- * Given the logitem, this writes the corresponding dquot entry to disk
- * asynchronously. This is called with the dquot entry securely locked;
- * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot
- * at the end.
- */
-STATIC void
-xfs_qm_dquot_logitem_push(
-	xfs_dq_logitem_t	*logitem)
-{
-	xfs_dquot_t	*dqp;
-
-	dqp = logitem->qli_dquot;
-
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
-
-	/*
-	 * Since we were able to lock the dquot's flush lock and
-	 * we found it on the AIL, the dquot must be dirty.  This
-	 * is because the dquot is removed from the AIL while still
-	 * holding the flush lock in xfs_dqflush_done().  Thus, if
-	 * we found it in the AIL and were able to obtain the flush
-	 * lock without sleeping, then there must not have been
-	 * anyone in the process of flushing the dquot.
-	 */
-	xfs_qm_dqflush(dqp, XFS_B_DELWRI);
-	xfs_dqunlock(dqp);
-}
-
-/*ARGSUSED*/
-STATIC xfs_lsn_t
-xfs_qm_dquot_logitem_committed(
-	xfs_dq_logitem_t	*l,
-	xfs_lsn_t		lsn)
-{
-	/*
-	 * We always re-log the entire dquot when it becomes dirty,
-	 * so, the latest copy _is_ the only one that matters.
-	 */
-	return (lsn);
-}
-
-
-/*
- * This is called to wait for the given dquot to be unpinned.
- * Most of these pin/unpin routines are plagiarized from inode code.
- */
-void
-xfs_qm_dqunpin_wait(
-	xfs_dquot_t	*dqp)
-{
-	SPLDECL(s);
-
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	if (dqp->q_pincount == 0) {
-		return;
-	}
-
-	/*
-	 * Give the log a push so we don't wait here too long.
-	 */
-	xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
-	s = XFS_DQ_PINLOCK(dqp);
-	if (dqp->q_pincount == 0) {
-		XFS_DQ_PINUNLOCK(dqp, s);
-		return;
-	}
-	sv_wait(&(dqp->q_pinwait), PINOD,
-		&(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s);
-}
-
-/*
- * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that
- * the dquot is locked by us, but the flush lock isn't. So, here we are
- * going to see if the relevant dquot buffer is incore, waiting on DELWRI.
- * If so, we want to push it out to help us take this item off the AIL as soon
- * as possible.
- *
- * We must not be holding the AIL_LOCK at this point. Calling incore() to
- * search the buffercache can be a time consuming thing, and AIL_LOCK is a
- * spinlock.
- */
-STATIC void
-xfs_qm_dquot_logitem_pushbuf(
-	xfs_dq_logitem_t    *qip)
-{
-	xfs_dquot_t	*dqp;
-	xfs_mount_t	*mp;
-	xfs_buf_t	*bp;
-	uint		dopush;
-
-	dqp = qip->qli_dquot;
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
-	/*
-	 * The qli_pushbuf_flag keeps others from
-	 * trying to duplicate our effort.
-	 */
-	ASSERT(qip->qli_pushbuf_flag != 0);
-	ASSERT(qip->qli_push_owner == current_pid());
-
-	/*
-	 * If flushlock isn't locked anymore, chances are that the
-	 * inode flush completed and the inode was taken off the AIL.
-	 * So, just get out.
-	 */
-	if ((valusema(&(dqp->q_flock)) > 0)  ||
-	    ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
-		qip->qli_pushbuf_flag = 0;
-		xfs_dqunlock(dqp);
-		return;
-	}
-	mp = dqp->q_mount;
-	bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
-		    XFS_QI_DQCHUNKLEN(mp),
-		    XFS_INCORE_TRYLOCK);
-	if (bp != NULL) {
-		if (XFS_BUF_ISDELAYWRITE(bp)) {
-			dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
-				  (valusema(&(dqp->q_flock)) <= 0));
-			qip->qli_pushbuf_flag = 0;
-			xfs_dqunlock(dqp);
-
-			if (XFS_BUF_ISPINNED(bp)) {
-				xfs_log_force(mp, (xfs_lsn_t)0,
-					      XFS_LOG_FORCE);
-			}
-			if (dopush) {
-#ifdef XFSRACEDEBUG
-				delay_for_intr();
-				delay(300);
-#endif
-				xfs_bawrite(mp, bp);
-			} else {
-				xfs_buf_relse(bp);
-			}
-		} else {
-			qip->qli_pushbuf_flag = 0;
-			xfs_dqunlock(dqp);
-			xfs_buf_relse(bp);
-		}
-		return;
-	}
-
-	qip->qli_pushbuf_flag = 0;
-	xfs_dqunlock(dqp);
-}
-
-/*
- * This is called to attempt to lock the dquot associated with this
- * dquot log item.  Don't sleep on the dquot lock or the flush lock.
- * If the flush lock is already held, indicating that the dquot has
- * been or is in the process of being flushed, then see if we can
- * find the dquot's buffer in the buffer cache without sleeping.  If
- * we can and it is marked delayed write, then we want to send it out.
- * We delay doing so until the push routine, though, to avoid sleeping
- * in any device strategy routines.
- */
-STATIC uint
-xfs_qm_dquot_logitem_trylock(
-	xfs_dq_logitem_t	*qip)
-{
-	xfs_dquot_t		*dqp;
-	uint			retval;
-
-	dqp = qip->qli_dquot;
-	if (dqp->q_pincount > 0)
-		return (XFS_ITEM_PINNED);
-
-	if (! xfs_qm_dqlock_nowait(dqp))
-		return (XFS_ITEM_LOCKED);
-
-	retval = XFS_ITEM_SUCCESS;
-	if (! xfs_qm_dqflock_nowait(dqp)) {
-		/*
-		 * The dquot is already being flushed.	It may have been
-		 * flushed delayed write, however, and we don't want to
-		 * get stuck waiting for that to complete.  So, we want to check
-		 * to see if we can lock the dquot's buffer without sleeping.
-		 * If we can and it is marked for delayed write, then we
-		 * hold it and send it out from the push routine.  We don't
-		 * want to do that now since we might sleep in the device
-		 * strategy routine.  We also don't want to grab the buffer lock
-		 * here because we'd like not to call into the buffer cache
-		 * while holding the AIL_LOCK.
-		 * Make sure to only return PUSHBUF if we set pushbuf_flag
-		 * ourselves.  If someone else is doing it then we don't
-		 * want to go to the push routine and duplicate their efforts.
-		 */
-		if (qip->qli_pushbuf_flag == 0) {
-			qip->qli_pushbuf_flag = 1;
-			ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
-#ifdef DEBUG
-			qip->qli_push_owner = current_pid();
-#endif
-			/*
-			 * The dquot is left locked.
-			 */
-			retval = XFS_ITEM_PUSHBUF;
-		} else {
-			retval = XFS_ITEM_FLUSHING;
-			xfs_dqunlock_nonotify(dqp);
-		}
-	}
-
-	ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
-	return (retval);
-}
-
-
-/*
- * Unlock the dquot associated with the log item.
- * Clear the fields of the dquot and dquot log item that
- * are specific to the current transaction.  If the
- * hold flags is set, do not unlock the dquot.
- */
-STATIC void
-xfs_qm_dquot_logitem_unlock(
-	xfs_dq_logitem_t    *ql)
-{
-	xfs_dquot_t	*dqp;
-
-	ASSERT(ql != NULL);
-	dqp = ql->qli_dquot;
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
-	/*
-	 * Clear the transaction pointer in the dquot
-	 */
-	dqp->q_transp = NULL;
-
-	/*
-	 * dquots are never 'held' from getting unlocked at the end of
-	 * a transaction.  Their locking and unlocking is hidden inside the
-	 * transaction layer, within trans_commit. Hence, no LI_HOLD flag
-	 * for the logitem.
-	 */
-	xfs_dqunlock(dqp);
-}
-
-
-/*
- * The transaction with the dquot locked has aborted.  The dquot
- * must not be dirty within the transaction.  We simply unlock just
- * as if the transaction had been cancelled.
- */
-STATIC void
-xfs_qm_dquot_logitem_abort(
-	xfs_dq_logitem_t    *ql)
-{
-	xfs_qm_dquot_logitem_unlock(ql);
-}
-
-/*
- * this needs to stamp an lsn into the dquot, I think.
- * rpc's that look at user dquot's would then have to
- * push on the dependency recorded in the dquot
- */
-/* ARGSUSED */
-STATIC void
-xfs_qm_dquot_logitem_committing(
-	xfs_dq_logitem_t	*l,
-	xfs_lsn_t		lsn)
-{
-	return;
-}
-
-
-/*
- * This is the ops vector for dquots
- */
-STATIC struct xfs_item_ops xfs_dquot_item_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_qm_dquot_logitem_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))
-					xfs_qm_dquot_logitem_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
-					xfs_qm_dquot_logitem_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))
-					xfs_qm_dquot_logitem_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_qm_dquot_logitem_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
-	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_abort,
-	.iop_pushbuf	= (void(*)(xfs_log_item_t*))
-					xfs_qm_dquot_logitem_pushbuf,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_qm_dquot_logitem_committing
-};
-
-/*
- * Initialize the dquot log item for a newly allocated dquot.
- * The dquot isn't locked at this point, but it isn't on any of the lists
- * either, so we don't care.
- */
-void
-xfs_qm_dquot_logitem_init(
-	struct xfs_dquot *dqp)
-{
-	xfs_dq_logitem_t  *lp;
-	lp = &dqp->q_logitem;
-
-	lp->qli_item.li_type = XFS_LI_DQUOT;
-	lp->qli_item.li_ops = &xfs_dquot_item_ops;
-	lp->qli_item.li_mountp = dqp->q_mount;
-	lp->qli_dquot = dqp;
-	lp->qli_format.qlf_type = XFS_LI_DQUOT;
-	lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
-	lp->qli_format.qlf_blkno = dqp->q_blkno;
-	lp->qli_format.qlf_len = 1;
-	/*
-	 * This is just the offset of this dquot within its buffer
-	 * (which is currently 1 FSB and probably won't change).
-	 * Hence 32 bits for this offset should be just fine.
-	 * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
-	 * here, and recompute it at recovery time.
-	 */
-	lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
-}
-
-/*------------------  QUOTAOFF LOG ITEMS  -------------------*/
-
-/*
- * This returns the number of iovecs needed to log the given quotaoff item.
- * We only need 1 iovec for an quotaoff item.  It just logs the
- * quotaoff_log_format structure.
- */
-/*ARGSUSED*/
-STATIC uint
-xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
-{
-	return (1);
-}
-
-/*
- * This is called to fill in the vector of log iovecs for the
- * given quotaoff log item. We use only 1 iovec, and we point that
- * at the quotaoff_log_format structure embedded in the quotaoff item.
- * It is at this point that we assert that all of the extent
- * slots in the quotaoff item have been filled.
- */
-STATIC void
-xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t	*qf,
-			   xfs_log_iovec_t	*log_vector)
-{
-	ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF);
-
-	log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
-	log_vector->i_len = sizeof(xfs_qoff_logitem_t);
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_QUOTAOFF);
-	qf->qql_format.qf_size = 1;
-}
-
-
-/*
- * Pinning has no meaning for an quotaoff item, so just return.
- */
-/*ARGSUSED*/
-STATIC void
-xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
-{
-	return;
-}
-
-
-/*
- * Since pinning has no meaning for an quotaoff item, unpinning does
- * not either.
- */
-/*ARGSUSED*/
-STATIC void
-xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale)
-{
-	return;
-}
-
-/*ARGSUSED*/
-STATIC void
-xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp)
-{
-	return;
-}
-
-/*
- * Quotaoff items have no locking, so just return success.
- */
-/*ARGSUSED*/
-STATIC uint
-xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
-{
-	return XFS_ITEM_LOCKED;
-}
-
-/*
- * Quotaoff items have no locking or pushing, so return failure
- * so that the caller doesn't bother with us.
- */
-/*ARGSUSED*/
-STATIC void
-xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf)
-{
-	return;
-}
-
-/*
- * The quotaoff-start-item is logged only once and cannot be moved in the log,
- * so simply return the lsn at which it's been logged.
- */
-/*ARGSUSED*/
-STATIC xfs_lsn_t
-xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn)
-{
-	return (lsn);
-}
-
-/*
- * The transaction of which this QUOTAOFF is a part has been aborted.
- * Just clean up after ourselves.
- * Shouldn't this never happen in the case of qoffend logitems? XXX
- */
-STATIC void
-xfs_qm_qoff_logitem_abort(xfs_qoff_logitem_t *qf)
-{
-	kmem_free(qf, sizeof(xfs_qoff_logitem_t));
-}
-
-/*
- * There isn't much you can do to push on an quotaoff item.  It is simply
- * stuck waiting for the log to be flushed to disk.
- */
-/*ARGSUSED*/
-STATIC void
-xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf)
-{
-	return;
-}
-
-
-/*ARGSUSED*/
-STATIC xfs_lsn_t
-xfs_qm_qoffend_logitem_committed(
-	xfs_qoff_logitem_t *qfe,
-	xfs_lsn_t lsn)
-{
-	xfs_qoff_logitem_t	*qfs;
-	SPLDECL(s);
-
-	qfs = qfe->qql_start_lip;
-	AIL_LOCK(qfs->qql_item.li_mountp,s);
-	/*
-	 * Delete the qoff-start logitem from the AIL.
-	 * xfs_trans_delete_ail() drops the AIL lock.
-	 */
-	xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs, s);
-	kmem_free(qfs, sizeof(xfs_qoff_logitem_t));
-	kmem_free(qfe, sizeof(xfs_qoff_logitem_t));
-	return (xfs_lsn_t)-1;
-}
-
-/*
- * XXX rcc - don't know quite what to do with this.  I think we can
- * just ignore it.  The only time that isn't the case is if we allow
- * the client to somehow see that quotas have been turned off in which
- * we can't allow that to get back until the quotaoff hits the disk.
- * So how would that happen?  Also, do we need different routines for
- * quotaoff start and quotaoff end?  I suspect the answer is yes but
- * to be sure, I need to look at the recovery code and see how quota off
- * recovery is handled (do we roll forward or back or do something else).
- * If we roll forwards or backwards, then we need two separate routines,
- * one that does nothing and one that stamps in the lsn that matters
- * (truly makes the quotaoff irrevocable).  If we do something else,
- * then maybe we don't need two.
- */
-/* ARGSUSED */
-STATIC void
-xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
-{
-	return;
-}
-
-/* ARGSUSED */
-STATIC void
-xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
-{
-	return;
-}
-
-STATIC struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_qm_qoff_logitem_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t* ,int))
-					xfs_qm_qoff_logitem_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
-					xfs_qm_qoff_logitem_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_qm_qoffend_logitem_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
-	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
-	.iop_pushbuf	= NULL,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_qm_qoffend_logitem_committing
-};
-
-/*
- * This is the ops vector shared by all quotaoff-start log items.
- */
-STATIC struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
-	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
-	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
-					xfs_qm_qoff_logitem_format,
-	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
-	.iop_unpin	= (void(*)(xfs_log_item_t*, int))
-					xfs_qm_qoff_logitem_unpin,
-	.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
-					xfs_qm_qoff_logitem_unpin_remove,
-	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
-	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
-	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_qm_qoff_logitem_committed,
-	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
-	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
-	.iop_pushbuf	= NULL,
-	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
-					xfs_qm_qoff_logitem_committing
-};
-
-/*
- * Allocate and initialize an quotaoff item of the correct quota type(s).
- */
-xfs_qoff_logitem_t *
-xfs_qm_qoff_logitem_init(
-	struct xfs_mount *mp,
-	xfs_qoff_logitem_t *start,
-	uint flags)
-{
-	xfs_qoff_logitem_t	*qf;
-
-	qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
-
-	qf->qql_item.li_type = XFS_LI_QUOTAOFF;
-	if (start)
-		qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
-	else
-		qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
-	qf->qql_item.li_mountp = mp;
-	qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
-	qf->qql_format.qf_flags = flags;
-	qf->qql_start_lip = start;
-	return (qf);
-}
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
deleted file mode 100644
index 53a00fb217f..00000000000
--- a/fs/xfs/quota/xfs_qm.c
+++ /dev/null
@@ -1,2859 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_clnt.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_bmap.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_trans_space.h"
-#include "xfs_utils.h"
-#include "xfs_qm.h"
-
-/*
- * The global quota manager. There is only one of these for the entire
- * system, _not_ one per file system. XQM keeps track of the overall
- * quota functionality, including maintaining the freelist and hash
- * tables of dquots.
- */
-mutex_t		xfs_Gqm_lock;
-struct xfs_qm	*xfs_Gqm;
-uint		ndquot;
-
-kmem_zone_t	*qm_dqzone;
-kmem_zone_t	*qm_dqtrxzone;
-STATIC kmem_shaker_t	xfs_qm_shaker;
-
-STATIC void	xfs_qm_list_init(xfs_dqlist_t *, char *, int);
-STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
-
-STATIC void	xfs_qm_freelist_init(xfs_frlist_t *);
-STATIC void	xfs_qm_freelist_destroy(xfs_frlist_t *);
-STATIC int	xfs_qm_mplist_nowait(xfs_mount_t *);
-STATIC int	xfs_qm_dqhashlock_nowait(xfs_dquot_t *);
-
-STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
-STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int	xfs_qm_shake(int, gfp_t);
-
-#ifdef DEBUG
-extern mutex_t	qcheck_lock;
-#endif
-
-#ifdef QUOTADEBUG
-#define XQM_LIST_PRINT(l, NXT, title) \
-{ \
-	xfs_dquot_t	*dqp; int i = 0; \
-	cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \
-	for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \
-		cmn_err(CE_DEBUG, "   %d.  \"%d (%s)\"   " \
-				  "bcnt = %d, icnt = %d, refs = %d", \
-			++i, (int) be32_to_cpu(dqp->q_core.d_id), \
-			DQFLAGTO_TYPESTR(dqp),	     \
-			(int) be64_to_cpu(dqp->q_core.d_bcount), \
-			(int) be64_to_cpu(dqp->q_core.d_icount), \
-			(int) dqp->q_nrefs);  } \
-}
-#else
-#define XQM_LIST_PRINT(l, NXT, title) do { } while (0)
-#endif
-
-/*
- * Initialize the XQM structure.
- * Note that there is not one quota manager per file system.
- */
-STATIC struct xfs_qm *
-xfs_Gqm_init(void)
-{
-	xfs_dqhash_t	*udqhash, *gdqhash;
-	xfs_qm_t	*xqm;
-	uint		i, hsize, flags = KM_SLEEP | KM_MAYFAIL;
-
-	/*
-	 * Initialize the dquot hash tables.
-	 */
-	hsize = XFS_QM_HASHSIZE_HIGH;
-	while (!(udqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), flags))) {
-		if ((hsize >>= 1) <= XFS_QM_HASHSIZE_LOW)
-			flags = KM_SLEEP;
-	}
-	gdqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), KM_SLEEP);
-	ndquot = hsize << 8;
-
-	xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
-	xqm->qm_dqhashmask = hsize - 1;
-	xqm->qm_usr_dqhtable = udqhash;
-	xqm->qm_grp_dqhtable = gdqhash;
-	ASSERT(xqm->qm_usr_dqhtable != NULL);
-	ASSERT(xqm->qm_grp_dqhtable != NULL);
-
-	for (i = 0; i < hsize; i++) {
-		xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i);
-		xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i);
-	}
-
-	/*
-	 * Freelist of all dquots of all file systems
-	 */
-	xfs_qm_freelist_init(&(xqm->qm_dqfreelist));
-
-	/*
-	 * dquot zone. we register our own low-memory callback.
-	 */
-	if (!qm_dqzone) {
-		xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t),
-						"xfs_dquots");
-		qm_dqzone = xqm->qm_dqzone;
-	} else
-		xqm->qm_dqzone = qm_dqzone;
-
-	xfs_qm_shaker = kmem_shake_register(xfs_qm_shake);
-
-	/*
-	 * The t_dqinfo portion of transactions.
-	 */
-	if (!qm_dqtrxzone) {
-		xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t),
-						   "xfs_dqtrx");
-		qm_dqtrxzone = xqm->qm_dqtrxzone;
-	} else
-		xqm->qm_dqtrxzone = qm_dqtrxzone;
-
-	atomic_set(&xqm->qm_totaldquots, 0);
-	xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
-	xqm->qm_nrefs = 0;
-#ifdef DEBUG
-	mutex_init(&qcheck_lock);
-#endif
-	return xqm;
-}
-
-/*
- * Destroy the global quota manager when its reference count goes to zero.
- */
-STATIC void
-xfs_qm_destroy(
-	struct xfs_qm	*xqm)
-{
-	int		hsize, i;
-
-	ASSERT(xqm != NULL);
-	ASSERT(xqm->qm_nrefs == 0);
-	kmem_shake_deregister(xfs_qm_shaker);
-	hsize = xqm->qm_dqhashmask + 1;
-	for (i = 0; i < hsize; i++) {
-		xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
-		xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
-	}
-	kmem_free(xqm->qm_usr_dqhtable, hsize * sizeof(xfs_dqhash_t));
-	kmem_free(xqm->qm_grp_dqhtable, hsize * sizeof(xfs_dqhash_t));
-	xqm->qm_usr_dqhtable = NULL;
-	xqm->qm_grp_dqhtable = NULL;
-	xqm->qm_dqhashmask = 0;
-	xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist));
-#ifdef DEBUG
-	mutex_destroy(&qcheck_lock);
-#endif
-	kmem_free(xqm, sizeof(xfs_qm_t));
-}
-
-/*
- * Called at mount time to let XQM know that another file system is
- * starting quotas. This isn't crucial information as the individual mount
- * structures are pretty independent, but it helps the XQM keep a
- * global view of what's going on.
- */
-/* ARGSUSED */
-STATIC int
-xfs_qm_hold_quotafs_ref(
-	struct xfs_mount *mp)
-{
-	/*
-	 * Need to lock the xfs_Gqm structure for things like this. For example,
-	 * the structure could disappear between the entry to this routine and
-	 * a HOLD operation if not locked.
-	 */
-	XFS_QM_LOCK(xfs_Gqm);
-
-	if (xfs_Gqm == NULL)
-		xfs_Gqm = xfs_Gqm_init();
-	/*
-	 * We can keep a list of all filesystems with quotas mounted for
-	 * debugging and statistical purposes, but ...
-	 * Just take a reference and get out.
-	 */
-	XFS_QM_HOLD(xfs_Gqm);
-	XFS_QM_UNLOCK(xfs_Gqm);
-
-	return 0;
-}
-
-
-/*
- * Release the reference that a filesystem took at mount time,
- * so that we know when we need to destroy the entire quota manager.
- */
-/* ARGSUSED */
-STATIC void
-xfs_qm_rele_quotafs_ref(
-	struct xfs_mount *mp)
-{
-	xfs_dquot_t	*dqp, *nextdqp;
-
-	ASSERT(xfs_Gqm);
-	ASSERT(xfs_Gqm->qm_nrefs > 0);
-
-	/*
-	 * Go thru the freelist and destroy all inactive dquots.
-	 */
-	xfs_qm_freelist_lock(xfs_Gqm);
-
-	for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
-	     dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
-		xfs_dqlock(dqp);
-		nextdqp = dqp->dq_flnext;
-		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
-			ASSERT(dqp->q_mount == NULL);
-			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
-			ASSERT(dqp->HL_PREVP == NULL);
-			ASSERT(dqp->MPL_PREVP == NULL);
-			XQM_FREELIST_REMOVE(dqp);
-			xfs_dqunlock(dqp);
-			xfs_qm_dqdestroy(dqp);
-		} else {
-			xfs_dqunlock(dqp);
-		}
-		dqp = nextdqp;
-	}
-	xfs_qm_freelist_unlock(xfs_Gqm);
-
-	/*
-	 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
-	 * be restarted.
-	 */
-	XFS_QM_LOCK(xfs_Gqm);
-	XFS_QM_RELE(xfs_Gqm);
-	if (xfs_Gqm->qm_nrefs == 0) {
-		xfs_qm_destroy(xfs_Gqm);
-		xfs_Gqm = NULL;
-	}
-	XFS_QM_UNLOCK(xfs_Gqm);
-}
-
-/*
- * This is called at mount time from xfs_mountfs to initialize the quotainfo
- * structure and start the global quotamanager (xfs_Gqm) if it hasn't done
- * so already.	Note that the superblock has not been read in yet.
- */
-void
-xfs_qm_mount_quotainit(
-	xfs_mount_t	*mp,
-	uint		flags)
-{
-	/*
-	 * User, projects or group quotas has to be on.
-	 */
-	ASSERT(flags & (XFSMNT_UQUOTA | XFSMNT_PQUOTA | XFSMNT_GQUOTA));
-
-	/*
-	 * Initialize the flags in the mount structure. From this point
-	 * onwards we look at m_qflags to figure out if quotas's ON/OFF, etc.
-	 * Note that we enforce nothing if accounting is off.
-	 * ie.	XFSMNT_*QUOTA must be ON for XFSMNT_*QUOTAENF.
-	 * It isn't necessary to take the quotaoff lock to do this; this is
-	 * called from mount.
-	 */
-	if (flags & XFSMNT_UQUOTA) {
-		mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
-		if (flags & XFSMNT_UQUOTAENF)
-			mp->m_qflags |= XFS_UQUOTA_ENFD;
-	}
-	if (flags & XFSMNT_GQUOTA) {
-		mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
-		if (flags & XFSMNT_GQUOTAENF)
-			mp->m_qflags |= XFS_OQUOTA_ENFD;
-	} else if (flags & XFSMNT_PQUOTA) {
-		mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
-		if (flags & XFSMNT_PQUOTAENF)
-			mp->m_qflags |= XFS_OQUOTA_ENFD;
-	}
-}
-
-/*
- * Just destroy the quotainfo structure.
- */
-void
-xfs_qm_unmount_quotadestroy(
-	xfs_mount_t	*mp)
-{
-	if (mp->m_quotainfo)
-		xfs_qm_destroy_quotainfo(mp);
-}
-
-
-/*
- * This is called from xfs_mountfs to start quotas and initialize all
- * necessary data structures like quotainfo.  This is also responsible for
- * running a quotacheck as necessary.  We are guaranteed that the superblock
- * is consistently read in at this point.
- */
-int
-xfs_qm_mount_quotas(
-	xfs_mount_t	*mp,
-	int		mfsi_flags)
-{
-	unsigned long	s;
-	int		error = 0;
-	uint		sbf;
-
-
-	/*
-	 * If quotas on realtime volumes is not supported, we disable
-	 * quotas immediately.
-	 */
-	if (mp->m_sb.sb_rextents) {
-		cmn_err(CE_NOTE,
-			"Cannot turn on quotas for realtime filesystem %s",
-			mp->m_fsname);
-		mp->m_qflags = 0;
-		goto write_changes;
-	}
-
-	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-	/*
-	 * Allocate the quotainfo structure inside the mount struct, and
-	 * create quotainode(s), and change/rev superblock if necessary.
-	 */
-	if ((error = xfs_qm_init_quotainfo(mp))) {
-		/*
-		 * We must turn off quotas.
-		 */
-		ASSERT(mp->m_quotainfo == NULL);
-		mp->m_qflags = 0;
-		goto write_changes;
-	}
-	/*
-	 * If any of the quotas are not consistent, do a quotacheck.
-	 */
-	if (XFS_QM_NEED_QUOTACHECK(mp) &&
-		!(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
-		if ((error = xfs_qm_quotacheck(mp))) {
-			/* Quotacheck has failed and quotas have
-			 * been disabled.
-			 */
-			return XFS_ERROR(error);
-		}
-	}
-
- write_changes:
-	/*
-	 * We actually don't have to acquire the SB_LOCK at all.
-	 * This can only be called from mount, and that's single threaded. XXX
-	 */
-	s = XFS_SB_LOCK(mp);
-	sbf = mp->m_sb.sb_qflags;
-	mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
-	XFS_SB_UNLOCK(mp, s);
-
-	if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
-		if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
-			/*
-			 * We could only have been turning quotas off.
-			 * We aren't in very good shape actually because
-			 * the incore structures are convinced that quotas are
-			 * off, but the on disk superblock doesn't know that !
-			 */
-			ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
-			xfs_fs_cmn_err(CE_ALERT, mp,
-				"XFS mount_quotas: Superblock update failed!");
-		}
-	}
-
-	if (error) {
-		xfs_fs_cmn_err(CE_WARN, mp,
-			"Failed to initialize disk quotas.");
-	}
-	return XFS_ERROR(error);
-}
-
-/*
- * Called from the vfsops layer.
- */
-int
-xfs_qm_unmount_quotas(
-	xfs_mount_t	*mp)
-{
-	xfs_inode_t	*uqp, *gqp;
-	int		error = 0;
-
-	/*
-	 * Release the dquots that root inode, et al might be holding,
-	 * before we flush quotas and blow away the quotainfo structure.
-	 */
-	ASSERT(mp->m_rootip);
-	xfs_qm_dqdetach(mp->m_rootip);
-	if (mp->m_rbmip)
-		xfs_qm_dqdetach(mp->m_rbmip);
-	if (mp->m_rsumip)
-		xfs_qm_dqdetach(mp->m_rsumip);
-
-	/*
-	 * Flush out the quota inodes.
-	 */
-	uqp = gqp = NULL;
-	if (mp->m_quotainfo) {
-		if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) {
-			xfs_ilock(uqp, XFS_ILOCK_EXCL);
-			xfs_iflock(uqp);
-			error = xfs_iflush(uqp, XFS_IFLUSH_SYNC);
-			xfs_iunlock(uqp, XFS_ILOCK_EXCL);
-			if (unlikely(error == EFSCORRUPTED)) {
-				XFS_ERROR_REPORT("xfs_qm_unmount_quotas(1)",
-						 XFS_ERRLEVEL_LOW, mp);
-				goto out;
-			}
-		}
-		if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) {
-			xfs_ilock(gqp, XFS_ILOCK_EXCL);
-			xfs_iflock(gqp);
-			error = xfs_iflush(gqp, XFS_IFLUSH_SYNC);
-			xfs_iunlock(gqp, XFS_ILOCK_EXCL);
-			if (unlikely(error == EFSCORRUPTED)) {
-				XFS_ERROR_REPORT("xfs_qm_unmount_quotas(2)",
-						 XFS_ERRLEVEL_LOW, mp);
-				goto out;
-			}
-		}
-	}
-	if (uqp) {
-		 XFS_PURGE_INODE(uqp);
-		 mp->m_quotainfo->qi_uquotaip = NULL;
-	}
-	if (gqp) {
-		XFS_PURGE_INODE(gqp);
-		mp->m_quotainfo->qi_gquotaip = NULL;
-	}
-out:
-	return XFS_ERROR(error);
-}
-
-/*
- * Flush all dquots of the given file system to disk. The dquots are
- * _not_ purged from memory here, just their data written to disk.
- */
-STATIC int
-xfs_qm_dqflush_all(
-	xfs_mount_t	*mp,
-	int		flags)
-{
-	int		recl;
-	xfs_dquot_t	*dqp;
-	int		niters;
-	int		error;
-
-	if (mp->m_quotainfo == NULL)
-		return 0;
-	niters = 0;
-again:
-	xfs_qm_mplist_lock(mp);
-	FOREACH_DQUOT_IN_MP(dqp, mp) {
-		xfs_dqlock(dqp);
-		if (! XFS_DQ_IS_DIRTY(dqp)) {
-			xfs_dqunlock(dqp);
-			continue;
-		}
-		xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY");
-		/* XXX a sentinel would be better */
-		recl = XFS_QI_MPLRECLAIMS(mp);
-		if (! xfs_qm_dqflock_nowait(dqp)) {
-			/*
-			 * If we can't grab the flush lock then check
-			 * to see if the dquot has been flushed delayed
-			 * write.  If so, grab its buffer and send it
-			 * out immediately.  We'll be able to acquire
-			 * the flush lock when the I/O completes.
-			 */
-			xfs_qm_dqflock_pushbuf_wait(dqp);
-		}
-		/*
-		 * Let go of the mplist lock. We don't want to hold it
-		 * across a disk write.
-		 */
-		xfs_qm_mplist_unlock(mp);
-		error = xfs_qm_dqflush(dqp, flags);
-		xfs_dqunlock(dqp);
-		if (error)
-			return error;
-
-		xfs_qm_mplist_lock(mp);
-		if (recl != XFS_QI_MPLRECLAIMS(mp)) {
-			xfs_qm_mplist_unlock(mp);
-			/* XXX restart limit */
-			goto again;
-		}
-	}
-
-	xfs_qm_mplist_unlock(mp);
-	/* return ! busy */
-	return 0;
-}
-/*
- * Release the group dquot pointers the user dquots may be
- * carrying around as a hint. mplist is locked on entry and exit.
- */
-STATIC void
-xfs_qm_detach_gdquots(
-	xfs_mount_t	*mp)
-{
-	xfs_dquot_t	*dqp, *gdqp;
-	int		nrecl;
-
- again:
-	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
-	dqp = XFS_QI_MPLNEXT(mp);
-	while (dqp) {
-		xfs_dqlock(dqp);
-		if ((gdqp = dqp->q_gdquot)) {
-			xfs_dqlock(gdqp);
-			dqp->q_gdquot = NULL;
-		}
-		xfs_dqunlock(dqp);
-
-		if (gdqp) {
-			/*
-			 * Can't hold the mplist lock across a dqput.
-			 * XXXmust convert to marker based iterations here.
-			 */
-			nrecl = XFS_QI_MPLRECLAIMS(mp);
-			xfs_qm_mplist_unlock(mp);
-			xfs_qm_dqput(gdqp);
-
-			xfs_qm_mplist_lock(mp);
-			if (nrecl != XFS_QI_MPLRECLAIMS(mp))
-				goto again;
-		}
-		dqp = dqp->MPL_NEXT;
-	}
-}
-
-/*
- * Go through all the incore dquots of this file system and take them
- * off the mplist and hashlist, if the dquot type matches the dqtype
- * parameter. This is used when turning off quota accounting for
- * users and/or groups, as well as when the filesystem is unmounting.
- */
-STATIC int
-xfs_qm_dqpurge_int(
-	xfs_mount_t	*mp,
-	uint		flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */
-{
-	xfs_dquot_t	*dqp;
-	uint		dqtype;
-	int		nrecl;
-	xfs_dquot_t	*nextdqp;
-	int		nmisses;
-
-	if (mp->m_quotainfo == NULL)
-		return 0;
-
-	dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
-	dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
-	dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
-
-	xfs_qm_mplist_lock(mp);
-
-	/*
-	 * In the first pass through all incore dquots of this filesystem,
-	 * we release the group dquot pointers the user dquots may be
-	 * carrying around as a hint. We need to do this irrespective of
-	 * what's being turned off.
-	 */
-	xfs_qm_detach_gdquots(mp);
-
-      again:
-	nmisses = 0;
-	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
-	/*
-	 * Try to get rid of all of the unwanted dquots. The idea is to
-	 * get them off mplist and hashlist, but leave them on freelist.
-	 */
-	dqp = XFS_QI_MPLNEXT(mp);
-	while (dqp) {
-		/*
-		 * It's OK to look at the type without taking dqlock here.
-		 * We're holding the mplist lock here, and that's needed for
-		 * a dqreclaim.
-		 */
-		if ((dqp->dq_flags & dqtype) == 0) {
-			dqp = dqp->MPL_NEXT;
-			continue;
-		}
-
-		if (! xfs_qm_dqhashlock_nowait(dqp)) {
-			nrecl = XFS_QI_MPLRECLAIMS(mp);
-			xfs_qm_mplist_unlock(mp);
-			XFS_DQ_HASH_LOCK(dqp->q_hash);
-			xfs_qm_mplist_lock(mp);
-
-			/*
-			 * XXXTheoretically, we can get into a very long
-			 * ping pong game here.
-			 * No one can be adding dquots to the mplist at
-			 * this point, but somebody might be taking things off.
-			 */
-			if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
-				XFS_DQ_HASH_UNLOCK(dqp->q_hash);
-				goto again;
-			}
-		}
-
-		/*
-		 * Take the dquot off the mplist and hashlist. It may remain on
-		 * freelist in INACTIVE state.
-		 */
-		nextdqp = dqp->MPL_NEXT;
-		nmisses += xfs_qm_dqpurge(dqp, flags);
-		dqp = nextdqp;
-	}
-	xfs_qm_mplist_unlock(mp);
-	return nmisses;
-}
-
-int
-xfs_qm_dqpurge_all(
-	xfs_mount_t	*mp,
-	uint		flags)
-{
-	int		ndquots;
-
-	/*
-	 * Purge the dquot cache.
-	 * None of the dquots should really be busy at this point.
-	 */
-	if (mp->m_quotainfo) {
-		while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) {
-			delay(ndquots * 10);
-		}
-	}
-	return 0;
-}
-
-STATIC int
-xfs_qm_dqattach_one(
-	xfs_inode_t	*ip,
-	xfs_dqid_t	id,
-	uint		type,
-	uint		doalloc,
-	uint		dolock,
-	xfs_dquot_t	*udqhint, /* hint */
-	xfs_dquot_t	**IO_idqpp)
-{
-	xfs_dquot_t	*dqp;
-	int		error;
-
-	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
-	error = 0;
-	/*
-	 * See if we already have it in the inode itself. IO_idqpp is
-	 * &i_udquot or &i_gdquot. This made the code look weird, but
-	 * made the logic a lot simpler.
-	 */
-	if ((dqp = *IO_idqpp)) {
-		if (dolock)
-			xfs_dqlock(dqp);
-		xfs_dqtrace_entry(dqp, "DQATTACH: found in ip");
-		goto done;
-	}
-
-	/*
-	 * udqhint is the i_udquot field in inode, and is non-NULL only
-	 * when the type arg is group/project. Its purpose is to save a
-	 * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
-	 * the user dquot.
-	 */
-	ASSERT(!udqhint || type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
-	if (udqhint && !dolock)
-		xfs_dqlock(udqhint);
-
-	/*
-	 * No need to take dqlock to look at the id.
-	 * The ID can't change until it gets reclaimed, and it won't
-	 * be reclaimed as long as we have a ref from inode and we hold
-	 * the ilock.
-	 */
-	if (udqhint &&
-	    (dqp = udqhint->q_gdquot) &&
-	    (be32_to_cpu(dqp->q_core.d_id) == id)) {
-		ASSERT(XFS_DQ_IS_LOCKED(udqhint));
-		xfs_dqlock(dqp);
-		XFS_DQHOLD(dqp);
-		ASSERT(*IO_idqpp == NULL);
-		*IO_idqpp = dqp;
-		if (!dolock) {
-			xfs_dqunlock(dqp);
-			xfs_dqunlock(udqhint);
-		}
-		goto done;
-	}
-	/*
-	 * We can't hold a dquot lock when we call the dqget code.
-	 * We'll deadlock in no time, because of (not conforming to)
-	 * lock ordering - the inodelock comes before any dquot lock,
-	 * and we may drop and reacquire the ilock in xfs_qm_dqget().
-	 */
-	if (udqhint)
-		xfs_dqunlock(udqhint);
-	/*
-	 * Find the dquot from somewhere. This bumps the
-	 * reference count of dquot and returns it locked.
-	 * This can return ENOENT if dquot didn't exist on
-	 * disk and we didn't ask it to allocate;
-	 * ESRCH if quotas got turned off suddenly.
-	 */
-	if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type,
-				 doalloc|XFS_QMOPT_DOWARN, &dqp))) {
-		if (udqhint && dolock)
-			xfs_dqlock(udqhint);
-		goto done;
-	}
-
-	xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget");
-	/*
-	 * dqget may have dropped and re-acquired the ilock, but it guarantees
-	 * that the dquot returned is the one that should go in the inode.
-	 */
-	*IO_idqpp = dqp;
-	ASSERT(dqp);
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	if (! dolock) {
-		xfs_dqunlock(dqp);
-		goto done;
-	}
-	if (! udqhint)
-		goto done;
-
-	ASSERT(udqhint);
-	ASSERT(dolock);
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	if (! xfs_qm_dqlock_nowait(udqhint)) {
-		xfs_dqunlock(dqp);
-		xfs_dqlock(udqhint);
-		xfs_dqlock(dqp);
-	}
-      done:
-#ifdef QUOTADEBUG
-	if (udqhint) {
-		if (dolock)
-			ASSERT(XFS_DQ_IS_LOCKED(udqhint));
-	}
-	if (! error) {
-		if (dolock)
-			ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	}
-#endif
-	return error;
-}
-
-
-/*
- * Given a udquot and gdquot, attach a ptr to the group dquot in the
- * udquot as a hint for future lookups. The idea sounds simple, but the
- * execution isn't, because the udquot might have a group dquot attached
- * already and getting rid of that gets us into lock ordering contraints.
- * The process is complicated more by the fact that the dquots may or may not
- * be locked on entry.
- */
-STATIC void
-xfs_qm_dqattach_grouphint(
-	xfs_dquot_t	*udq,
-	xfs_dquot_t	*gdq,
-	uint		locked)
-{
-	xfs_dquot_t	*tmp;
-
-#ifdef QUOTADEBUG
-	if (locked) {
-		ASSERT(XFS_DQ_IS_LOCKED(udq));
-		ASSERT(XFS_DQ_IS_LOCKED(gdq));
-	}
-#endif
-	if (! locked)
-		xfs_dqlock(udq);
-
-	if ((tmp = udq->q_gdquot)) {
-		if (tmp == gdq) {
-			if (! locked)
-				xfs_dqunlock(udq);
-			return;
-		}
-
-		udq->q_gdquot = NULL;
-		/*
-		 * We can't keep any dqlocks when calling dqrele,
-		 * because the freelist lock comes before dqlocks.
-		 */
-		xfs_dqunlock(udq);
-		if (locked)
-			xfs_dqunlock(gdq);
-		/*
-		 * we took a hard reference once upon a time in dqget,
-		 * so give it back when the udquot no longer points at it
-		 * dqput() does the unlocking of the dquot.
-		 */
-		xfs_qm_dqrele(tmp);
-
-		xfs_dqlock(udq);
-		xfs_dqlock(gdq);
-
-	} else {
-		ASSERT(XFS_DQ_IS_LOCKED(udq));
-		if (! locked) {
-			xfs_dqlock(gdq);
-		}
-	}
-
-	ASSERT(XFS_DQ_IS_LOCKED(udq));
-	ASSERT(XFS_DQ_IS_LOCKED(gdq));
-	/*
-	 * Somebody could have attached a gdquot here,
-	 * when we dropped the uqlock. If so, just do nothing.
-	 */
-	if (udq->q_gdquot == NULL) {
-		XFS_DQHOLD(gdq);
-		udq->q_gdquot = gdq;
-	}
-	if (! locked) {
-		xfs_dqunlock(gdq);
-		xfs_dqunlock(udq);
-	}
-}
-
-
-/*
- * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
- * into account.
- * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
- * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty
- * much made this code a complete mess, but it has been pretty useful.
- * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
- * Inode may get unlocked and relocked in here, and the caller must deal with
- * the consequences.
- */
-int
-xfs_qm_dqattach(
-	xfs_inode_t	*ip,
-	uint		flags)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	uint		nquotas = 0;
-	int		error = 0;
-
-	if ((! XFS_IS_QUOTA_ON(mp)) ||
-	    (! XFS_NOT_DQATTACHED(mp, ip)) ||
-	    (ip->i_ino == mp->m_sb.sb_uquotino) ||
-	    (ip->i_ino == mp->m_sb.sb_gquotino))
-		return 0;
-
-	ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 ||
-	       XFS_ISLOCKED_INODE_EXCL(ip));
-
-	if (! (flags & XFS_QMOPT_ILOCKED))
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	if (XFS_IS_UQUOTA_ON(mp)) {
-		error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
-						flags & XFS_QMOPT_DQALLOC,
-						flags & XFS_QMOPT_DQLOCK,
-						NULL, &ip->i_udquot);
-		if (error)
-			goto done;
-		nquotas++;
-	}
-	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
-	if (XFS_IS_OQUOTA_ON(mp)) {
-		error = XFS_IS_GQUOTA_ON(mp) ?
-			xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
-						flags & XFS_QMOPT_DQALLOC,
-						flags & XFS_QMOPT_DQLOCK,
-						ip->i_udquot, &ip->i_gdquot) :
-			xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
-						flags & XFS_QMOPT_DQALLOC,
-						flags & XFS_QMOPT_DQLOCK,
-						ip->i_udquot, &ip->i_gdquot);
-		/*
-		 * Don't worry about the udquot that we may have
-		 * attached above. It'll get detached, if not already.
-		 */
-		if (error)
-			goto done;
-		nquotas++;
-	}
-
-	/*
-	 * Attach this group quota to the user quota as a hint.
-	 * This WON'T, in general, result in a thrash.
-	 */
-	if (nquotas == 2) {
-		ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
-		ASSERT(ip->i_udquot);
-		ASSERT(ip->i_gdquot);
-
-		/*
-		 * We may or may not have the i_udquot locked at this point,
-		 * but this check is OK since we don't depend on the i_gdquot to
-		 * be accurate 100% all the time. It is just a hint, and this
-		 * will succeed in general.
-		 */
-		if (ip->i_udquot->q_gdquot == ip->i_gdquot)
-			goto done;
-		/*
-		 * Attach i_gdquot to the gdquot hint inside the i_udquot.
-		 */
-		xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot,
-					 flags & XFS_QMOPT_DQLOCK);
-	}
-
-      done:
-
-#ifdef QUOTADEBUG
-	if (! error) {
-		if (ip->i_udquot) {
-			if (flags & XFS_QMOPT_DQLOCK)
-				ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot));
-		}
-		if (ip->i_gdquot) {
-			if (flags & XFS_QMOPT_DQLOCK)
-				ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot));
-		}
-		if (XFS_IS_UQUOTA_ON(mp))
-			ASSERT(ip->i_udquot);
-		if (XFS_IS_OQUOTA_ON(mp))
-			ASSERT(ip->i_gdquot);
-	}
-#endif
-
-	if (! (flags & XFS_QMOPT_ILOCKED))
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-#ifdef QUOTADEBUG
-	else
-		ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
-#endif
-	return error;
-}
-
-/*
- * Release dquots (and their references) if any.
- * The inode should be locked EXCL except when this's called by
- * xfs_ireclaim.
- */
-void
-xfs_qm_dqdetach(
-	xfs_inode_t	*ip)
-{
-	if (!(ip->i_udquot || ip->i_gdquot))
-		return;
-
-	ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
-	ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
-	if (ip->i_udquot) {
-		xfs_dqtrace_entry_ino(ip->i_udquot, "DQDETTACH", ip);
-		xfs_qm_dqrele(ip->i_udquot);
-		ip->i_udquot = NULL;
-	}
-	if (ip->i_gdquot) {
-		xfs_dqtrace_entry_ino(ip->i_gdquot, "DQDETTACH", ip);
-		xfs_qm_dqrele(ip->i_gdquot);
-		ip->i_gdquot = NULL;
-	}
-}
-
-/*
- * This is called by VFS_SYNC and flags arg determines the caller,
- * and its motives, as done in xfs_sync.
- *
- * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31
- * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25
- * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA
- */
-
-int
-xfs_qm_sync(
-	xfs_mount_t	*mp,
-	short		flags)
-{
-	int		recl, restarts;
-	xfs_dquot_t	*dqp;
-	uint		flush_flags;
-	boolean_t	nowait;
-	int		error;
-
-	restarts = 0;
-	/*
-	 * We won't block unless we are asked to.
-	 */
-	nowait = (boolean_t)(flags & SYNC_BDFLUSH || (flags & SYNC_WAIT) == 0);
-
-  again:
-	xfs_qm_mplist_lock(mp);
-	/*
-	 * dqpurge_all() also takes the mplist lock and iterate thru all dquots
-	 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
-	 * when we have the mplist lock, we know that dquots will be consistent
-	 * as long as we have it locked.
-	 */
-	if (! XFS_IS_QUOTA_ON(mp)) {
-		xfs_qm_mplist_unlock(mp);
-		return 0;
-	}
-	FOREACH_DQUOT_IN_MP(dqp, mp) {
-		/*
-		 * If this is vfs_sync calling, then skip the dquots that
-		 * don't 'seem' to be dirty. ie. don't acquire dqlock.
-		 * This is very similar to what xfs_sync does with inodes.
-		 */
-		if (flags & SYNC_BDFLUSH) {
-			if (! XFS_DQ_IS_DIRTY(dqp))
-				continue;
-		}
-
-		if (nowait) {
-			/*
-			 * Try to acquire the dquot lock. We are NOT out of
-			 * lock order, but we just don't want to wait for this
-			 * lock, unless somebody wanted us to.
-			 */
-			if (! xfs_qm_dqlock_nowait(dqp))
-				continue;
-		} else {
-			xfs_dqlock(dqp);
-		}
-
-		/*
-		 * Now, find out for sure if this dquot is dirty or not.
-		 */
-		if (! XFS_DQ_IS_DIRTY(dqp)) {
-			xfs_dqunlock(dqp);
-			continue;
-		}
-
-		/* XXX a sentinel would be better */
-		recl = XFS_QI_MPLRECLAIMS(mp);
-		if (! xfs_qm_dqflock_nowait(dqp)) {
-			if (nowait) {
-				xfs_dqunlock(dqp);
-				continue;
-			}
-			/*
-			 * If we can't grab the flush lock then if the caller
-			 * really wanted us to give this our best shot,
-			 * see if we can give a push to the buffer before we wait
-			 * on the flush lock. At this point, we know that
-			 * eventhough the dquot is being flushed,
-			 * it has (new) dirty data.
-			 */
-			xfs_qm_dqflock_pushbuf_wait(dqp);
-		}
-		/*
-		 * Let go of the mplist lock. We don't want to hold it
-		 * across a disk write
-		 */
-		flush_flags = (nowait) ? XFS_QMOPT_DELWRI : XFS_QMOPT_SYNC;
-		xfs_qm_mplist_unlock(mp);
-		xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH");
-		error = xfs_qm_dqflush(dqp, flush_flags);
-		xfs_dqunlock(dqp);
-		if (error && XFS_FORCED_SHUTDOWN(mp))
-			return 0;	/* Need to prevent umount failure */
-		else if (error)
-			return error;
-
-		xfs_qm_mplist_lock(mp);
-		if (recl != XFS_QI_MPLRECLAIMS(mp)) {
-			if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
-				break;
-
-			xfs_qm_mplist_unlock(mp);
-			goto again;
-		}
-	}
-
-	xfs_qm_mplist_unlock(mp);
-	return 0;
-}
-
-
-/*
- * This initializes all the quota information that's kept in the
- * mount structure
- */
-STATIC int
-xfs_qm_init_quotainfo(
-	xfs_mount_t	*mp)
-{
-	xfs_quotainfo_t *qinf;
-	int		error;
-	xfs_dquot_t	*dqp;
-
-	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-	/*
-	 * Tell XQM that we exist as soon as possible.
-	 */
-	if ((error = xfs_qm_hold_quotafs_ref(mp))) {
-		return error;
-	}
-
-	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
-
-	/*
-	 * See if quotainodes are setup, and if not, allocate them,
-	 * and change the superblock accordingly.
-	 */
-	if ((error = xfs_qm_init_quotainos(mp))) {
-		kmem_free(qinf, sizeof(xfs_quotainfo_t));
-		mp->m_quotainfo = NULL;
-		return error;
-	}
-
-	spinlock_init(&qinf->qi_pinlock, "xfs_qinf_pin");
-	xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
-	qinf->qi_dqreclaims = 0;
-
-	/* mutex used to serialize quotaoffs */
-	mutex_init(&qinf->qi_quotaofflock);
-
-	/* Precalc some constants */
-	qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
-	ASSERT(qinf->qi_dqchunklen);
-	qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen);
-	do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t));
-
-	mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
-
-	/*
-	 * We try to get the limits from the superuser's limits fields.
-	 * This is quite hacky, but it is standard quota practice.
-	 * We look at the USR dquot with id == 0 first, but if user quotas
-	 * are not enabled we goto the GRP dquot with id == 0.
-	 * We don't really care to keep separate default limits for user
-	 * and group quotas, at least not at this point.
-	 */
-	error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0,
-			     XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : 
-			     (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
-				XFS_DQ_PROJ),
-			     XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN,
-			     &dqp);
-	if (! error) {
-		xfs_disk_dquot_t	*ddqp = &dqp->q_core;
-
-		/*
-		 * The warnings and timers set the grace period given to
-		 * a user or group before he or she can not perform any
-		 * more writing. If it is zero, a default is used.
-		 */
-		qinf->qi_btimelimit = ddqp->d_btimer ?
-			be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT;
-		qinf->qi_itimelimit = ddqp->d_itimer ?
-			be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT;
-		qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ?
-			be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT;
-		qinf->qi_bwarnlimit = ddqp->d_bwarns ?
-			be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT;
-		qinf->qi_iwarnlimit = ddqp->d_iwarns ?
-			be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT;
-		qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ?
-			be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT;
-		qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
-		qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
-		qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
-		qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
-		qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
-		qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
- 
-		/*
-		 * We sent the XFS_QMOPT_DQSUSER flag to dqget because
-		 * we don't want this dquot cached. We haven't done a
-		 * quotacheck yet, and quotacheck doesn't like incore dquots.
-		 */
-		xfs_qm_dqdestroy(dqp);
-	} else {
-		qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
-		qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
-		qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
-		qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
-		qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
-		qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
-	}
-
-	return 0;
-}
-
-
-/*
- * Gets called when unmounting a filesystem or when all quotas get
- * turned off.
- * This purges the quota inodes, destroys locks and frees itself.
- */
-void
-xfs_qm_destroy_quotainfo(
-	xfs_mount_t	*mp)
-{
-	xfs_quotainfo_t *qi;
-
-	qi = mp->m_quotainfo;
-	ASSERT(qi != NULL);
-	ASSERT(xfs_Gqm != NULL);
-
-	/*
-	 * Release the reference that XQM kept, so that we know
-	 * when the XQM structure should be freed. We cannot assume
-	 * that xfs_Gqm is non-null after this point.
-	 */
-	xfs_qm_rele_quotafs_ref(mp);
-
-	spinlock_destroy(&qi->qi_pinlock);
-	xfs_qm_list_destroy(&qi->qi_dqlist);
-
-	if (qi->qi_uquotaip) {
-		XFS_PURGE_INODE(qi->qi_uquotaip);
-		qi->qi_uquotaip = NULL; /* paranoia */
-	}
-	if (qi->qi_gquotaip) {
-		XFS_PURGE_INODE(qi->qi_gquotaip);
-		qi->qi_gquotaip = NULL;
-	}
-	mutex_destroy(&qi->qi_quotaofflock);
-	kmem_free(qi, sizeof(xfs_quotainfo_t));
-	mp->m_quotainfo = NULL;
-}
-
-
-
-/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
-
-/* ARGSUSED */
-STATIC void
-xfs_qm_list_init(
-	xfs_dqlist_t	*list,
-	char		*str,
-	int		n)
-{
-	mutex_init(&list->qh_lock);
-	list->qh_next = NULL;
-	list->qh_version = 0;
-	list->qh_nelems = 0;
-}
-
-STATIC void
-xfs_qm_list_destroy(
-	xfs_dqlist_t	*list)
-{
-	mutex_destroy(&(list->qh_lock));
-}
-
-
-/*
- * Stripped down version of dqattach. This doesn't attach, or even look at the
- * dquots attached to the inode. The rationale is that there won't be any
- * attached at the time this is called from quotacheck.
- */
-STATIC int
-xfs_qm_dqget_noattach(
-	xfs_inode_t	*ip,
-	xfs_dquot_t	**O_udqpp,
-	xfs_dquot_t	**O_gdqpp)
-{
-	int		error;
-	xfs_mount_t	*mp;
-	xfs_dquot_t	*udqp, *gdqp;
-
-	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
-	mp = ip->i_mount;
-	udqp = NULL;
-	gdqp = NULL;
-
-	if (XFS_IS_UQUOTA_ON(mp)) {
-		ASSERT(ip->i_udquot == NULL);
-		/*
-		 * We want the dquot allocated if it doesn't exist.
-		 */
-		if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
-					 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
-					 &udqp))) {
-			/*
-			 * Shouldn't be able to turn off quotas here.
-			 */
-			ASSERT(error != ESRCH);
-			ASSERT(error != ENOENT);
-			return error;
-		}
-		ASSERT(udqp);
-	}
-
-	if (XFS_IS_OQUOTA_ON(mp)) {
-		ASSERT(ip->i_gdquot == NULL);
-		if (udqp)
-			xfs_dqunlock(udqp);
-		error = XFS_IS_GQUOTA_ON(mp) ?
-				xfs_qm_dqget(mp, ip,
-					     ip->i_d.di_gid, XFS_DQ_GROUP,
-					     XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
-					     &gdqp) :
-				xfs_qm_dqget(mp, ip,
-					     ip->i_d.di_projid, XFS_DQ_PROJ,
-					     XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
-					     &gdqp);
-		if (error) {
-			if (udqp)
-				xfs_qm_dqrele(udqp);
-			ASSERT(error != ESRCH);
-			ASSERT(error != ENOENT);
-			return error;
-		}
-		ASSERT(gdqp);
-
-		/* Reacquire the locks in the right order */
-		if (udqp) {
-			if (! xfs_qm_dqlock_nowait(udqp)) {
-				xfs_dqunlock(gdqp);
-				xfs_dqlock(udqp);
-				xfs_dqlock(gdqp);
-			}
-		}
-	}
-
-	*O_udqpp = udqp;
-	*O_gdqpp = gdqp;
-
-#ifdef QUOTADEBUG
-	if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
-	if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
-#endif
-	return 0;
-}
-
-/*
- * Create an inode and return with a reference already taken, but unlocked
- * This is how we create quota inodes
- */
-STATIC int
-xfs_qm_qino_alloc(
-	xfs_mount_t	*mp,
-	xfs_inode_t	**ip,
-	__int64_t	sbfields,
-	uint		flags)
-{
-	xfs_trans_t	*tp;
-	int		error;
-	unsigned long	s;
-	cred_t		zerocr;
-	xfs_inode_t	zeroino;
-	int		committed;
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
-	if ((error = xfs_trans_reserve(tp,
-				      XFS_QM_QINOCREATE_SPACE_RES(mp),
-				      XFS_CREATE_LOG_RES(mp), 0,
-				      XFS_TRANS_PERM_LOG_RES,
-				      XFS_CREATE_LOG_COUNT))) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-	memset(&zerocr, 0, sizeof(zerocr));
-	memset(&zeroino, 0, sizeof(zeroino));
-
-	if ((error = xfs_dir_ialloc(&tp, &zeroino, S_IFREG, 1, 0,
-				   &zerocr, 0, 1, ip, &committed))) {
-		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
-				 XFS_TRANS_ABORT);
-		return error;
-	}
-
-	/*
-	 * Keep an extra reference to this quota inode. This inode is
-	 * locked exclusively and joined to the transaction already.
-	 */
-	ASSERT(XFS_ISLOCKED_INODE_EXCL(*ip));
-	VN_HOLD(XFS_ITOV((*ip)));
-
-	/*
-	 * Make the changes in the superblock, and log those too.
-	 * sbfields arg may contain fields other than *QUOTINO;
-	 * VERSIONNUM for example.
-	 */
-	s = XFS_SB_LOCK(mp);
-	if (flags & XFS_QMOPT_SBVERSION) {
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-		unsigned oldv = mp->m_sb.sb_versionnum;
-#endif
-		ASSERT(!XFS_SB_VERSION_HASQUOTA(&mp->m_sb));
-		ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-				   XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
-		       (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-			XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
-
-		XFS_SB_VERSION_ADDQUOTA(&mp->m_sb);
-		mp->m_sb.sb_uquotino = NULLFSINO;
-		mp->m_sb.sb_gquotino = NULLFSINO;
-
-		/* qflags will get updated _after_ quotacheck */
-		mp->m_sb.sb_qflags = 0;
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-		cmn_err(CE_NOTE,
-			"Old superblock version %x, converting to %x.",
-			oldv, mp->m_sb.sb_versionnum);
-#endif
-	}
-	if (flags & XFS_QMOPT_UQUOTA)
-		mp->m_sb.sb_uquotino = (*ip)->i_ino;
-	else
-		mp->m_sb.sb_gquotino = (*ip)->i_ino;
-	XFS_SB_UNLOCK(mp, s);
-	xfs_mod_sb(tp, sbfields);
-
-	if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
-				     NULL))) {
-		xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!");
-		return error;
-	}
-	return 0;
-}
-
-
-STATIC int
-xfs_qm_reset_dqcounts(
-	xfs_mount_t	*mp,
-	xfs_buf_t	*bp,
-	xfs_dqid_t	id,
-	uint		type)
-{
-	xfs_disk_dquot_t	*ddq;
-	int			j;
-
-	xfs_buftrace("RESET DQUOTS", bp);
-	/*
-	 * Reset all counters and timers. They'll be
-	 * started afresh by xfs_qm_quotacheck.
-	 */
-#ifdef DEBUG
-	j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
-	do_div(j, sizeof(xfs_dqblk_t));
-	ASSERT(XFS_QM_DQPERBLK(mp) == j);
-#endif
-	ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
-	for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) {
-		/*
-		 * Do a sanity check, and if needed, repair the dqblk. Don't
-		 * output any warnings because it's perfectly possible to
-		 * find unitialized dquot blks. See comment in xfs_qm_dqcheck.
-		 */
-		(void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR,
-				      "xfs_quotacheck");
-		ddq->d_bcount = 0;
-		ddq->d_icount = 0;
-		ddq->d_rtbcount = 0;
-		ddq->d_btimer = 0;
-		ddq->d_itimer = 0;
-		ddq->d_rtbtimer = 0;
-		ddq->d_bwarns = 0;
-		ddq->d_iwarns = 0;
-		ddq->d_rtbwarns = 0;
-		ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
-	}
-
-	return 0;
-}
-
-STATIC int
-xfs_qm_dqiter_bufs(
-	xfs_mount_t	*mp,
-	xfs_dqid_t	firstid,
-	xfs_fsblock_t	bno,
-	xfs_filblks_t	blkcnt,
-	uint		flags)
-{
-	xfs_buf_t	*bp;
-	int		error;
-	int		notcommitted;
-	int		incr;
-	int		type;
-
-	ASSERT(blkcnt > 0);
-	notcommitted = 0;
-	incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ?
-		XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt;
-	type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
-		(flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
-	error = 0;
-
-	/*
-	 * Blkcnt arg can be a very big number, and might even be
-	 * larger than the log itself. So, we have to break it up into
-	 * manageable-sized transactions.
-	 * Note that we don't start a permanent transaction here; we might
-	 * not be able to get a log reservation for the whole thing up front,
-	 * and we don't really care to either, because we just discard
-	 * everything if we were to crash in the middle of this loop.
-	 */
-	while (blkcnt--) {
-		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
-			      XFS_FSB_TO_DADDR(mp, bno),
-			      (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp);
-		if (error)
-			break;
-
-		(void) xfs_qm_reset_dqcounts(mp, bp, firstid, type);
-		xfs_bdwrite(mp, bp);
-		/*
-		 * goto the next block.
-		 */
-		bno++;
-		firstid += XFS_QM_DQPERBLK(mp);
-	}
-	return error;
-}
-
-/*
- * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a
- * caller supplied function for every chunk of dquots that we find.
- */
-STATIC int
-xfs_qm_dqiterate(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*qip,
-	uint		flags)
-{
-	xfs_bmbt_irec_t		*map;
-	int			i, nmaps;	/* number of map entries */
-	int			error;		/* return value */
-	xfs_fileoff_t		lblkno;
-	xfs_filblks_t		maxlblkcnt;
-	xfs_dqid_t		firstid;
-	xfs_fsblock_t		rablkno;
-	xfs_filblks_t		rablkcnt;
-
-	error = 0;
-	/*
-	 * This looks racey, but we can't keep an inode lock across a
-	 * trans_reserve. But, this gets called during quotacheck, and that
-	 * happens only at mount time which is single threaded.
-	 */
-	if (qip->i_d.di_nblocks == 0)
-		return 0;
-
-	map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
-
-	lblkno = 0;
-	maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
-	do {
-		nmaps = XFS_DQITER_MAP_SIZE;
-		/*
-		 * We aren't changing the inode itself. Just changing
-		 * some of its data. No new blocks are added here, and
-		 * the inode is never added to the transaction.
-		 */
-		xfs_ilock(qip, XFS_ILOCK_SHARED);
-		error = xfs_bmapi(NULL, qip, lblkno,
-				  maxlblkcnt - lblkno,
-				  XFS_BMAPI_METADATA,
-				  NULL,
-				  0, map, &nmaps, NULL);
-		xfs_iunlock(qip, XFS_ILOCK_SHARED);
-		if (error)
-			break;
-
-		ASSERT(nmaps <= XFS_DQITER_MAP_SIZE);
-		for (i = 0; i < nmaps; i++) {
-			ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
-			ASSERT(map[i].br_blockcount);
-
-
-			lblkno += map[i].br_blockcount;
-
-			if (map[i].br_startblock == HOLESTARTBLOCK)
-				continue;
-
-			firstid = (xfs_dqid_t) map[i].br_startoff *
-				XFS_QM_DQPERBLK(mp);
-			/*
-			 * Do a read-ahead on the next extent.
-			 */
-			if ((i+1 < nmaps) &&
-			    (map[i+1].br_startblock != HOLESTARTBLOCK)) {
-				rablkcnt =  map[i+1].br_blockcount;
-				rablkno = map[i+1].br_startblock;
-				while (rablkcnt--) {
-					xfs_baread(mp->m_ddev_targp,
-					       XFS_FSB_TO_DADDR(mp, rablkno),
-					       (int)XFS_QI_DQCHUNKLEN(mp));
-					rablkno++;
-				}
-			}
-			/*
-			 * Iterate thru all the blks in the extent and
-			 * reset the counters of all the dquots inside them.
-			 */
-			if ((error = xfs_qm_dqiter_bufs(mp,
-						       firstid,
-						       map[i].br_startblock,
-						       map[i].br_blockcount,
-						       flags))) {
-				break;
-			}
-		}
-
-		if (error)
-			break;
-	} while (nmaps > 0);
-
-	kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map));
-
-	return error;
-}
-
-/*
- * Called by dqusage_adjust in doing a quotacheck.
- * Given the inode, and a dquot (either USR or GRP, doesn't matter),
- * this updates its incore copy as well as the buffer copy. This is
- * so that once the quotacheck is done, we can just log all the buffers,
- * as opposed to logging numerous updates to individual dquots.
- */
-STATIC void
-xfs_qm_quotacheck_dqadjust(
-	xfs_dquot_t		*dqp,
-	xfs_qcnt_t		nblks,
-	xfs_qcnt_t		rtblks)
-{
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	xfs_dqtrace_entry(dqp, "QCHECK DQADJUST");
-	/*
-	 * Adjust the inode count and the block count to reflect this inode's
-	 * resource usage.
-	 */
-	be64_add(&dqp->q_core.d_icount, 1);
-	dqp->q_res_icount++;
-	if (nblks) {
-		be64_add(&dqp->q_core.d_bcount, nblks);
-		dqp->q_res_bcount += nblks;
-	}
-	if (rtblks) {
-		be64_add(&dqp->q_core.d_rtbcount, rtblks);
-		dqp->q_res_rtbcount += rtblks;
-	}
-
-	/*
-	 * Set default limits, adjust timers (since we changed usages)
-	 */
-	if (! XFS_IS_SUSER_DQUOT(dqp)) {
-		xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
-		xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
-	}
-
-	dqp->dq_flags |= XFS_DQ_DIRTY;
-}
-
-STATIC int
-xfs_qm_get_rtblks(
-	xfs_inode_t	*ip,
-	xfs_qcnt_t	*O_rtblks)
-{
-	xfs_filblks_t	rtblks;			/* total rt blks */
-	xfs_ifork_t	*ifp;			/* inode fork pointer */
-	xfs_extnum_t	nextents;		/* number of extent entries */
-	xfs_bmbt_rec_t	*base;			/* base of extent array */
-	xfs_bmbt_rec_t	*ep;			/* pointer to an extent entry */
-	int		error;
-
-	ASSERT(XFS_IS_REALTIME_INODE(ip));
-	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-		if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
-			return error;
-	}
-	rtblks = 0;
-	nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
-	base = &ifp->if_u1.if_extents[0];
-	for (ep = base; ep < &base[nextents]; ep++)
-		rtblks += xfs_bmbt_get_blockcount(ep);
-	*O_rtblks = (xfs_qcnt_t)rtblks;
-	return 0;
-}
-
-/*
- * callback routine supplied to bulkstat(). Given an inumber, find its
- * dquots and update them to account for resources taken by that inode.
- */
-/* ARGSUSED */
-STATIC int
-xfs_qm_dqusage_adjust(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	void		__user *buffer,	/* not used */
-	int		ubsize,		/* not used */
-	void		*private_data,	/* not used */
-	xfs_daddr_t	bno,		/* starting block of inode cluster */
-	int		*ubused,	/* not used */
-	void		*dip,		/* on-disk inode pointer (not used) */
-	int		*res)		/* result code value */
-{
-	xfs_inode_t	*ip;
-	xfs_dquot_t	*udqp, *gdqp;
-	xfs_qcnt_t	nblks, rtblks;
-	int		error;
-
-	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-	/*
-	 * rootino must have its resources accounted for, not so with the quota
-	 * inodes.
-	 */
-	if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
-		*res = BULKSTAT_RV_NOTHING;
-		return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget
-	 * interface expects the inode to be exclusively locked because that's
-	 * the case in all other instances. It's OK that we do this because
-	 * quotacheck is done only at mount time.
-	 */
-	if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip, bno))) {
-		*res = BULKSTAT_RV_NOTHING;
-		return error;
-	}
-
-	if (ip->i_d.di_mode == 0) {
-		xfs_iput_new(ip, XFS_ILOCK_EXCL);
-		*res = BULKSTAT_RV_NOTHING;
-		return XFS_ERROR(ENOENT);
-	}
-
-	/*
-	 * Obtain the locked dquots. In case of an error (eg. allocation
-	 * fails for ENOSPC), we return the negative of the error number
-	 * to bulkstat, so that it can get propagated to quotacheck() and
-	 * making us disable quotas for the file system.
-	 */
-	if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
-		xfs_iput(ip, XFS_ILOCK_EXCL);
-		*res = BULKSTAT_RV_GIVEUP;
-		return error;
-	}
-
-	rtblks = 0;
-	if (! XFS_IS_REALTIME_INODE(ip)) {
-		nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
-	} else {
-		/*
-		 * Walk thru the extent list and count the realtime blocks.
-		 */
-		if ((error = xfs_qm_get_rtblks(ip, &rtblks))) {
-			xfs_iput(ip, XFS_ILOCK_EXCL);
-			if (udqp)
-				xfs_qm_dqput(udqp);
-			if (gdqp)
-				xfs_qm_dqput(gdqp);
-			*res = BULKSTAT_RV_GIVEUP;
-			return error;
-		}
-		nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
-	}
-	ASSERT(ip->i_delayed_blks == 0);
-
-	/*
-	 * We can't release the inode while holding its dquot locks.
-	 * The inode can go into inactive and might try to acquire the dquotlocks.
-	 * So, just unlock here and do a vn_rele at the end.
-	 */
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	/*
-	 * Add the (disk blocks and inode) resources occupied by this
-	 * inode to its dquots. We do this adjustment in the incore dquot,
-	 * and also copy the changes to its buffer.
-	 * We don't care about putting these changes in a transaction
-	 * envelope because if we crash in the middle of a 'quotacheck'
-	 * we have to start from the beginning anyway.
-	 * Once we're done, we'll log all the dquot bufs.
-	 *
-	 * The *QUOTA_ON checks below may look pretty racey, but quotachecks
-	 * and quotaoffs don't race. (Quotachecks happen at mount time only).
-	 */
-	if (XFS_IS_UQUOTA_ON(mp)) {
-		ASSERT(udqp);
-		xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks);
-		xfs_qm_dqput(udqp);
-	}
-	if (XFS_IS_OQUOTA_ON(mp)) {
-		ASSERT(gdqp);
-		xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks);
-		xfs_qm_dqput(gdqp);
-	}
-	/*
-	 * Now release the inode. This will send it to 'inactive', and
-	 * possibly even free blocks.
-	 */
-	VN_RELE(XFS_ITOV(ip));
-
-	/*
-	 * Goto next inode.
-	 */
-	*res = BULKSTAT_RV_DIDONE;
-	return 0;
-}
-
-/*
- * Walk thru all the filesystem inodes and construct a consistent view
- * of the disk quota world. If the quotacheck fails, disable quotas.
- */
-int
-xfs_qm_quotacheck(
-	xfs_mount_t	*mp)
-{
-	int		done, count, error;
-	xfs_ino_t	lastino;
-	size_t		structsz;
-	xfs_inode_t	*uip, *gip;
-	uint		flags;
-
-	count = INT_MAX;
-	structsz = 1;
-	lastino = 0;
-	flags = 0;
-
-	ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp));
-	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-	/*
-	 * There should be no cached dquots. The (simplistic) quotacheck
-	 * algorithm doesn't like that.
-	 */
-	ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0);
-
-	cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
-
-	/*
-	 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
-	 * their counters to zero. We need a clean slate.
-	 * We don't log our changes till later.
-	 */
-	if ((uip = XFS_QI_UQIP(mp))) {
-		if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA)))
-			goto error_return;
-		flags |= XFS_UQUOTA_CHKD;
-	}
-
-	if ((gip = XFS_QI_GQIP(mp))) {
-		if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
-					XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA)))
-			goto error_return;
-		flags |= XFS_OQUOTA_CHKD;
-	}
-
-	do {
-		/*
-		 * Iterate thru all the inodes in the file system,
-		 * adjusting the corresponding dquot counters in core.
-		 */
-		if ((error = xfs_bulkstat(mp, &lastino, &count,
-				     xfs_qm_dqusage_adjust, NULL,
-				     structsz, NULL,
-				     BULKSTAT_FG_IGET|BULKSTAT_FG_VFSLOCKED,
-				     &done)))
-			break;
-
-	} while (! done);
-
-	/*
-	 * We can get this error if we couldn't do a dquot allocation inside
-	 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
-	 * dirty dquots that might be cached, we just want to get rid of them
-	 * and turn quotaoff. The dquots won't be attached to any of the inodes
-	 * at this point (because we intentionally didn't in dqget_noattach).
-	 */
-	if (error) {
-		xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF);
-		goto error_return;
-	}
-	/*
-	 * We've made all the changes that we need to make incore.
-	 * Now flush_them down to disk buffers.
-	 */
-	xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
-
-	/*
-	 * We didn't log anything, because if we crashed, we'll have to
-	 * start the quotacheck from scratch anyway. However, we must make
-	 * sure that our dquot changes are secure before we put the
-	 * quotacheck'd stamp on the superblock. So, here we do a synchronous
-	 * flush.
-	 */
-	XFS_bflush(mp->m_ddev_targp);
-
-	/*
-	 * If one type of quotas is off, then it will lose its
-	 * quotachecked status, since we won't be doing accounting for
-	 * that type anymore.
-	 */
-	mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
-	mp->m_qflags |= flags;
-
-	XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++");
-
- error_return:
-	if (error) {
-		cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): "
-			"Disabling quotas.",
-			mp->m_fsname, error);
-		/*
-		 * We must turn off quotas.
-		 */
-		ASSERT(mp->m_quotainfo != NULL);
-		ASSERT(xfs_Gqm != NULL);
-		xfs_qm_destroy_quotainfo(mp);
-		(void)xfs_mount_reset_sbqflags(mp);
-	} else {
-		cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname);
-	}
-	return (error);
-}
-
-/*
- * This is called after the superblock has been read in and we're ready to
- * iget the quota inodes.
- */
-STATIC int
-xfs_qm_init_quotainos(
-	xfs_mount_t	*mp)
-{
-	xfs_inode_t	*uip, *gip;
-	int		error;
-	__int64_t	sbflags;
-	uint		flags;
-
-	ASSERT(mp->m_quotainfo);
-	uip = gip = NULL;
-	sbflags = 0;
-	flags = 0;
-
-	/*
-	 * Get the uquota and gquota inodes
-	 */
-	if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
-		if (XFS_IS_UQUOTA_ON(mp) &&
-		    mp->m_sb.sb_uquotino != NULLFSINO) {
-			ASSERT(mp->m_sb.sb_uquotino > 0);
-			if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
-					     0, 0, &uip, 0)))
-				return XFS_ERROR(error);
-		}
-		if (XFS_IS_OQUOTA_ON(mp) &&
-		    mp->m_sb.sb_gquotino != NULLFSINO) {
-			ASSERT(mp->m_sb.sb_gquotino > 0);
-			if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
-					     0, 0, &gip, 0))) {
-				if (uip)
-					VN_RELE(XFS_ITOV(uip));
-				return XFS_ERROR(error);
-			}
-		}
-	} else {
-		flags |= XFS_QMOPT_SBVERSION;
-		sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-			    XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
-	}
-
-	/*
-	 * Create the two inodes, if they don't exist already. The changes
-	 * made above will get added to a transaction and logged in one of
-	 * the qino_alloc calls below.  If the device is readonly,
-	 * temporarily switch to read-write to do this.
-	 */
-	if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
-		if ((error = xfs_qm_qino_alloc(mp, &uip,
-					      sbflags | XFS_SB_UQUOTINO,
-					      flags | XFS_QMOPT_UQUOTA)))
-			return XFS_ERROR(error);
-
-		flags &= ~XFS_QMOPT_SBVERSION;
-	}
-	if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) {
-		flags |= (XFS_IS_GQUOTA_ON(mp) ?
-				XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
-		error = xfs_qm_qino_alloc(mp, &gip,
-					  sbflags | XFS_SB_GQUOTINO, flags);
-		if (error) {
-			if (uip)
-				VN_RELE(XFS_ITOV(uip));
-
-			return XFS_ERROR(error);
-		}
-	}
-
-	XFS_QI_UQIP(mp) = uip;
-	XFS_QI_GQIP(mp) = gip;
-
-	return 0;
-}
-
-
-/*
- * Traverse the freelist of dquots and attempt to reclaim a maximum of
- * 'howmany' dquots. This operation races with dqlookup(), and attempts to
- * favor the lookup function ...
- * XXXsup merge this with qm_reclaim_one().
- */
-STATIC int
-xfs_qm_shake_freelist(
-	int howmany)
-{
-	int		nreclaimed;
-	xfs_dqhash_t	*hash;
-	xfs_dquot_t	*dqp, *nextdqp;
-	int		restarts;
-	int		nflushes;
-
-	if (howmany <= 0)
-		return 0;
-
-	nreclaimed = 0;
-	restarts = 0;
-	nflushes = 0;
-
-#ifdef QUOTADEBUG
-	cmn_err(CE_DEBUG, "Shake free 0x%x", howmany);
-#endif
-	/* lock order is : hashchainlock, freelistlock, mplistlock */
- tryagain:
-	xfs_qm_freelist_lock(xfs_Gqm);
-
-	for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
-	     ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) &&
-	      nreclaimed < howmany); ) {
-		xfs_dqlock(dqp);
-
-		/*
-		 * We are racing with dqlookup here. Naturally we don't
-		 * want to reclaim a dquot that lookup wants.
-		 */
-		if (dqp->dq_flags & XFS_DQ_WANT) {
-			xfs_dqunlock(dqp);
-			xfs_qm_freelist_unlock(xfs_Gqm);
-			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return nreclaimed;
-			XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-			goto tryagain;
-		}
-
-		/*
-		 * If the dquot is inactive, we are assured that it is
-		 * not on the mplist or the hashlist, and that makes our
-		 * life easier.
-		 */
-		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
-			ASSERT(dqp->q_mount == NULL);
-			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
-			ASSERT(dqp->HL_PREVP == NULL);
-			ASSERT(dqp->MPL_PREVP == NULL);
-			XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
-			nextdqp = dqp->dq_flnext;
-			goto off_freelist;
-		}
-
-		ASSERT(dqp->MPL_PREVP);
-		/*
-		 * Try to grab the flush lock. If this dquot is in the process of
-		 * getting flushed to disk, we don't want to reclaim it.
-		 */
-		if (! xfs_qm_dqflock_nowait(dqp)) {
-			xfs_dqunlock(dqp);
-			dqp = dqp->dq_flnext;
-			continue;
-		}
-
-		/*
-		 * We have the flush lock so we know that this is not in the
-		 * process of being flushed. So, if this is dirty, flush it
-		 * DELWRI so that we don't get a freelist infested with
-		 * dirty dquots.
-		 */
-		if (XFS_DQ_IS_DIRTY(dqp)) {
-			xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY");
-			/*
-			 * We flush it delayed write, so don't bother
-			 * releasing the mplock.
-			 */
-			(void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
-			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
-			dqp = dqp->dq_flnext;
-			continue;
-		}
-		/*
-		 * We're trying to get the hashlock out of order. This races
-		 * with dqlookup; so, we giveup and goto the next dquot if
-		 * we couldn't get the hashlock. This way, we won't starve
-		 * a dqlookup process that holds the hashlock that is
-		 * waiting for the freelist lock.
-		 */
-		if (! xfs_qm_dqhashlock_nowait(dqp)) {
-			xfs_dqfunlock(dqp);
-			xfs_dqunlock(dqp);
-			dqp = dqp->dq_flnext;
-			continue;
-		}
-		/*
-		 * This races with dquot allocation code as well as dqflush_all
-		 * and reclaim code. So, if we failed to grab the mplist lock,
-		 * giveup everything and start over.
-		 */
-		hash = dqp->q_hash;
-		ASSERT(hash);
-		if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
-			/* XXX put a sentinel so that we can come back here */
-			xfs_dqfunlock(dqp);
-			xfs_dqunlock(dqp);
-			XFS_DQ_HASH_UNLOCK(hash);
-			xfs_qm_freelist_unlock(xfs_Gqm);
-			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return nreclaimed;
-			goto tryagain;
-		}
-		xfs_dqtrace_entry(dqp, "DQSHAKE: UNLINKING");
-#ifdef QUOTADEBUG
-		cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
-			dqp, be32_to_cpu(dqp->q_core.d_id));
-#endif
-		ASSERT(dqp->q_nrefs == 0);
-		nextdqp = dqp->dq_flnext;
-		XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
-		XQM_HASHLIST_REMOVE(hash, dqp);
-		xfs_dqfunlock(dqp);
-		xfs_qm_mplist_unlock(dqp->q_mount);
-		XFS_DQ_HASH_UNLOCK(hash);
-
- off_freelist:
-		XQM_FREELIST_REMOVE(dqp);
-		xfs_dqunlock(dqp);
-		nreclaimed++;
-		XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims);
-		xfs_qm_dqdestroy(dqp);
-		dqp = nextdqp;
-	}
-	xfs_qm_freelist_unlock(xfs_Gqm);
-	return nreclaimed;
-}
-
-
-/*
- * The kmem_shake interface is invoked when memory is running low.
- */
-/* ARGSUSED */
-STATIC int
-xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
-{
-	int	ndqused, nfree, n;
-
-	if (!kmem_shake_allow(gfp_mask))
-		return 0;
-	if (!xfs_Gqm)
-		return 0;
-
-	nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */
-	/* incore dquots in all f/s's */
-	ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
-
-	ASSERT(ndqused >= 0);
-
-	if (nfree <= ndqused && nfree < ndquot)
-		return 0;
-
-	ndqused *= xfs_Gqm->qm_dqfree_ratio;	/* target # of free dquots */
-	n = nfree - ndqused - ndquot;		/* # over target */
-
-	return xfs_qm_shake_freelist(MAX(nfree, n));
-}
-
-
-/*
- * Just pop the least recently used dquot off the freelist and
- * recycle it. The returned dquot is locked.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqreclaim_one(void)
-{
-	xfs_dquot_t	*dqpout;
-	xfs_dquot_t	*dqp;
-	int		restarts;
-	int		nflushes;
-
-	restarts = 0;
-	dqpout = NULL;
-	nflushes = 0;
-
-	/* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
- startagain:
-	xfs_qm_freelist_lock(xfs_Gqm);
-
-	FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
-		xfs_dqlock(dqp);
-
-		/*
-		 * We are racing with dqlookup here. Naturally we don't
-		 * want to reclaim a dquot that lookup wants. We release the
-		 * freelist lock and start over, so that lookup will grab
-		 * both the dquot and the freelistlock.
-		 */
-		if (dqp->dq_flags & XFS_DQ_WANT) {
-			ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
-			xfs_dqtrace_entry(dqp, "DQRECLAIM: DQWANT");
-			xfs_dqunlock(dqp);
-			xfs_qm_freelist_unlock(xfs_Gqm);
-			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return NULL;
-			XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-			goto startagain;
-		}
-
-		/*
-		 * If the dquot is inactive, we are assured that it is
-		 * not on the mplist or the hashlist, and that makes our
-		 * life easier.
-		 */
-		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
-			ASSERT(dqp->q_mount == NULL);
-			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
-			ASSERT(dqp->HL_PREVP == NULL);
-			ASSERT(dqp->MPL_PREVP == NULL);
-			XQM_FREELIST_REMOVE(dqp);
-			xfs_dqunlock(dqp);
-			dqpout = dqp;
-			XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
-			break;
-		}
-
-		ASSERT(dqp->q_hash);
-		ASSERT(dqp->MPL_PREVP);
-
-		/*
-		 * Try to grab the flush lock. If this dquot is in the process of
-		 * getting flushed to disk, we don't want to reclaim it.
-		 */
-		if (! xfs_qm_dqflock_nowait(dqp)) {
-			xfs_dqunlock(dqp);
-			continue;
-		}
-
-		/*
-		 * We have the flush lock so we know that this is not in the
-		 * process of being flushed. So, if this is dirty, flush it
-		 * DELWRI so that we don't get a freelist infested with
-		 * dirty dquots.
-		 */
-		if (XFS_DQ_IS_DIRTY(dqp)) {
-			xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY");
-			/*
-			 * We flush it delayed write, so don't bother
-			 * releasing the freelist lock.
-			 */
-			(void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
-			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
-			continue;
-		}
-
-		if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
-			xfs_dqfunlock(dqp);
-			xfs_dqunlock(dqp);
-			continue;
-		}
-
-		if (! xfs_qm_dqhashlock_nowait(dqp))
-			goto mplistunlock;
-
-		ASSERT(dqp->q_nrefs == 0);
-		xfs_dqtrace_entry(dqp, "DQRECLAIM: UNLINKING");
-		XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
-		XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
-		XQM_FREELIST_REMOVE(dqp);
-		dqpout = dqp;
-		XFS_DQ_HASH_UNLOCK(dqp->q_hash);
- mplistunlock:
-		xfs_qm_mplist_unlock(dqp->q_mount);
-		xfs_dqfunlock(dqp);
-		xfs_dqunlock(dqp);
-		if (dqpout)
-			break;
-	}
-
-	xfs_qm_freelist_unlock(xfs_Gqm);
-	return dqpout;
-}
-
-
-/*------------------------------------------------------------------*/
-
-/*
- * Return a new incore dquot. Depending on the number of
- * dquots in the system, we either allocate a new one on the kernel heap,
- * or reclaim a free one.
- * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed
- * to reclaim an existing one from the freelist.
- */
-boolean_t
-xfs_qm_dqalloc_incore(
-	xfs_dquot_t **O_dqpp)
-{
-	xfs_dquot_t	*dqp;
-
-	/*
-	 * Check against high water mark to see if we want to pop
-	 * a nincompoop dquot off the freelist.
-	 */
-	if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) {
-		/*
-		 * Try to recycle a dquot from the freelist.
-		 */
-		if ((dqp = xfs_qm_dqreclaim_one())) {
-			XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
-			/*
-			 * Just zero the core here. The rest will get
-			 * reinitialized by caller. XXX we shouldn't even
-			 * do this zero ...
-			 */
-			memset(&dqp->q_core, 0, sizeof(dqp->q_core));
-			*O_dqpp = dqp;
-			return B_FALSE;
-		}
-		XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
-	}
-
-	/*
-	 * Allocate a brand new dquot on the kernel heap and return it
-	 * to the caller to initialize.
-	 */
-	ASSERT(xfs_Gqm->qm_dqzone != NULL);
-	*O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
-	atomic_inc(&xfs_Gqm->qm_totaldquots);
-
-	return B_TRUE;
-}
-
-
-/*
- * Start a transaction and write the incore superblock changes to
- * disk. flags parameter indicates which fields have changed.
- */
-int
-xfs_qm_write_sb_changes(
-	xfs_mount_t	*mp,
-	__int64_t	flags)
-{
-	xfs_trans_t	*tp;
-	int		error;
-
-#ifdef QUOTADEBUG
-	cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname);
-#endif
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-	if ((error = xfs_trans_reserve(tp, 0,
-				      mp->m_sb.sb_sectsize + 128, 0,
-				      0,
-				      XFS_DEFAULT_LOG_COUNT))) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-
-	xfs_mod_sb(tp, flags);
-	(void) xfs_trans_commit(tp, 0, NULL);
-
-	return 0;
-}
-
-
-/* --------------- utility functions for vnodeops ---------------- */
-
-
-/*
- * Given an inode, a uid and gid (from cred_t) make sure that we have
- * allocated relevant dquot(s) on disk, and that we won't exceed inode
- * quotas by creating this file.
- * This also attaches dquot(s) to the given inode after locking it,
- * and returns the dquots corresponding to the uid and/or gid.
- *
- * in	: inode (unlocked)
- * out	: udquot, gdquot with references taken and unlocked
- */
-int
-xfs_qm_vop_dqalloc(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip,
-	uid_t		uid,
-	gid_t		gid,
-	prid_t		prid,
-	uint		flags,
-	xfs_dquot_t	**O_udqpp,
-	xfs_dquot_t	**O_gdqpp)
-{
-	int		error;
-	xfs_dquot_t	*uq, *gq;
-	uint		lockflags;
-
-	if (!XFS_IS_QUOTA_ON(mp))
-		return 0;
-
-	lockflags = XFS_ILOCK_EXCL;
-	xfs_ilock(ip, lockflags);
-
-	if ((flags & XFS_QMOPT_INHERIT) &&
-	    XFS_INHERIT_GID(ip, XFS_MTOVFS(mp)))
-		gid = ip->i_d.di_gid;
-
-	/*
-	 * Attach the dquot(s) to this inode, doing a dquot allocation
-	 * if necessary. The dquot(s) will not be locked.
-	 */
-	if (XFS_NOT_DQATTACHED(mp, ip)) {
-		if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_DQALLOC |
-					    XFS_QMOPT_ILOCKED))) {
-			xfs_iunlock(ip, lockflags);
-			return error;
-		}
-	}
-
-	uq = gq = NULL;
-	if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
-		if (ip->i_d.di_uid != uid) {
-			/*
-			 * What we need is the dquot that has this uid, and
-			 * if we send the inode to dqget, the uid of the inode
-			 * takes priority over what's sent in the uid argument.
-			 * We must unlock inode here before calling dqget if
-			 * we're not sending the inode, because otherwise
-			 * we'll deadlock by doing trans_reserve while
-			 * holding ilock.
-			 */
-			xfs_iunlock(ip, lockflags);
-			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
-						 XFS_DQ_USER,
-						 XFS_QMOPT_DQALLOC |
-						 XFS_QMOPT_DOWARN,
-						 &uq))) {
-				ASSERT(error != ENOENT);
-				return error;
-			}
-			/*
-			 * Get the ilock in the right order.
-			 */
-			xfs_dqunlock(uq);
-			lockflags = XFS_ILOCK_SHARED;
-			xfs_ilock(ip, lockflags);
-		} else {
-			/*
-			 * Take an extra reference, because we'll return
-			 * this to caller
-			 */
-			ASSERT(ip->i_udquot);
-			uq = ip->i_udquot;
-			xfs_dqlock(uq);
-			XFS_DQHOLD(uq);
-			xfs_dqunlock(uq);
-		}
-	}
-	if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
-		if (ip->i_d.di_gid != gid) {
-			xfs_iunlock(ip, lockflags);
-			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
-						 XFS_DQ_GROUP,
-						 XFS_QMOPT_DQALLOC |
-						 XFS_QMOPT_DOWARN,
-						 &gq))) {
-				if (uq)
-					xfs_qm_dqrele(uq);
-				ASSERT(error != ENOENT);
-				return error;
-			}
-			xfs_dqunlock(gq);
-			lockflags = XFS_ILOCK_SHARED;
-			xfs_ilock(ip, lockflags);
-		} else {
-			ASSERT(ip->i_gdquot);
-			gq = ip->i_gdquot;
-			xfs_dqlock(gq);
-			XFS_DQHOLD(gq);
-			xfs_dqunlock(gq);
-		}
-	} else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
-		if (ip->i_d.di_projid != prid) {
-			xfs_iunlock(ip, lockflags);
-			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
-						 XFS_DQ_PROJ,
-						 XFS_QMOPT_DQALLOC |
-						 XFS_QMOPT_DOWARN,
-						 &gq))) {
-				if (uq)
-					xfs_qm_dqrele(uq);
-				ASSERT(error != ENOENT);
-				return (error);
-			}
-			xfs_dqunlock(gq);
-			lockflags = XFS_ILOCK_SHARED;
-			xfs_ilock(ip, lockflags);
-		} else {
-			ASSERT(ip->i_gdquot);
-			gq = ip->i_gdquot;
-			xfs_dqlock(gq);
-			XFS_DQHOLD(gq);
-			xfs_dqunlock(gq);
-		}
-	}
-	if (uq)
-		xfs_dqtrace_entry_ino(uq, "DQALLOC", ip);
-
-	xfs_iunlock(ip, lockflags);
-	if (O_udqpp)
-		*O_udqpp = uq;
-	else if (uq)
-		xfs_qm_dqrele(uq);
-	if (O_gdqpp)
-		*O_gdqpp = gq;
-	else if (gq)
-		xfs_qm_dqrele(gq);
-	return 0;
-}
-
-/*
- * Actually transfer ownership, and do dquot modifications.
- * These were already reserved.
- */
-xfs_dquot_t *
-xfs_qm_vop_chown(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip,
-	xfs_dquot_t	**IO_olddq,
-	xfs_dquot_t	*newdq)
-{
-	xfs_dquot_t	*prevdq;
-	uint		bfield = XFS_IS_REALTIME_INODE(ip) ?
-				 XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
-
-	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
-	ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
-
-	/* old dquot */
-	prevdq = *IO_olddq;
-	ASSERT(prevdq);
-	ASSERT(prevdq != newdq);
-
-	xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks));
-	xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
-
-	/* the sparkling new dquot */
-	xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks);
-	xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
-
-	/*
-	 * Take an extra reference, because the inode
-	 * is going to keep this dquot pointer even
-	 * after the trans_commit.
-	 */
-	xfs_dqlock(newdq);
-	XFS_DQHOLD(newdq);
-	xfs_dqunlock(newdq);
-	*IO_olddq = newdq;
-
-	return prevdq;
-}
-
-/*
- * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID).
- */
-int
-xfs_qm_vop_chown_reserve(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip,
-	xfs_dquot_t	*udqp,
-	xfs_dquot_t	*gdqp,
-	uint		flags)
-{
-	int		error;
-	xfs_mount_t	*mp;
-	uint		delblks, blkflags;
-	xfs_dquot_t	*unresudq, *unresgdq, *delblksudq, *delblksgdq;
-
-	ASSERT(XFS_ISLOCKED_INODE(ip));
-	mp = ip->i_mount;
-	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-	delblks = ip->i_delayed_blks;
-	delblksudq = delblksgdq = unresudq = unresgdq = NULL;
-	blkflags = XFS_IS_REALTIME_INODE(ip) ?
-			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
-
-	if (XFS_IS_UQUOTA_ON(mp) && udqp &&
-	    ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
-		delblksudq = udqp;
-		/*
-		 * If there are delayed allocation blocks, then we have to
-		 * unreserve those from the old dquot, and add them to the
-		 * new dquot.
-		 */
-		if (delblks) {
-			ASSERT(ip->i_udquot);
-			unresudq = ip->i_udquot;
-		}
-	}
-	if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
-		if ((XFS_IS_GQUOTA_ON(ip->i_mount) &&
-		     ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) ||
-		    (XFS_IS_PQUOTA_ON(ip->i_mount) &&
-		     ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id))) {
-			delblksgdq = gdqp;
-			if (delblks) {
-				ASSERT(ip->i_gdquot);
-				unresgdq = ip->i_gdquot;
-			}
-		}
-	}
-
-	if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
-				delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
-				flags | blkflags)))
-		return (error);
-
-	/*
-	 * Do the delayed blks reservations/unreservations now. Since, these
-	 * are done without the help of a transaction, if a reservation fails
-	 * its previous reservations won't be automatically undone by trans
-	 * code. So, we have to do it manually here.
-	 */
-	if (delblks) {
-		/*
-		 * Do the reservations first. Unreservation can't fail.
-		 */
-		ASSERT(delblksudq || delblksgdq);
-		ASSERT(unresudq || unresgdq);
-		if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
-				delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
-				flags | blkflags)))
-			return (error);
-		xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
-				unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
-				blkflags);
-	}
-
-	return (0);
-}
-
-int
-xfs_qm_vop_rename_dqattach(
-	xfs_inode_t	**i_tab)
-{
-	xfs_inode_t	*ip;
-	int		i;
-	int		error;
-
-	ip = i_tab[0];
-
-	if (! XFS_IS_QUOTA_ON(ip->i_mount))
-		return 0;
-
-	if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
-		error = xfs_qm_dqattach(ip, 0);
-		if (error)
-			return error;
-	}
-	for (i = 1; (i < 4 && i_tab[i]); i++) {
-		/*
-		 * Watch out for duplicate entries in the table.
-		 */
-		if ((ip = i_tab[i]) != i_tab[i-1]) {
-			if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
-				error = xfs_qm_dqattach(ip, 0);
-				if (error)
-					return error;
-			}
-		}
-	}
-	return 0;
-}
-
-void
-xfs_qm_vop_dqattach_and_dqmod_newinode(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip,
-	xfs_dquot_t	*udqp,
-	xfs_dquot_t	*gdqp)
-{
-	if (!XFS_IS_QUOTA_ON(tp->t_mountp))
-		return;
-
-	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
-	ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
-
-	if (udqp) {
-		xfs_dqlock(udqp);
-		XFS_DQHOLD(udqp);
-		xfs_dqunlock(udqp);
-		ASSERT(ip->i_udquot == NULL);
-		ip->i_udquot = udqp;
-		ASSERT(XFS_IS_UQUOTA_ON(tp->t_mountp));
-		ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
-		xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
-	}
-	if (gdqp) {
-		xfs_dqlock(gdqp);
-		XFS_DQHOLD(gdqp);
-		xfs_dqunlock(gdqp);
-		ASSERT(ip->i_gdquot == NULL);
-		ip->i_gdquot = gdqp;
-		ASSERT(XFS_IS_OQUOTA_ON(tp->t_mountp));
-		ASSERT((XFS_IS_GQUOTA_ON(tp->t_mountp) ?
-			ip->i_d.di_gid : ip->i_d.di_projid) ==
-				be32_to_cpu(gdqp->q_core.d_id));
-		xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
-	}
-}
-
-/* ------------- list stuff -----------------*/
-STATIC void
-xfs_qm_freelist_init(xfs_frlist_t *ql)
-{
-	ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
-	mutex_init(&ql->qh_lock);
-	ql->qh_version = 0;
-	ql->qh_nelems = 0;
-}
-
-STATIC void
-xfs_qm_freelist_destroy(xfs_frlist_t *ql)
-{
-	xfs_dquot_t	*dqp, *nextdqp;
-
-	mutex_lock(&ql->qh_lock);
-	for (dqp = ql->qh_next;
-	     dqp != (xfs_dquot_t *)ql; ) {
-		xfs_dqlock(dqp);
-		nextdqp = dqp->dq_flnext;
-#ifdef QUOTADEBUG
-		cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
-#endif
-		XQM_FREELIST_REMOVE(dqp);
-		xfs_dqunlock(dqp);
-		xfs_qm_dqdestroy(dqp);
-		dqp = nextdqp;
-	}
-	/*
-	 * Don't bother about unlocking.
-	 */
-	mutex_destroy(&ql->qh_lock);
-
-	ASSERT(ql->qh_nelems == 0);
-}
-
-STATIC void
-xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
-{
-	dq->dq_flnext = ql->qh_next;
-	dq->dq_flprev = (xfs_dquot_t *)ql;
-	ql->qh_next = dq;
-	dq->dq_flnext->dq_flprev = dq;
-	xfs_Gqm->qm_dqfreelist.qh_nelems++;
-	xfs_Gqm->qm_dqfreelist.qh_version++;
-}
-
-void
-xfs_qm_freelist_unlink(xfs_dquot_t *dq)
-{
-	xfs_dquot_t *next = dq->dq_flnext;
-	xfs_dquot_t *prev = dq->dq_flprev;
-
-	next->dq_flprev = prev;
-	prev->dq_flnext = next;
-	dq->dq_flnext = dq->dq_flprev = dq;
-	xfs_Gqm->qm_dqfreelist.qh_nelems--;
-	xfs_Gqm->qm_dqfreelist.qh_version++;
-}
-
-void
-xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
-{
-	xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
-}
-
-STATIC int
-xfs_qm_dqhashlock_nowait(
-	xfs_dquot_t *dqp)
-{
-	int locked;
-
-	locked = mutex_trylock(&((dqp)->q_hash->qh_lock));
-	return locked;
-}
-
-int
-xfs_qm_freelist_lock_nowait(
-	xfs_qm_t *xqm)
-{
-	int locked;
-
-	locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock));
-	return locked;
-}
-
-STATIC int
-xfs_qm_mplist_nowait(
-	xfs_mount_t	*mp)
-{
-	int locked;
-
-	ASSERT(mp->m_quotainfo);
-	locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp)));
-	return locked;
-}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
deleted file mode 100644
index 4568deb6da8..00000000000
--- a/fs/xfs/quota/xfs_qm.h
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_QM_H__
-#define __XFS_QM_H__
-
-#include "xfs_dquot_item.h"
-#include "xfs_dquot.h"
-#include "xfs_quota_priv.h"
-#include "xfs_qm_stats.h"
-
-struct xfs_qm;
-struct xfs_inode;
-
-extern uint		ndquot;
-extern mutex_t		xfs_Gqm_lock;
-extern struct xfs_qm	*xfs_Gqm;
-extern kmem_zone_t	*qm_dqzone;
-extern kmem_zone_t	*qm_dqtrxzone;
-
-/*
- * Used in xfs_qm_sync called by xfs_sync to count the max times that it can
- * iterate over the mountpt's dquot list in one call.
- */
-#define XFS_QM_SYNC_MAX_RESTARTS	7
-
-/*
- * Ditto, for xfs_qm_dqreclaim_one.
- */
-#define XFS_QM_RECLAIM_MAX_RESTARTS	4
-
-/*
- * Ideal ratio of free to in use dquots. Quota manager makes an attempt
- * to keep this balance.
- */
-#define XFS_QM_DQFREE_RATIO		2
-
-/*
- * Dquot hashtable constants/threshold values.
- */
-#define XFS_QM_HASHSIZE_LOW		(NBPP / sizeof(xfs_dqhash_t))
-#define XFS_QM_HASHSIZE_HIGH		((NBPP * 4) / sizeof(xfs_dqhash_t))
-
-/*
- * We output a cmn_err when quotachecking a quota file with more than
- * this many fsbs.
- */
-#define XFS_QM_BIG_QCHECK_NBLKS		500
-
-/*
- * This defines the unit of allocation of dquots.
- * Currently, it is just one file system block, and a 4K blk contains 30
- * (136 * 30 = 4080) dquots. It's probably not worth trying to make
- * this more dynamic.
- * XXXsup However, if this number is changed, we have to make sure that we don't
- * implicitly assume that we do allocations in chunks of a single filesystem
- * block in the dquot/xqm code.
- */
-#define XFS_DQUOT_CLUSTER_SIZE_FSB	(xfs_filblks_t)1
-/*
- * When doing a quotacheck, we log dquot clusters of this many FSBs at most
- * in a single transaction. We don't want to ask for too huge a log reservation.
- */
-#define XFS_QM_MAX_DQCLUSTER_LOGSZ	3
-
-typedef xfs_dqhash_t	xfs_dqlist_t;
-/*
- * The freelist head. The first two fields match the first two in the
- * xfs_dquot_t structure (in xfs_dqmarker_t)
- */
-typedef struct xfs_frlist {
-       struct xfs_dquot *qh_next;
-       struct xfs_dquot *qh_prev;
-       mutex_t		 qh_lock;
-       uint		 qh_version;
-       uint		 qh_nelems;
-} xfs_frlist_t;
-
-/*
- * Quota Manager (global) structure. Lives only in core.
- */
-typedef struct xfs_qm {
-	xfs_dqlist_t	*qm_usr_dqhtable;/* udquot hash table */
-	xfs_dqlist_t	*qm_grp_dqhtable;/* gdquot hash table */
-	uint		 qm_dqhashmask;	 /* # buckets in dq hashtab - 1 */
-	xfs_frlist_t	 qm_dqfreelist;	 /* freelist of dquots */
-	atomic_t	 qm_totaldquots; /* total incore dquots */
-	uint		 qm_nrefs;	 /* file systems with quota on */
-	int		 qm_dqfree_ratio;/* ratio of free to inuse dquots */
-	kmem_zone_t	*qm_dqzone;	 /* dquot mem-alloc zone */
-	kmem_zone_t	*qm_dqtrxzone;	 /* t_dqinfo of transactions */
-} xfs_qm_t;
-
-/*
- * Various quota information for individual filesystems.
- * The mount structure keeps a pointer to this.
- */
-typedef struct xfs_quotainfo {
-	xfs_inode_t	*qi_uquotaip;	 /* user quota inode */
-	xfs_inode_t	*qi_gquotaip;	 /* group quota inode */
-	lock_t		 qi_pinlock;	 /* dquot pinning mutex */
-	xfs_dqlist_t	 qi_dqlist;	 /* all dquots in filesys */
-	int		 qi_dqreclaims;	 /* a change here indicates
-					    a removal in the dqlist */
-	time_t		 qi_btimelimit;	 /* limit for blks timer */
-	time_t		 qi_itimelimit;	 /* limit for inodes timer */
-	time_t		 qi_rtbtimelimit;/* limit for rt blks timer */
-	xfs_qwarncnt_t	 qi_bwarnlimit;	 /* limit for blks warnings */
-	xfs_qwarncnt_t	 qi_iwarnlimit;	 /* limit for inodes warnings */
-	xfs_qwarncnt_t	 qi_rtbwarnlimit;/* limit for rt blks warnings */
-	mutex_t		 qi_quotaofflock;/* to serialize quotaoff */
-	xfs_filblks_t	 qi_dqchunklen;	 /* # BBs in a chunk of dqs */
-	uint		 qi_dqperchunk;	 /* # ondisk dqs in above chunk */
-	xfs_qcnt_t	 qi_bhardlimit;	 /* default data blk hard limit */
-	xfs_qcnt_t	 qi_bsoftlimit;	 /* default data blk soft limit */
-	xfs_qcnt_t	 qi_ihardlimit;	 /* default inode count hard limit */
-	xfs_qcnt_t	 qi_isoftlimit;	 /* default inode count soft limit */
-	xfs_qcnt_t	 qi_rtbhardlimit;/* default realtime blk hard limit */
-	xfs_qcnt_t	 qi_rtbsoftlimit;/* default realtime blk soft limit */
-} xfs_quotainfo_t;
-
-
-extern xfs_dqtrxops_t	xfs_trans_dquot_ops;
-
-extern void	xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
-extern int	xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
-			xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
-extern void	xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *);
-extern void	xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *);
-
-/*
- * We keep the usr and grp dquots separately so that locking will be easier
- * to do at commit time. All transactions that we know of at this point
- * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
- */
-#define XFS_QM_TRANS_MAXDQS		2
-typedef struct xfs_dquot_acct {
-	xfs_dqtrx_t	dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
-	xfs_dqtrx_t	dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
-} xfs_dquot_acct_t;
-
-/*
- * Users are allowed to have a usage exceeding their softlimit for
- * a period this long.
- */
-#define XFS_QM_BTIMELIMIT	(7 * 24*60*60)          /* 1 week */
-#define XFS_QM_RTBTIMELIMIT	(7 * 24*60*60)          /* 1 week */
-#define XFS_QM_ITIMELIMIT	(7 * 24*60*60)          /* 1 week */
-
-#define XFS_QM_BWARNLIMIT	5
-#define XFS_QM_IWARNLIMIT	5
-#define XFS_QM_RTBWARNLIMIT	5
-
-#define XFS_QM_LOCK(xqm)	(mutex_lock(&xqm##_lock))
-#define XFS_QM_UNLOCK(xqm)	(mutex_unlock(&xqm##_lock))
-#define XFS_QM_HOLD(xqm)	((xqm)->qm_nrefs++)
-#define XFS_QM_RELE(xqm)	((xqm)->qm_nrefs--)
-
-extern void		xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern int		xfs_qm_mount_quotas(xfs_mount_t *, int);
-extern void		xfs_qm_mount_quotainit(xfs_mount_t *, uint);
-extern int		xfs_qm_quotacheck(xfs_mount_t *);
-extern void		xfs_qm_unmount_quotadestroy(xfs_mount_t *);
-extern int		xfs_qm_unmount_quotas(xfs_mount_t *);
-extern int		xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
-extern int		xfs_qm_sync(xfs_mount_t *, short);
-
-/* dquot stuff */
-extern boolean_t	xfs_qm_dqalloc_incore(xfs_dquot_t **);
-extern int		xfs_qm_dqattach(xfs_inode_t *, uint);
-extern void		xfs_qm_dqdetach(xfs_inode_t *);
-extern int		xfs_qm_dqpurge_all(xfs_mount_t *, uint);
-extern void		xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
-
-/* vop stuff */
-extern int		xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
-					uid_t, gid_t, prid_t, uint,
-					xfs_dquot_t **, xfs_dquot_t **);
-extern void		xfs_qm_vop_dqattach_and_dqmod_newinode(
-					xfs_trans_t *, xfs_inode_t *,
-					xfs_dquot_t *, xfs_dquot_t *);
-extern int		xfs_qm_vop_rename_dqattach(xfs_inode_t **);
-extern xfs_dquot_t *	xfs_qm_vop_chown(xfs_trans_t *, xfs_inode_t *,
-					xfs_dquot_t **, xfs_dquot_t *);
-extern int		xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
-					xfs_dquot_t *, xfs_dquot_t *, uint);
-
-/* list stuff */
-extern void		xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
-extern void		xfs_qm_freelist_unlink(xfs_dquot_t *);
-extern int		xfs_qm_freelist_lock_nowait(xfs_qm_t *);
-
-/* system call interface */
-extern int		xfs_qm_quotactl(bhv_desc_t *, int, int, xfs_caddr_t);
-
-#ifdef DEBUG
-extern int		xfs_qm_internalqcheck(xfs_mount_t *);
-#else
-#define xfs_qm_internalqcheck(mp)	(0)
-#endif
-
-#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
deleted file mode 100644
index 90402a1c398..00000000000
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_clnt.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
-#include "xfs_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_qm.h"
-
-#define MNTOPT_QUOTA	"quota"		/* disk quotas (user) */
-#define MNTOPT_NOQUOTA	"noquota"	/* no quotas */
-#define MNTOPT_USRQUOTA	"usrquota"	/* user quota enabled */
-#define MNTOPT_GRPQUOTA	"grpquota"	/* group quota enabled */
-#define MNTOPT_PRJQUOTA	"prjquota"	/* project quota enabled */
-#define MNTOPT_UQUOTA	"uquota"	/* user quota (IRIX variant) */
-#define MNTOPT_GQUOTA	"gquota"	/* group quota (IRIX variant) */
-#define MNTOPT_PQUOTA	"pquota"	/* project quota (IRIX variant) */
-#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */
-#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
-#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
-#define MNTOPT_QUOTANOENF  "qnoenforce"	/* same as uqnoenforce */
-
-STATIC int
-xfs_qm_parseargs(
-	struct bhv_desc		*bhv,
-	char			*options,
-	struct xfs_mount_args	*args,
-	int			update)
-{
-	size_t			length;
-	char			*local_options = options;
-	char			*this_char;
-	int			error;
-	int			referenced = update;
-
-	while ((this_char = strsep(&local_options, ",")) != NULL) {
-		length = strlen(this_char);
-		if (local_options)
-			length++;
-
-		if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
-			args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA);
-			args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA);
-			referenced = update;
-		} else if (!strcmp(this_char, MNTOPT_QUOTA) ||
-			   !strcmp(this_char, MNTOPT_UQUOTA) ||
-			   !strcmp(this_char, MNTOPT_USRQUOTA)) {
-			args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF;
-			referenced = 1;
-		} else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
-			   !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
-			args->flags |= XFSMNT_UQUOTA;
-			args->flags &= ~XFSMNT_UQUOTAENF;
-			referenced = 1;
-		} else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
-			   !strcmp(this_char, MNTOPT_PRJQUOTA)) {
-			args->flags |= XFSMNT_PQUOTA | XFSMNT_PQUOTAENF;
-			referenced = 1;
-		} else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
-			args->flags |= XFSMNT_PQUOTA;
-			args->flags &= ~XFSMNT_PQUOTAENF;
-			referenced = 1;
-		} else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
-			   !strcmp(this_char, MNTOPT_GRPQUOTA)) {
-			args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF;
-			referenced = 1;
-		} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
-			args->flags |= XFSMNT_GQUOTA;
-			args->flags &= ~XFSMNT_GQUOTAENF;
-			referenced = 1;
-		} else {
-			if (local_options)
-				*(local_options-1) = ',';
-			continue;
-		}
-
-		while (length--)
-			*this_char++ = ',';
-	}
-
-	if ((args->flags & XFSMNT_GQUOTA) && (args->flags & XFSMNT_PQUOTA)) {
-		cmn_err(CE_WARN,
-			"XFS: cannot mount with both project and group quota");
-		return XFS_ERROR(EINVAL);
-	}
-
-	PVFS_PARSEARGS(BHV_NEXT(bhv), options, args, update, error);
-	if (!error && !referenced)
-		bhv_remove_vfsops(bhvtovfs(bhv), VFS_POSITION_QM);
-	return error;
-}
-
-STATIC int
-xfs_qm_showargs(
-	struct bhv_desc		*bhv,
-	struct seq_file		*m)
-{
-	struct vfs		*vfsp = bhvtovfs(bhv);
-	struct xfs_mount	*mp = XFS_VFSTOM(vfsp);
-	int			error;
-
-	if (mp->m_qflags & XFS_UQUOTA_ACCT) {
-		(mp->m_qflags & XFS_UQUOTA_ENFD) ?
-			seq_puts(m, "," MNTOPT_USRQUOTA) :
-			seq_puts(m, "," MNTOPT_UQUOTANOENF);
-	}
-
-	if (mp->m_qflags & XFS_PQUOTA_ACCT) {
-		(mp->m_qflags & XFS_OQUOTA_ENFD) ?
-			seq_puts(m, "," MNTOPT_PRJQUOTA) :
-			seq_puts(m, "," MNTOPT_PQUOTANOENF);
-	}
-
-	if (mp->m_qflags & XFS_GQUOTA_ACCT) {
-		(mp->m_qflags & XFS_OQUOTA_ENFD) ?
-			seq_puts(m, "," MNTOPT_GRPQUOTA) :
-			seq_puts(m, "," MNTOPT_GQUOTANOENF);
-	}
-
-	if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
-		seq_puts(m, "," MNTOPT_NOQUOTA);
-
-	PVFS_SHOWARGS(BHV_NEXT(bhv), m, error);
-	return error;
-}
-
-STATIC int
-xfs_qm_mount(
-	struct bhv_desc		*bhv,
-	struct xfs_mount_args	*args,
-	struct cred		*cr)
-{
-	struct vfs		*vfsp = bhvtovfs(bhv);
-	struct xfs_mount	*mp = XFS_VFSTOM(vfsp);
-	int			error;
-
-	if (args->flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA | XFSMNT_PQUOTA))
-		xfs_qm_mount_quotainit(mp, args->flags);
-	PVFS_MOUNT(BHV_NEXT(bhv), args, cr, error);
-	return error;
-}
-
-STATIC int
-xfs_qm_syncall(
-	struct bhv_desc		*bhv,
-	int			flags,
-	cred_t			*credp)
-{
-	struct vfs		*vfsp = bhvtovfs(bhv);
-	struct xfs_mount	*mp = XFS_VFSTOM(vfsp);
-	int			error;
-
-	/*
-	 * Get the Quota Manager to flush the dquots.
-	 */
-	if (XFS_IS_QUOTA_ON(mp)) {
-		if ((error = xfs_qm_sync(mp, flags))) {
-			/*
-			 * If we got an IO error, we will be shutting down.
-			 * So, there's nothing more for us to do here.
-			 */
-			ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
-			if (XFS_FORCED_SHUTDOWN(mp)) {
-				return XFS_ERROR(error);
-			}
-		}
-	}
-	PVFS_SYNC(BHV_NEXT(bhv), flags, credp, error);
-	return error;
-}
-
-STATIC int
-xfs_qm_newmount(
-	xfs_mount_t	*mp,
-	uint		*needquotamount,
-	uint		*quotaflags)
-{
-	uint		quotaondisk;
-	uint		uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0;
-
-	*quotaflags = 0;
-	*needquotamount = B_FALSE;
-
-	quotaondisk = XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
-				(mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT);
-
-	if (quotaondisk) {
-		uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT;
-		pquotaondisk = mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT;
-		gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT;
-	}
-
-	/*
-	 * If the device itself is read-only, we can't allow
-	 * the user to change the state of quota on the mount -
-	 * this would generate a transaction on the ro device,
-	 * which would lead to an I/O error and shutdown
-	 */
-
-	if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
-	    (!uquotaondisk &&  XFS_IS_UQUOTA_ON(mp)) ||
-	     (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
-	    (!pquotaondisk &&  XFS_IS_PQUOTA_ON(mp)) ||
-	     (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
-	    (!gquotaondisk &&  XFS_IS_OQUOTA_ON(mp)))  &&
-	    xfs_dev_is_read_only(mp, "changing quota state")) {
-		cmn_err(CE_WARN,
-			"XFS: please mount with%s%s%s%s.",
-			(!quotaondisk ? "out quota" : ""),
-			(uquotaondisk ? " usrquota" : ""),
-			(pquotaondisk ? " prjquota" : ""),
-			(gquotaondisk ? " grpquota" : ""));
-		return XFS_ERROR(EPERM);
-	}
-
-	if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
-		/*
-		 * Call mount_quotas at this point only if we won't have to do
-		 * a quotacheck.
-		 */
-		if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) {
-			/*
-			 * If an error occured, qm_mount_quotas code
-			 * has already disabled quotas. So, just finish
-			 * mounting, and get on with the boring life
-			 * without disk quotas.
-			 */
-			xfs_qm_mount_quotas(mp, 0);
-		} else {
-			/*
-			 * Clear the quota flags, but remember them. This
-			 * is so that the quota code doesn't get invoked
-			 * before we're ready. This can happen when an
-			 * inode goes inactive and wants to free blocks,
-			 * or via xfs_log_mount_finish.
-			 */
-			*needquotamount = B_TRUE;
-			*quotaflags = mp->m_qflags;
-			mp->m_qflags = 0;
-		}
-	}
-
-	return 0;
-}
-
-STATIC int
-xfs_qm_endmount(
-	xfs_mount_t	*mp,
-	uint		needquotamount,
-	uint		quotaflags,
-	int		mfsi_flags)
-{
-	if (needquotamount) {
-		ASSERT(mp->m_qflags == 0);
-		mp->m_qflags = quotaflags;
-		xfs_qm_mount_quotas(mp, mfsi_flags);
-	}
-
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-	if (! (XFS_IS_QUOTA_ON(mp)))
-		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
-	else
-		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
-#endif
-
-#ifdef QUOTADEBUG
-	if (XFS_IS_QUOTA_ON(mp) && xfs_qm_internalqcheck(mp))
-		cmn_err(CE_WARN, "XFS: mount internalqcheck failed");
-#endif
-
-	return 0;
-}
-
-STATIC void
-xfs_qm_dqrele_null(
-	xfs_dquot_t	*dq)
-{
-	/*
-	 * Called from XFS, where we always check first for a NULL dquot.
-	 */
-	if (!dq)
-		return;
-	xfs_qm_dqrele(dq);
-}
-
-
-STATIC struct xfs_qmops xfs_qmcore_xfs = {
-	.xfs_qminit		= xfs_qm_newmount,
-	.xfs_qmdone		= xfs_qm_unmount_quotadestroy,
-	.xfs_qmmount		= xfs_qm_endmount,
-	.xfs_qmunmount		= xfs_qm_unmount_quotas,
-	.xfs_dqrele		= xfs_qm_dqrele_null,
-	.xfs_dqattach		= xfs_qm_dqattach,
-	.xfs_dqdetach		= xfs_qm_dqdetach,
-	.xfs_dqpurgeall		= xfs_qm_dqpurge_all,
-	.xfs_dqvopalloc		= xfs_qm_vop_dqalloc,
-	.xfs_dqvopcreate	= xfs_qm_vop_dqattach_and_dqmod_newinode,
-	.xfs_dqvoprename	= xfs_qm_vop_rename_dqattach,
-	.xfs_dqvopchown		= xfs_qm_vop_chown,
-	.xfs_dqvopchownresv	= xfs_qm_vop_chown_reserve,
-	.xfs_dqtrxops		= &xfs_trans_dquot_ops,
-};
-
-struct bhv_vfsops xfs_qmops = { {
-	BHV_IDENTITY_INIT(VFS_BHV_QM, VFS_POSITION_QM),
-	.vfs_parseargs		= xfs_qm_parseargs,
-	.vfs_showargs		= xfs_qm_showargs,
-	.vfs_mount		= xfs_qm_mount,
-	.vfs_sync		= xfs_qm_syncall,
-	.vfs_quotactl		= xfs_qm_quotactl, },
-};
-
-
-void __init
-xfs_qm_init(void)
-{
-	static char	message[] __initdata =
-		KERN_INFO "SGI XFS Quota Management subsystem\n";
-
-	printk(message);
-	mutex_init(&xfs_Gqm_lock);
-	vfs_bhv_set_custom(&xfs_qmops, &xfs_qmcore_xfs);
-	xfs_qm_init_procfs();
-}
-
-void __exit
-xfs_qm_exit(void)
-{
-	vfs_bhv_clr_custom(&xfs_qmops);
-	xfs_qm_cleanup_procfs();
-	if (qm_dqzone)
-		kmem_cache_destroy(qm_dqzone);
-	if (qm_dqtrxzone)
-		kmem_cache_destroy(qm_dqtrxzone);
-}
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
deleted file mode 100644
index 0570f773355..00000000000
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_qm.h"
-
-struct xqmstats xqmstats;
-
-STATIC int
-xfs_qm_read_xfsquota(
-	char		*buffer,
-	char		**start,
-	off_t		offset,
-	int		count,
-	int		*eof,
-	void		*data)
-{
-	int		len;
-
-	/* maximum; incore; ratio free to inuse; freelist */
-	len = sprintf(buffer, "%d\t%d\t%d\t%u\n",
-			ndquot,
-			xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
-			xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
-			xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
-
-	if (offset >= len) {
-		*start = buffer;
-		*eof = 1;
-		return 0;
-	}
-	*start = buffer + offset;
-	if ((len -= offset) > count)
-		return count;
-	*eof = 1;
-
-	return len;
-}
-
-STATIC int
-xfs_qm_read_stats(
-	char		*buffer,
-	char		**start,
-	off_t		offset,
-	int		count,
-	int		*eof,
-	void		*data)
-{
-	int		len;
-
-	/* quota performance statistics */
-	len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n",
-			xqmstats.xs_qm_dqreclaims,
-			xqmstats.xs_qm_dqreclaim_misses,
-			xqmstats.xs_qm_dquot_dups,
-			xqmstats.xs_qm_dqcachemisses,
-			xqmstats.xs_qm_dqcachehits,
-			xqmstats.xs_qm_dqwants,
-			xqmstats.xs_qm_dqshake_reclaims,
-			xqmstats.xs_qm_dqinact_reclaims);
-
-	if (offset >= len) {
-		*start = buffer;
-		*eof = 1;
-		return 0;
-	}
-	*start = buffer + offset;
-	if ((len -= offset) > count)
-		return count;
-	*eof = 1;
-
-	return len;
-}
-
-void
-xfs_qm_init_procfs(void)
-{
-	create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL);
-	create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL);
-}
-
-void
-xfs_qm_cleanup_procfs(void)
-{
-	remove_proc_entry("fs/xfs/xqm", NULL);
-	remove_proc_entry("fs/xfs/xqmstat", NULL);
-}
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h
deleted file mode 100644
index a50ffabcf55..00000000000
--- a/fs/xfs/quota/xfs_qm_stats.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2002 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_QM_STATS_H__
-#define __XFS_QM_STATS_H__
-
-#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
-
-/*
- * XQM global statistics
- */
-struct xqmstats {
-	__uint32_t		xs_qm_dqreclaims;
-	__uint32_t		xs_qm_dqreclaim_misses;
-	__uint32_t		xs_qm_dquot_dups;
-	__uint32_t		xs_qm_dqcachemisses;
-	__uint32_t		xs_qm_dqcachehits;
-	__uint32_t		xs_qm_dqwants;
-	__uint32_t		xs_qm_dqshake_reclaims;
-	__uint32_t		xs_qm_dqinact_reclaims;
-};
-
-extern struct xqmstats xqmstats;
-
-# define XQM_STATS_INC(count)	( (count)++ )
-
-extern void xfs_qm_init_procfs(void);
-extern void xfs_qm_cleanup_procfs(void);
-
-#else
-
-# define XQM_STATS_INC(count)	do { } while (0)
-
-static __inline void xfs_qm_init_procfs(void) { };
-static __inline void xfs_qm_cleanup_procfs(void) { };
-
-#endif
-
-#endif	/* __XFS_QM_STATS_H__ */
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
deleted file mode 100644
index 676884394aa..00000000000
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ /dev/null
@@ -1,1483 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include <linux/capability.h>
-
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_dmapi.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_btree.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-#include "xfs_acl.h"
-#include "xfs_cap.h"
-#include "xfs_mac.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_qm.h"
-
-#ifdef DEBUG
-# define qdprintk(s, args...)	cmn_err(CE_DEBUG, s, ## args)
-#else
-# define qdprintk(s, args...)	do { } while (0)
-#endif
-
-STATIC int	xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
-STATIC int	xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
-					fs_disk_quota_t *);
-STATIC int	xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
-STATIC int	xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
-					fs_disk_quota_t *);
-STATIC int	xfs_qm_scall_quotaon(xfs_mount_t *, uint);
-STATIC int	xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t);
-STATIC int	xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
-STATIC int	xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
-					uint);
-STATIC uint	xfs_qm_import_flags(uint);
-STATIC uint	xfs_qm_export_flags(uint);
-STATIC uint	xfs_qm_import_qtype_flags(uint);
-STATIC uint	xfs_qm_export_qtype_flags(uint);
-STATIC void	xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
-					fs_disk_quota_t *);
-
-
-/*
- * The main distribution switch of all XFS quotactl system calls.
- */
-int
-xfs_qm_quotactl(
-	struct bhv_desc *bdp,
-	int		cmd,
-	int		id,
-	xfs_caddr_t	addr)
-{
-	xfs_mount_t	*mp;
-	int		error;
-	struct vfs	*vfsp;
-
-	vfsp = bhvtovfs(bdp);
-	mp = XFS_VFSTOM(vfsp);
-
-	ASSERT(addr != NULL || cmd == Q_XQUOTASYNC);
-
-	/*
-	 * The following commands are valid even when quotaoff.
-	 */
-	switch (cmd) {
-	case Q_XQUOTARM:
-		/*
-		 * Truncate quota files. quota must be off.
-		 */
-		if (XFS_IS_QUOTA_ON(mp))
-			return XFS_ERROR(EINVAL);
-		if (vfsp->vfs_flag & VFS_RDONLY)
-			return XFS_ERROR(EROFS);
-		return (xfs_qm_scall_trunc_qfiles(mp,
-			       xfs_qm_import_qtype_flags(*(uint *)addr)));
-
-	case Q_XGETQSTAT:
-		/*
-		 * Get quota status information.
-		 */
-		return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr));
-
-	case Q_XQUOTAON:
-		/*
-		 * QUOTAON - enabling quota enforcement.
-		 * Quota accounting must be turned on at mount time.
-		 */
-		if (vfsp->vfs_flag & VFS_RDONLY)
-			return XFS_ERROR(EROFS);
-		return (xfs_qm_scall_quotaon(mp,
-					  xfs_qm_import_flags(*(uint *)addr)));
-
-	case Q_XQUOTAOFF:
-		if (vfsp->vfs_flag & VFS_RDONLY)
-			return XFS_ERROR(EROFS);
-		break;
-
-	case Q_XQUOTASYNC:
-		return (xfs_sync_inodes(mp, SYNC_DELWRI, 0, NULL));
-
-	default:
-		break;
-	}
-
-	if (! XFS_IS_QUOTA_ON(mp))
-		return XFS_ERROR(ESRCH);
-
-	switch (cmd) {
-	case Q_XQUOTAOFF:
-		if (vfsp->vfs_flag & VFS_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_quotaoff(mp,
-					    xfs_qm_import_flags(*(uint *)addr),
-					    B_FALSE);
-		break;
-
-	case Q_XGETQUOTA:
-		error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER,
-					(fs_disk_quota_t *)addr);
-		break;
-	case Q_XGETGQUOTA:
-		error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
-					(fs_disk_quota_t *)addr);
-		break;
-	case Q_XGETPQUOTA:
-		error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
-					(fs_disk_quota_t *)addr);
-		break;
-
-	case Q_XSETQLIM:
-		if (vfsp->vfs_flag & VFS_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER,
-					     (fs_disk_quota_t *)addr);
-		break;
-	case Q_XSETGQLIM:
-		if (vfsp->vfs_flag & VFS_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
-					     (fs_disk_quota_t *)addr);
-		break;
-	case Q_XSETPQLIM:
-		if (vfsp->vfs_flag & VFS_RDONLY)
-			return XFS_ERROR(EROFS);
-		error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
-					     (fs_disk_quota_t *)addr);
-		break;
-
-	default:
-		error = XFS_ERROR(EINVAL);
-		break;
-	}
-
-	return (error);
-}
-
-/*
- * Turn off quota accounting and/or enforcement for all udquots and/or
- * gdquots. Called only at unmount time.
- *
- * This assumes that there are no dquots of this file system cached
- * incore, and modifies the ondisk dquot directly. Therefore, for example,
- * it is an error to call this twice, without purging the cache.
- */
-STATIC int
-xfs_qm_scall_quotaoff(
-	xfs_mount_t		*mp,
-	uint			flags,
-	boolean_t		force)
-{
-	uint			dqtype;
-	unsigned long	s;
-	int			error;
-	uint			inactivate_flags;
-	xfs_qoff_logitem_t	*qoffstart;
-	int			nculprits;
-
-	if (!force && !capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
-	/*
-	 * No file system can have quotas enabled on disk but not in core.
-	 * Note that quota utilities (like quotaoff) _expect_
-	 * errno == EEXIST here.
-	 */
-	if ((mp->m_qflags & flags) == 0)
-		return XFS_ERROR(EEXIST);
-	error = 0;
-
-	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
-
-	/*
-	 * We don't want to deal with two quotaoffs messing up each other,
-	 * so we're going to serialize it. quotaoff isn't exactly a performance
-	 * critical thing.
-	 * If quotaoff, then we must be dealing with the root filesystem.
-	 */
-	ASSERT(mp->m_quotainfo);
-	if (mp->m_quotainfo)
-		mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
-
-	ASSERT(mp->m_quotainfo);
-
-	/*
-	 * If we're just turning off quota enforcement, change mp and go.
-	 */
-	if ((flags & XFS_ALL_QUOTA_ACCT) == 0) {
-		mp->m_qflags &= ~(flags);
-
-		s = XFS_SB_LOCK(mp);
-		mp->m_sb.sb_qflags = mp->m_qflags;
-		XFS_SB_UNLOCK(mp, s);
-		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
-
-		/* XXX what to do if error ? Revert back to old vals incore ? */
-		error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
-		return (error);
-	}
-
-	dqtype = 0;
-	inactivate_flags = 0;
-	/*
-	 * If accounting is off, we must turn enforcement off, clear the
-	 * quota 'CHKD' certificate to make it known that we have to
-	 * do a quotacheck the next time this quota is turned on.
-	 */
-	if (flags & XFS_UQUOTA_ACCT) {
-		dqtype |= XFS_QMOPT_UQUOTA;
-		flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD);
-		inactivate_flags |= XFS_UQUOTA_ACTIVE;
-	}
-	if (flags & XFS_GQUOTA_ACCT) {
-		dqtype |= XFS_QMOPT_GQUOTA;
-		flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
-		inactivate_flags |= XFS_GQUOTA_ACTIVE;
-	} else if (flags & XFS_PQUOTA_ACCT) {
-		dqtype |= XFS_QMOPT_PQUOTA;
-		flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
-		inactivate_flags |= XFS_PQUOTA_ACTIVE;
-	}
-
-	/*
-	 * Nothing to do?  Don't complain. This happens when we're just
-	 * turning off quota enforcement.
-	 */
-	if ((mp->m_qflags & flags) == 0) {
-		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
-		return (0);
-	}
-
-	/*
-	 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
-	 * and synchronously.
-	 */
-	xfs_qm_log_quotaoff(mp, &qoffstart, flags);
-
-	/*
-	 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
-	 * to take care of the race between dqget and quotaoff. We don't take
-	 * any special locks to reset these bits. All processes need to check
-	 * these bits *after* taking inode lock(s) to see if the particular
-	 * quota type is in the process of being turned off. If *ACTIVE, it is
-	 * guaranteed that all dquot structures and all quotainode ptrs will all
-	 * stay valid as long as that inode is kept locked.
-	 *
-	 * There is no turning back after this.
-	 */
-	mp->m_qflags &= ~inactivate_flags;
-
-	/*
-	 * Give back all the dquot reference(s) held by inodes.
-	 * Here we go thru every single incore inode in this file system, and
-	 * do a dqrele on the i_udquot/i_gdquot that it may have.
-	 * Essentially, as long as somebody has an inode locked, this guarantees
-	 * that quotas will not be turned off. This is handy because in a
-	 * transaction once we lock the inode(s) and check for quotaon, we can
-	 * depend on the quota inodes (and other things) being valid as long as
-	 * we keep the lock(s).
-	 */
-	xfs_qm_dqrele_all_inodes(mp, flags);
-
-	/*
-	 * Next we make the changes in the quota flag in the mount struct.
-	 * This isn't protected by a particular lock directly, because we
-	 * don't want to take a mrlock everytime we depend on quotas being on.
-	 */
-	mp->m_qflags &= ~(flags);
-
-	/*
-	 * Go through all the dquots of this file system and purge them,
-	 * according to what was turned off. We may not be able to get rid
-	 * of all dquots, because dquots can have temporary references that
-	 * are not attached to inodes. eg. xfs_setattr, xfs_create.
-	 * So, if we couldn't purge all the dquots from the filesystem,
-	 * we can't get rid of the incore data structures.
-	 */
-	while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF)))
-		delay(10 * nculprits);
-
-	/*
-	 * Transactions that had started before ACTIVE state bit was cleared
-	 * could have logged many dquots, so they'd have higher LSNs than
-	 * the first QUOTAOFF log record does. If we happen to crash when
-	 * the tail of the log has gone past the QUOTAOFF record, but
-	 * before the last dquot modification, those dquots __will__
-	 * recover, and that's not good.
-	 *
-	 * So, we have QUOTAOFF start and end logitems; the start
-	 * logitem won't get overwritten until the end logitem appears...
-	 */
-	xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
-
-	/*
-	 * If quotas is completely disabled, close shop.
-	 */
-	if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
-	    ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
-		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
-		xfs_qm_destroy_quotainfo(mp);
-		return (0);
-	}
-
-	/*
-	 * Release our quotainode references, and vn_purge them,
-	 * if we don't need them anymore.
-	 */
-	if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) {
-		XFS_PURGE_INODE(XFS_QI_UQIP(mp));
-		XFS_QI_UQIP(mp) = NULL;
-	}
-	if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) {
-		XFS_PURGE_INODE(XFS_QI_GQIP(mp));
-		XFS_QI_GQIP(mp) = NULL;
-	}
-	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
-
-	return (error);
-}
-
-STATIC int
-xfs_qm_scall_trunc_qfiles(
-	xfs_mount_t	*mp,
-	uint		flags)
-{
-	int		error;
-	xfs_inode_t	*qip;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
-	error = 0;
-	if (!XFS_SB_VERSION_HASQUOTA(&mp->m_sb) || flags == 0) {
-		qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
-		return XFS_ERROR(EINVAL);
-	}
-
-	if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
-		error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0);
-		if (! error) {
-			(void) xfs_truncate_file(mp, qip);
-			VN_RELE(XFS_ITOV(qip));
-		}
-	}
-
-	if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) &&
-	    mp->m_sb.sb_gquotino != NULLFSINO) {
-		error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
-		if (! error) {
-			(void) xfs_truncate_file(mp, qip);
-			VN_RELE(XFS_ITOV(qip));
-		}
-	}
-
-	return (error);
-}
-
-
-/*
- * Switch on (a given) quota enforcement for a filesystem.  This takes
- * effect immediately.
- * (Switching on quota accounting must be done at mount time.)
- */
-STATIC int
-xfs_qm_scall_quotaon(
-	xfs_mount_t	*mp,
-	uint		flags)
-{
-	int		error;
-	unsigned long	s;
-	uint		qf;
-	uint		accflags;
-	__int64_t	sbflags;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
-
-	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
-	/*
-	 * Switching on quota accounting must be done at mount time.
-	 */
-	accflags = flags & XFS_ALL_QUOTA_ACCT;
-	flags &= ~(XFS_ALL_QUOTA_ACCT);
-
-	sbflags = 0;
-
-	if (flags == 0) {
-		qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags);
-		return XFS_ERROR(EINVAL);
-	}
-
-	/* No fs can turn on quotas with a delayed effect */
-	ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0);
-
-	/*
-	 * Can't enforce without accounting. We check the superblock
-	 * qflags here instead of m_qflags because rootfs can have
-	 * quota acct on ondisk without m_qflags' knowing.
-	 */
-	if (((flags & XFS_UQUOTA_ACCT) == 0 &&
-	    (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
-	    (flags & XFS_UQUOTA_ENFD))
-	    ||
-	    ((flags & XFS_PQUOTA_ACCT) == 0 &&
-	    (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
-	    (flags & XFS_OQUOTA_ENFD))
-	    ||
-	    ((flags & XFS_GQUOTA_ACCT) == 0 &&
-	    (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
-	    (flags & XFS_OQUOTA_ENFD))) {
-		qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n",
-			flags, mp->m_sb.sb_qflags);
-		return XFS_ERROR(EINVAL);
-	}
-	/*
-	 * If everything's upto-date incore, then don't waste time.
-	 */
-	if ((mp->m_qflags & flags) == flags)
-		return XFS_ERROR(EEXIST);
-
-	/*
-	 * Change sb_qflags on disk but not incore mp->qflags
-	 * if this is the root filesystem.
-	 */
-	s = XFS_SB_LOCK(mp);
-	qf = mp->m_sb.sb_qflags;
-	mp->m_sb.sb_qflags = qf | flags;
-	XFS_SB_UNLOCK(mp, s);
-
-	/*
-	 * There's nothing to change if it's the same.
-	 */
-	if ((qf & flags) == flags && sbflags == 0)
-		return XFS_ERROR(EEXIST);
-	sbflags |= XFS_SB_QFLAGS;
-
-	if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
-		return (error);
-	/*
-	 * If we aren't trying to switch on quota enforcement, we are done.
-	 */
-	if  (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) !=
-	     (mp->m_qflags & XFS_UQUOTA_ACCT)) ||
-	     ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) !=
-	     (mp->m_qflags & XFS_PQUOTA_ACCT)) ||
-	     ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
-	     (mp->m_qflags & XFS_GQUOTA_ACCT)) ||
-	    (flags & XFS_ALL_QUOTA_ENFD) == 0)
-		return (0);
-
-	if (! XFS_IS_QUOTA_RUNNING(mp))
-		return XFS_ERROR(ESRCH);
-
-	/*
-	 * Switch on quota enforcement in core.
-	 */
-	mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
-	mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
-	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
-
-	return (0);
-}
-
-
-/*
- * Return quota status information, such as uquota-off, enforcements, etc.
- */
-STATIC int
-xfs_qm_scall_getqstat(
-	xfs_mount_t	*mp,
-	fs_quota_stat_t *out)
-{
-	xfs_inode_t	*uip, *gip;
-	boolean_t	tempuqip, tempgqip;
-
-	uip = gip = NULL;
-	tempuqip = tempgqip = B_FALSE;
-	memset(out, 0, sizeof(fs_quota_stat_t));
-
-	out->qs_version = FS_QSTAT_VERSION;
-	if (! XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
-		out->qs_uquota.qfs_ino = NULLFSINO;
-		out->qs_gquota.qfs_ino = NULLFSINO;
-		return (0);
-	}
-	out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
-							(XFS_ALL_QUOTA_ACCT|
-							 XFS_ALL_QUOTA_ENFD));
-	out->qs_pad = 0;
-	out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
-	out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
-
-	if (mp->m_quotainfo) {
-		uip = mp->m_quotainfo->qi_uquotaip;
-		gip = mp->m_quotainfo->qi_gquotaip;
-	}
-	if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
-		if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
-					0, 0, &uip, 0) == 0)
-			tempuqip = B_TRUE;
-	}
-	if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
-		if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
-					0, 0, &gip, 0) == 0)
-			tempgqip = B_TRUE;
-	}
-	if (uip) {
-		out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
-		out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
-		if (tempuqip)
-			VN_RELE(XFS_ITOV(uip));
-	}
-	if (gip) {
-		out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
-		out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
-		if (tempgqip)
-			VN_RELE(XFS_ITOV(gip));
-	}
-	if (mp->m_quotainfo) {
-		out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp);
-		out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp);
-		out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp);
-		out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp);
-		out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp);
-		out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp);
-	}
-	return (0);
-}
-
-/*
- * Adjust quota limits, and start/stop timers accordingly.
- */
-STATIC int
-xfs_qm_scall_setqlim(
-	xfs_mount_t		*mp,
-	xfs_dqid_t		id,
-	uint			type,
-	fs_disk_quota_t		*newlim)
-{
-	xfs_disk_dquot_t	*ddq;
-	xfs_dquot_t		*dqp;
-	xfs_trans_t		*tp;
-	int			error;
-	xfs_qcnt_t		hard, soft;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
-
-	if ((newlim->d_fieldmask &
-	    (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0)
-		return (0);
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
-	if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
-				      0, 0, XFS_DEFAULT_LOG_COUNT))) {
-		xfs_trans_cancel(tp, 0);
-		return (error);
-	}
-
-	/*
-	 * We don't want to race with a quotaoff so take the quotaoff lock.
-	 * (We don't hold an inode lock, so there's nothing else to stop
-	 * a quotaoff from happening). (XXXThis doesn't currently happen
-	 * because we take the vfslock before calling xfs_qm_sysent).
-	 */
-	mutex_lock(&(XFS_QI_QOFFLOCK(mp)));
-
-	/*
-	 * Get the dquot (locked), and join it to the transaction.
-	 * Allocate the dquot if this doesn't exist.
-	 */
-	if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
-		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
-		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
-		ASSERT(error != ENOENT);
-		return (error);
-	}
-	xfs_dqtrace_entry(dqp, "Q_SETQLIM: AFT DQGET");
-	xfs_trans_dqjoin(tp, dqp);
-	ddq = &dqp->q_core;
-
-	/*
-	 * Make sure that hardlimits are >= soft limits before changing.
-	 */
-	hard = (newlim->d_fieldmask & FS_DQ_BHARD) ?
-		(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) :
-			be64_to_cpu(ddq->d_blk_hardlimit);
-	soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ?
-		(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) :
-			be64_to_cpu(ddq->d_blk_softlimit);
-	if (hard == 0 || hard >= soft) {
-		ddq->d_blk_hardlimit = cpu_to_be64(hard);
-		ddq->d_blk_softlimit = cpu_to_be64(soft);
-		if (id == 0) {
-			mp->m_quotainfo->qi_bhardlimit = hard;
-			mp->m_quotainfo->qi_bsoftlimit = soft;
-		}
-	} else {
-		qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
-	}
-	hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
-		(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
-			be64_to_cpu(ddq->d_rtb_hardlimit);
-	soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ?
-		(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) :
-			be64_to_cpu(ddq->d_rtb_softlimit);
-	if (hard == 0 || hard >= soft) {
-		ddq->d_rtb_hardlimit = cpu_to_be64(hard);
-		ddq->d_rtb_softlimit = cpu_to_be64(soft);
-		if (id == 0) {
-			mp->m_quotainfo->qi_rtbhardlimit = hard;
-			mp->m_quotainfo->qi_rtbsoftlimit = soft;
-		}
-	} else {
-		qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
-	}
-
-	hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
-		(xfs_qcnt_t) newlim->d_ino_hardlimit :
-			be64_to_cpu(ddq->d_ino_hardlimit);
-	soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ?
-		(xfs_qcnt_t) newlim->d_ino_softlimit :
-			be64_to_cpu(ddq->d_ino_softlimit);
-	if (hard == 0 || hard >= soft) {
-		ddq->d_ino_hardlimit = cpu_to_be64(hard);
-		ddq->d_ino_softlimit = cpu_to_be64(soft);
-		if (id == 0) {
-			mp->m_quotainfo->qi_ihardlimit = hard;
-			mp->m_quotainfo->qi_isoftlimit = soft;
-		}
-	} else {
-		qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
-	}
-
-	/*
-	 * Update warnings counter(s) if requested
-	 */
-	if (newlim->d_fieldmask & FS_DQ_BWARNS)
-		ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns);
-	if (newlim->d_fieldmask & FS_DQ_IWARNS)
-		ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns);
-	if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
-		ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns);
-
-	if (id == 0) {
-		/*
-		 * Timelimits for the super user set the relative time
-		 * the other users can be over quota for this file system.
-		 * If it is zero a default is used.  Ditto for the default
-		 * soft and hard limit values (already done, above), and
-		 * for warnings.
-		 */
-		if (newlim->d_fieldmask & FS_DQ_BTIMER) {
-			mp->m_quotainfo->qi_btimelimit = newlim->d_btimer;
-			ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
-		}
-		if (newlim->d_fieldmask & FS_DQ_ITIMER) {
-			mp->m_quotainfo->qi_itimelimit = newlim->d_itimer;
-			ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
-		}
-		if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
-			mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer;
-			ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
-		}
-		if (newlim->d_fieldmask & FS_DQ_BWARNS)
-			mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns;
-		if (newlim->d_fieldmask & FS_DQ_IWARNS)
-			mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns;
-		if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
-			mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns;
-	} else {
-		/*
-		 * If the user is now over quota, start the timelimit.
-		 * The user will not be 'warned'.
-		 * Note that we keep the timers ticking, whether enforcement
-		 * is on or off. We don't really want to bother with iterating
-		 * over all ondisk dquots and turning the timers on/off.
-		 */
-		xfs_qm_adjust_dqtimers(mp, ddq);
-	}
-	dqp->dq_flags |= XFS_DQ_DIRTY;
-	xfs_trans_log_dquot(tp, dqp);
-
-	xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
-	xfs_trans_commit(tp, 0, NULL);
-	xfs_qm_dqprint(dqp);
-	xfs_qm_dqrele(dqp);
-	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
-
-	return (0);
-}
-
-STATIC int
-xfs_qm_scall_getquota(
-	xfs_mount_t	*mp,
-	xfs_dqid_t	id,
-	uint		type,
-	fs_disk_quota_t *out)
-{
-	xfs_dquot_t	*dqp;
-	int		error;
-
-	/*
-	 * Try to get the dquot. We don't want it allocated on disk, so
-	 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
-	 * exist, we'll get ENOENT back.
-	 */
-	if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) {
-		return (error);
-	}
-
-	xfs_dqtrace_entry(dqp, "Q_GETQUOTA SUCCESS");
-	/*
-	 * If everything's NULL, this dquot doesn't quite exist as far as
-	 * our utility programs are concerned.
-	 */
-	if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
-		xfs_qm_dqput(dqp);
-		return XFS_ERROR(ENOENT);
-	}
-	/* xfs_qm_dqprint(dqp); */
-	/*
-	 * Convert the disk dquot to the exportable format
-	 */
-	xfs_qm_export_dquot(mp, &dqp->q_core, out);
-	xfs_qm_dqput(dqp);
-	return (error ? XFS_ERROR(EFAULT) : 0);
-}
-
-
-STATIC int
-xfs_qm_log_quotaoff_end(
-	xfs_mount_t		*mp,
-	xfs_qoff_logitem_t	*startqoff,
-	uint			flags)
-{
-	xfs_trans_t		*tp;
-	int			error;
-	xfs_qoff_logitem_t	*qoffi;
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
-
-	if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2,
-				      0, 0, XFS_DEFAULT_LOG_COUNT))) {
-		xfs_trans_cancel(tp, 0);
-		return (error);
-	}
-
-	qoffi = xfs_trans_get_qoff_item(tp, startqoff,
-					flags & XFS_ALL_QUOTA_ACCT);
-	xfs_trans_log_quotaoff_item(tp, qoffi);
-
-	/*
-	 * We have to make sure that the transaction is secure on disk before we
-	 * return and actually stop quota accounting. So, make it synchronous.
-	 * We don't care about quotoff's performance.
-	 */
-	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0, NULL);
-	return (error);
-}
-
-
-STATIC int
-xfs_qm_log_quotaoff(
-	xfs_mount_t	       *mp,
-	xfs_qoff_logitem_t     **qoffstartp,
-	uint		       flags)
-{
-	xfs_trans_t	       *tp;
-	int			error;
-	unsigned long	s;
-	xfs_qoff_logitem_t     *qoffi=NULL;
-	uint			oldsbqflag=0;
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
-	if ((error = xfs_trans_reserve(tp, 0,
-				      sizeof(xfs_qoff_logitem_t) * 2 +
-				      mp->m_sb.sb_sectsize + 128,
-				      0,
-				      0,
-				      XFS_DEFAULT_LOG_COUNT))) {
-		goto error0;
-	}
-
-	qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
-	xfs_trans_log_quotaoff_item(tp, qoffi);
-
-	s = XFS_SB_LOCK(mp);
-	oldsbqflag = mp->m_sb.sb_qflags;
-	mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
-	XFS_SB_UNLOCK(mp, s);
-
-	xfs_mod_sb(tp, XFS_SB_QFLAGS);
-
-	/*
-	 * We have to make sure that the transaction is secure on disk before we
-	 * return and actually stop quota accounting. So, make it synchronous.
-	 * We don't care about quotoff's performance.
-	 */
-	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0, NULL);
-
-error0:
-	if (error) {
-		xfs_trans_cancel(tp, 0);
-		/*
-		 * No one else is modifying sb_qflags, so this is OK.
-		 * We still hold the quotaofflock.
-		 */
-		s = XFS_SB_LOCK(mp);
-		mp->m_sb.sb_qflags = oldsbqflag;
-		XFS_SB_UNLOCK(mp, s);
-	}
-	*qoffstartp = qoffi;
-	return (error);
-}
-
-
-/*
- * Translate an internal style on-disk-dquot to the exportable format.
- * The main differences are that the counters/limits are all in Basic
- * Blocks (BBs) instead of the internal FSBs, and all on-disk data has
- * to be converted to the native endianness.
- */
-STATIC void
-xfs_qm_export_dquot(
-	xfs_mount_t		*mp,
-	xfs_disk_dquot_t	*src,
-	struct fs_disk_quota	*dst)
-{
-	memset(dst, 0, sizeof(*dst));
-	dst->d_version = FS_DQUOT_VERSION;  /* different from src->d_version */
-	dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags);
-	dst->d_id = be32_to_cpu(src->d_id);
-	dst->d_blk_hardlimit =
-		XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit));
-	dst->d_blk_softlimit =
-		XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit));
-	dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit);
-	dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit);
-	dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount));
-	dst->d_icount = be64_to_cpu(src->d_icount);
-	dst->d_btimer = be32_to_cpu(src->d_btimer);
-	dst->d_itimer = be32_to_cpu(src->d_itimer);
-	dst->d_iwarns = be16_to_cpu(src->d_iwarns);
-	dst->d_bwarns = be16_to_cpu(src->d_bwarns);
-	dst->d_rtb_hardlimit =
-		XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit));
-	dst->d_rtb_softlimit =
-		XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit));
-	dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount));
-	dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer);
-	dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns);
-
-	/*
-	 * Internally, we don't reset all the timers when quota enforcement
-	 * gets turned off. No need to confuse the userlevel code,
-	 * so return zeroes in that case.
-	 */
-	if (! XFS_IS_QUOTA_ENFORCED(mp)) {
-		dst->d_btimer = 0;
-		dst->d_itimer = 0;
-		dst->d_rtbtimer = 0;
-	}
-
-#ifdef DEBUG
-	if (XFS_IS_QUOTA_ENFORCED(mp) && dst->d_id != 0) {
-		if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
-		    (dst->d_blk_softlimit > 0)) {
-			ASSERT(dst->d_btimer != 0);
-		}
-		if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) &&
-		    (dst->d_ino_softlimit > 0)) {
-			ASSERT(dst->d_itimer != 0);
-		}
-	}
-#endif
-}
-
-STATIC uint
-xfs_qm_import_qtype_flags(
-	uint		uflags)
-{
-	uint		oflags = 0;
-
-	/*
-	 * Can't be more than one, or none.
-	 */
-	if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ==
-			(XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ||
-	    ((uflags & (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ==
-			(XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ||
-	    ((uflags & (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ==
-			(XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ||
-	    ((uflags & (XFS_GROUP_QUOTA|XFS_USER_QUOTA|XFS_PROJ_QUOTA)) == 0))
-		return (0);
-
-	oflags |= (uflags & XFS_USER_QUOTA) ? XFS_DQ_USER : 0;
-	oflags |= (uflags & XFS_PROJ_QUOTA) ? XFS_DQ_PROJ : 0;
-	oflags |= (uflags & XFS_GROUP_QUOTA) ? XFS_DQ_GROUP: 0;
-	return oflags;
-}
-
-STATIC uint
-xfs_qm_export_qtype_flags(
-	uint flags)
-{
-	/*
-	 * Can't be more than one, or none.
-	 */
-	ASSERT((flags & (XFS_PROJ_QUOTA | XFS_USER_QUOTA)) !=
-		(XFS_PROJ_QUOTA | XFS_USER_QUOTA));
-	ASSERT((flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)) !=
-		(XFS_PROJ_QUOTA | XFS_GROUP_QUOTA));
-	ASSERT((flags & (XFS_USER_QUOTA | XFS_GROUP_QUOTA)) !=
-		(XFS_USER_QUOTA | XFS_GROUP_QUOTA));
-	ASSERT((flags & (XFS_PROJ_QUOTA|XFS_USER_QUOTA|XFS_GROUP_QUOTA)) != 0);
-
-	return (flags & XFS_DQ_USER) ?
-		XFS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
-			XFS_PROJ_QUOTA : XFS_GROUP_QUOTA;
-}
-
-STATIC uint
-xfs_qm_import_flags(
-	uint uflags)
-{
-	uint flags = 0;
-
-	if (uflags & XFS_QUOTA_UDQ_ACCT)
-		flags |= XFS_UQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_PDQ_ACCT)
-		flags |= XFS_PQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_GDQ_ACCT)
-		flags |= XFS_GQUOTA_ACCT;
-	if (uflags & XFS_QUOTA_UDQ_ENFD)
-		flags |= XFS_UQUOTA_ENFD;
-	if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
-		flags |= XFS_OQUOTA_ENFD;
-	return (flags);
-}
-
-
-STATIC uint
-xfs_qm_export_flags(
-	uint flags)
-{
-	uint uflags;
-
-	uflags = 0;
-	if (flags & XFS_UQUOTA_ACCT)
-		uflags |= XFS_QUOTA_UDQ_ACCT;
-	if (flags & XFS_PQUOTA_ACCT)
-		uflags |= XFS_QUOTA_PDQ_ACCT;
-	if (flags & XFS_GQUOTA_ACCT)
-		uflags |= XFS_QUOTA_GDQ_ACCT;
-	if (flags & XFS_UQUOTA_ENFD)
-		uflags |= XFS_QUOTA_UDQ_ENFD;
-	if (flags & (XFS_OQUOTA_ENFD)) {
-		uflags |= (flags & XFS_GQUOTA_ACCT) ?
-			XFS_QUOTA_GDQ_ENFD : XFS_QUOTA_PDQ_ENFD;
-	}
-	return (uflags);
-}
-
-
-/*
- * Go thru all the inodes in the file system, releasing their dquots.
- * Note that the mount structure gets modified to indicate that quotas are off
- * AFTER this, in the case of quotaoff. This also gets called from
- * xfs_rootumount.
- */
-void
-xfs_qm_dqrele_all_inodes(
-	struct xfs_mount *mp,
-	uint		 flags)
-{
-	xfs_inode_t	*ip, *topino;
-	uint		ireclaims;
-	vnode_t		*vp;
-	boolean_t	vnode_refd;
-
-	ASSERT(mp->m_quotainfo);
-
-	XFS_MOUNT_ILOCK(mp);
-again:
-	ip = mp->m_inodes;
-	if (ip == NULL) {
-		XFS_MOUNT_IUNLOCK(mp);
-		return;
-	}
-	do {
-		/* Skip markers inserted by xfs_sync */
-		if (ip->i_mount == NULL) {
-			ip = ip->i_mnext;
-			continue;
-		}
-		/* Root inode, rbmip and rsumip have associated blocks */
-		if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
-			ASSERT(ip->i_udquot == NULL);
-			ASSERT(ip->i_gdquot == NULL);
-			ip = ip->i_mnext;
-			continue;
-		}
-		vp = XFS_ITOV_NULL(ip);
-		if (!vp) {
-			ASSERT(ip->i_udquot == NULL);
-			ASSERT(ip->i_gdquot == NULL);
-			ip = ip->i_mnext;
-			continue;
-		}
-		vnode_refd = B_FALSE;
-		if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
-			ireclaims = mp->m_ireclaims;
-			topino = mp->m_inodes;
-			vp = vn_grab(vp);
-			if (!vp)
-				goto again;
-
-			XFS_MOUNT_IUNLOCK(mp);
-			/* XXX restart limit ? */
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			vnode_refd = B_TRUE;
-		} else {
-			ireclaims = mp->m_ireclaims;
-			topino = mp->m_inodes;
-			XFS_MOUNT_IUNLOCK(mp);
-		}
-
-		/*
-		 * We don't keep the mountlock across the dqrele() call,
-		 * since it can take a while..
-		 */
-		if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
-			xfs_qm_dqrele(ip->i_udquot);
-			ip->i_udquot = NULL;
-		}
-		if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
-			xfs_qm_dqrele(ip->i_gdquot);
-			ip->i_gdquot = NULL;
-		}
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		/*
-		 * Wait until we've dropped the ilock and mountlock to
-		 * do the vn_rele. Or be condemned to an eternity in the
-		 * inactive code in hell.
-		 */
-		if (vnode_refd)
-			VN_RELE(vp);
-		XFS_MOUNT_ILOCK(mp);
-		/*
-		 * If an inode was inserted or removed, we gotta
-		 * start over again.
-		 */
-		if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) {
-			/* XXX use a sentinel */
-			goto again;
-		}
-		ip = ip->i_mnext;
-	} while (ip != mp->m_inodes);
-
-	XFS_MOUNT_IUNLOCK(mp);
-}
-
-/*------------------------------------------------------------------------*/
-#ifdef DEBUG
-/*
- * This contains all the test functions for XFS disk quotas.
- * Currently it does a quota accounting check. ie. it walks through
- * all inodes in the file system, calculating the dquot accounting fields,
- * and prints out any inconsistencies.
- */
-xfs_dqhash_t *qmtest_udqtab;
-xfs_dqhash_t *qmtest_gdqtab;
-int	      qmtest_hashmask;
-int	      qmtest_nfails;
-mutex_t	      qcheck_lock;
-
-#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
-				 (__psunsigned_t)(id)) & \
-				(qmtest_hashmask - 1))
-
-#define DQTEST_HASH(mp, id, type)   ((type & XFS_DQ_USER) ? \
-				     (qmtest_udqtab + \
-				      DQTEST_HASHVAL(mp, id)) : \
-				     (qmtest_gdqtab + \
-				      DQTEST_HASHVAL(mp, id)))
-
-#define DQTEST_LIST_PRINT(l, NXT, title) \
-{ \
-	  xfs_dqtest_t	*dqp; int i = 0;\
-	  cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \
-	  for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
-	       dqp = (xfs_dqtest_t *)dqp->NXT) { \
-		cmn_err(CE_DEBUG, "  %d. \"%d (%s)\"  bcnt = %d, icnt = %d", \
-			 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp),	     \
-			 dqp->d_bcount, dqp->d_icount); } \
-}
-
-typedef struct dqtest {
-	xfs_dqmarker_t	q_lists;
-	xfs_dqhash_t	*q_hash;	/* the hashchain header */
-	xfs_mount_t	*q_mount;	/* filesystem this relates to */
-	xfs_dqid_t	d_id;		/* user id or group id */
-	xfs_qcnt_t	d_bcount;	/* # disk blocks owned by the user */
-	xfs_qcnt_t	d_icount;	/* # inodes owned by the user */
-} xfs_dqtest_t;
-
-STATIC void
-xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
-{
-	xfs_dquot_t *d;
-	if (((d) = (h)->qh_next))
-		(d)->HL_PREVP = &((dqp)->HL_NEXT);
-	(dqp)->HL_NEXT = d;
-	(dqp)->HL_PREVP = &((h)->qh_next);
-	(h)->qh_next = (xfs_dquot_t *)dqp;
-	(h)->qh_version++;
-	(h)->qh_nelems++;
-}
-STATIC void
-xfs_qm_dqtest_print(
-	xfs_dqtest_t	*d)
-{
-	cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------");
-	cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id);
-	cmn_err(CE_DEBUG, "---- fs       = 0x%p", d->q_mount);
-	cmn_err(CE_DEBUG, "---- bcount   = %Lu (0x%x)",
-		d->d_bcount, (int)d->d_bcount);
-	cmn_err(CE_DEBUG, "---- icount   = %Lu (0x%x)",
-		d->d_icount, (int)d->d_icount);
-	cmn_err(CE_DEBUG, "---------------------------");
-}
-
-STATIC void
-xfs_qm_dqtest_failed(
-	xfs_dqtest_t	*d,
-	xfs_dquot_t	*dqp,
-	char		*reason,
-	xfs_qcnt_t	a,
-	xfs_qcnt_t	b,
-	int		error)
-{
-	qmtest_nfails++;
-	if (error)
-		cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s",
-		       d->d_id, error, reason);
-	else
-		cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]",
-		       d->d_id, reason, (int)a, (int)b);
-	xfs_qm_dqtest_print(d);
-	if (dqp)
-		xfs_qm_dqprint(dqp);
-}
-
-STATIC int
-xfs_dqtest_cmp2(
-	xfs_dqtest_t	*d,
-	xfs_dquot_t	*dqp)
-{
-	int err = 0;
-	if (be64_to_cpu(dqp->q_core.d_icount) != d->d_icount) {
-		xfs_qm_dqtest_failed(d, dqp, "icount mismatch",
-			be64_to_cpu(dqp->q_core.d_icount),
-			d->d_icount, 0);
-		err++;
-	}
-	if (be64_to_cpu(dqp->q_core.d_bcount) != d->d_bcount) {
-		xfs_qm_dqtest_failed(d, dqp, "bcount mismatch",
-			be64_to_cpu(dqp->q_core.d_bcount),
-			d->d_bcount, 0);
-		err++;
-	}
-	if (dqp->q_core.d_blk_softlimit &&
-	    be64_to_cpu(dqp->q_core.d_bcount) >=
-	    be64_to_cpu(dqp->q_core.d_blk_softlimit)) {
-		if (!dqp->q_core.d_btimer && dqp->q_core.d_id) {
-			cmn_err(CE_DEBUG,
-				"%d [%s] [0x%p] BLK TIMER NOT STARTED",
-				d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
-			err++;
-		}
-	}
-	if (dqp->q_core.d_ino_softlimit &&
-	    be64_to_cpu(dqp->q_core.d_icount) >=
-	    be64_to_cpu(dqp->q_core.d_ino_softlimit)) {
-		if (!dqp->q_core.d_itimer && dqp->q_core.d_id) {
-			cmn_err(CE_DEBUG,
-				"%d [%s] [0x%p] INO TIMER NOT STARTED",
-				d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
-			err++;
-		}
-	}
-#ifdef QUOTADEBUG
-	if (!err) {
-		cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked",
-			d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
-	}
-#endif
-	return (err);
-}
-
-STATIC void
-xfs_dqtest_cmp(
-	xfs_dqtest_t	*d)
-{
-	xfs_dquot_t	*dqp;
-	int		error;
-
-	/* xfs_qm_dqtest_print(d); */
-	if ((error = xfs_qm_dqget(d->q_mount, NULL, d->d_id, d->dq_flags, 0,
-				 &dqp))) {
-		xfs_qm_dqtest_failed(d, NULL, "dqget failed", 0, 0, error);
-		return;
-	}
-	xfs_dqtest_cmp2(d, dqp);
-	xfs_qm_dqput(dqp);
-}
-
-STATIC int
-xfs_qm_internalqcheck_dqget(
-	xfs_mount_t	*mp,
-	xfs_dqid_t	id,
-	uint		type,
-	xfs_dqtest_t	**O_dq)
-{
-	xfs_dqtest_t	*d;
-	xfs_dqhash_t	*h;
-
-	h = DQTEST_HASH(mp, id, type);
-	for (d = (xfs_dqtest_t *) h->qh_next; d != NULL;
-	     d = (xfs_dqtest_t *) d->HL_NEXT) {
-		/* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
-		if (d->d_id == id && mp == d->q_mount) {
-			*O_dq = d;
-			return (0);
-		}
-	}
-	d = kmem_zalloc(sizeof(xfs_dqtest_t), KM_SLEEP);
-	d->dq_flags = type;
-	d->d_id = id;
-	d->q_mount = mp;
-	d->q_hash = h;
-	xfs_qm_hashinsert(h, d);
-	*O_dq = d;
-	return (0);
-}
-
-STATIC void
-xfs_qm_internalqcheck_get_dquots(
-	xfs_mount_t	*mp,
-	xfs_dqid_t	uid,
-	xfs_dqid_t	projid,
-	xfs_dqid_t	gid,
-	xfs_dqtest_t	**ud,
-	xfs_dqtest_t	**gd)
-{
-	if (XFS_IS_UQUOTA_ON(mp))
-		xfs_qm_internalqcheck_dqget(mp, uid, XFS_DQ_USER, ud);
-	if (XFS_IS_GQUOTA_ON(mp))
-		xfs_qm_internalqcheck_dqget(mp, gid, XFS_DQ_GROUP, gd);
-	else if (XFS_IS_PQUOTA_ON(mp))
-		xfs_qm_internalqcheck_dqget(mp, projid, XFS_DQ_PROJ, gd);
-}
-
-
-STATIC void
-xfs_qm_internalqcheck_dqadjust(
-	xfs_inode_t		*ip,
-	xfs_dqtest_t		*d)
-{
-	d->d_icount++;
-	d->d_bcount += (xfs_qcnt_t)ip->i_d.di_nblocks;
-}
-
-STATIC int
-xfs_qm_internalqcheck_adjust(
-	xfs_mount_t	*mp,		/* mount point for filesystem */
-	xfs_ino_t	ino,		/* inode number to get data for */
-	void		__user *buffer,	/* not used */
-	int		ubsize,		/* not used */
-	void		*private_data,	/* not used */
-	xfs_daddr_t	bno,		/* starting block of inode cluster */
-	int		*ubused,	/* not used */
-	void		*dip,		/* not used */
-	int		*res)		/* bulkstat result code */
-{
-	xfs_inode_t		*ip;
-	xfs_dqtest_t		*ud, *gd;
-	uint			lock_flags;
-	boolean_t		ipreleased;
-	int			error;
-
-	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-	if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
-		*res = BULKSTAT_RV_NOTHING;
-		qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n",
-			(unsigned long long) ino,
-			(unsigned long long) mp->m_sb.sb_uquotino,
-			(unsigned long long) mp->m_sb.sb_gquotino);
-		return XFS_ERROR(EINVAL);
-	}
-	ipreleased = B_FALSE;
- again:
-	lock_flags = XFS_ILOCK_SHARED;
-	if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip, bno))) {
-		*res = BULKSTAT_RV_NOTHING;
-		return (error);
-	}
-
-	if (ip->i_d.di_mode == 0) {
-		xfs_iput_new(ip, lock_flags);
-		*res = BULKSTAT_RV_NOTHING;
-		return XFS_ERROR(ENOENT);
-	}
-
-	/*
-	 * This inode can have blocks after eof which can get released
-	 * when we send it to inactive. Since we don't check the dquot
-	 * until the after all our calculations are done, we must get rid
-	 * of those now.
-	 */
-	if (! ipreleased) {
-		xfs_iput(ip, lock_flags);
-		ipreleased = B_TRUE;
-		goto again;
-	}
-	xfs_qm_internalqcheck_get_dquots(mp,
-					(xfs_dqid_t) ip->i_d.di_uid,
-					(xfs_dqid_t) ip->i_d.di_projid,
-					(xfs_dqid_t) ip->i_d.di_gid,
-					&ud, &gd);
-	if (XFS_IS_UQUOTA_ON(mp)) {
-		ASSERT(ud);
-		xfs_qm_internalqcheck_dqadjust(ip, ud);
-	}
-	if (XFS_IS_OQUOTA_ON(mp)) {
-		ASSERT(gd);
-		xfs_qm_internalqcheck_dqadjust(ip, gd);
-	}
-	xfs_iput(ip, lock_flags);
-	*res = BULKSTAT_RV_DIDONE;
-	return (0);
-}
-
-
-/* PRIVATE, debugging */
-int
-xfs_qm_internalqcheck(
-	xfs_mount_t	*mp)
-{
-	xfs_ino_t	lastino;
-	int		done, count;
-	int		i;
-	xfs_dqtest_t	*d, *e;
-	xfs_dqhash_t	*h1;
-	int		error;
-
-	lastino = 0;
-	qmtest_hashmask = 32;
-	count = 5;
-	done = 0;
-	qmtest_nfails = 0;
-
-	if (! XFS_IS_QUOTA_ON(mp))
-		return XFS_ERROR(ESRCH);
-
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
-	XFS_bflush(mp->m_ddev_targp);
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
-	XFS_bflush(mp->m_ddev_targp);
-
-	mutex_lock(&qcheck_lock);
-	/* There should be absolutely no quota activity while this
-	   is going on. */
-	qmtest_udqtab = kmem_zalloc(qmtest_hashmask *
-				    sizeof(xfs_dqhash_t), KM_SLEEP);
-	qmtest_gdqtab = kmem_zalloc(qmtest_hashmask *
-				    sizeof(xfs_dqhash_t), KM_SLEEP);
-	do {
-		/*
-		 * Iterate thru all the inodes in the file system,
-		 * adjusting the corresponding dquot counters
-		 */
-		if ((error = xfs_bulkstat(mp, &lastino, &count,
-				 xfs_qm_internalqcheck_adjust, NULL,
-				 0, NULL, BULKSTAT_FG_IGET, &done))) {
-			break;
-		}
-	} while (! done);
-	if (error) {
-		cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error);
-	}
-	cmn_err(CE_DEBUG, "Checking results against system dquots");
-	for (i = 0; i < qmtest_hashmask; i++) {
-		h1 = &qmtest_udqtab[i];
-		for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
-			xfs_dqtest_cmp(d);
-			e = (xfs_dqtest_t *) d->HL_NEXT;
-			kmem_free(d, sizeof(xfs_dqtest_t));
-			d = e;
-		}
-		h1 = &qmtest_gdqtab[i];
-		for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
-			xfs_dqtest_cmp(d);
-			e = (xfs_dqtest_t *) d->HL_NEXT;
-			kmem_free(d, sizeof(xfs_dqtest_t));
-			d = e;
-		}
-	}
-
-	if (qmtest_nfails) {
-		cmn_err(CE_DEBUG, "******** quotacheck failed  ********");
-		cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails);
-	} else {
-		cmn_err(CE_DEBUG, "******** quotacheck successful! ********");
-	}
-	kmem_free(qmtest_udqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
-	kmem_free(qmtest_gdqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
-	mutex_unlock(&qcheck_lock);
-	return (qmtest_nfails);
-}
-
-#endif /* DEBUG */
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
deleted file mode 100644
index b7ddd04aae3..00000000000
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_QUOTA_PRIV_H__
-#define __XFS_QUOTA_PRIV_H__
-
-/*
- * Number of bmaps that we ask from bmapi when doing a quotacheck.
- * We make this restriction to keep the memory usage to a minimum.
- */
-#define XFS_DQITER_MAP_SIZE	10
-
-/* Number of dquots that fit in to a dquot block */
-#define XFS_QM_DQPERBLK(mp)	((mp)->m_quotainfo->qi_dqperchunk)
-
-#define XFS_ISLOCKED_INODE(ip)		(ismrlocked(&(ip)->i_lock, \
-					    MR_UPDATE | MR_ACCESS) != 0)
-#define XFS_ISLOCKED_INODE_EXCL(ip)	(ismrlocked(&(ip)->i_lock, \
-					    MR_UPDATE) != 0)
-
-#define XFS_DQ_IS_ADDEDTO_TRX(t, d)	((d)->q_transp == (t))
-
-#define XFS_QI_MPLRECLAIMS(mp)	((mp)->m_quotainfo->qi_dqreclaims)
-#define XFS_QI_UQIP(mp)		((mp)->m_quotainfo->qi_uquotaip)
-#define XFS_QI_GQIP(mp)		((mp)->m_quotainfo->qi_gquotaip)
-#define XFS_QI_DQCHUNKLEN(mp)	((mp)->m_quotainfo->qi_dqchunklen)
-#define XFS_QI_BTIMELIMIT(mp)	((mp)->m_quotainfo->qi_btimelimit)
-#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
-#define XFS_QI_ITIMELIMIT(mp)	((mp)->m_quotainfo->qi_itimelimit)
-#define XFS_QI_BWARNLIMIT(mp)	((mp)->m_quotainfo->qi_bwarnlimit)
-#define XFS_QI_RTBWARNLIMIT(mp)	((mp)->m_quotainfo->qi_rtbwarnlimit)
-#define XFS_QI_IWARNLIMIT(mp)	((mp)->m_quotainfo->qi_iwarnlimit)
-#define XFS_QI_QOFFLOCK(mp)	((mp)->m_quotainfo->qi_quotaofflock)
-
-#define XFS_QI_MPL_LIST(mp)	((mp)->m_quotainfo->qi_dqlist)
-#define XFS_QI_MPLLOCK(mp)	((mp)->m_quotainfo->qi_dqlist.qh_lock)
-#define XFS_QI_MPLNEXT(mp)	((mp)->m_quotainfo->qi_dqlist.qh_next)
-#define XFS_QI_MPLNDQUOTS(mp)	((mp)->m_quotainfo->qi_dqlist.qh_nelems)
-
-#define XQMLCK(h)			(mutex_lock(&((h)->qh_lock)))
-#define XQMUNLCK(h)			(mutex_unlock(&((h)->qh_lock)))
-#ifdef DEBUG
-struct xfs_dqhash;
-static inline int XQMISLCKD(struct xfs_dqhash *h)
-{
-	if (mutex_trylock(&h->qh_lock)) {
-		mutex_unlock(&h->qh_lock);
-		return 0;
-	}
-	return 1;
-}
-#endif
-
-#define XFS_DQ_HASH_LOCK(h)		XQMLCK(h)
-#define XFS_DQ_HASH_UNLOCK(h)		XQMUNLCK(h)
-#define XFS_DQ_IS_HASH_LOCKED(h)	XQMISLCKD(h)
-
-#define xfs_qm_mplist_lock(mp)		XQMLCK(&(XFS_QI_MPL_LIST(mp)))
-#define xfs_qm_mplist_unlock(mp)	XQMUNLCK(&(XFS_QI_MPL_LIST(mp)))
-#define XFS_QM_IS_MPLIST_LOCKED(mp)	XQMISLCKD(&(XFS_QI_MPL_LIST(mp)))
-
-#define xfs_qm_freelist_lock(qm)	XQMLCK(&((qm)->qm_dqfreelist))
-#define xfs_qm_freelist_unlock(qm)	XQMUNLCK(&((qm)->qm_dqfreelist))
-#define XFS_QM_IS_FREELIST_LOCKED(qm)	XQMISLCKD(&((qm)->qm_dqfreelist))
-
-/*
- * Hash into a bucket in the dquot hash table, based on <mp, id>.
- */
-#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
-				 (__psunsigned_t)(id)) & \
-				(xfs_Gqm->qm_dqhashmask - 1))
-#define XFS_DQ_HASH(mp, id, type)   (type == XFS_DQ_USER ? \
-				     (xfs_Gqm->qm_usr_dqhtable + \
-				      XFS_DQ_HASHVAL(mp, id)) : \
-				     (xfs_Gqm->qm_grp_dqhtable + \
-				      XFS_DQ_HASHVAL(mp, id)))
-#define XFS_IS_DQTYPE_ON(mp, type)   (type == XFS_DQ_USER ? \
-					XFS_IS_UQUOTA_ON(mp) : \
-					XFS_IS_OQUOTA_ON(mp))
-#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
-	!dqp->q_core.d_blk_hardlimit && \
-	!dqp->q_core.d_blk_softlimit && \
-	!dqp->q_core.d_rtb_hardlimit && \
-	!dqp->q_core.d_rtb_softlimit && \
-	!dqp->q_core.d_ino_hardlimit && \
-	!dqp->q_core.d_ino_softlimit && \
-	!dqp->q_core.d_bcount && \
-	!dqp->q_core.d_rtbcount && \
-	!dqp->q_core.d_icount)
-
-#define HL_PREVP	dq_hashlist.ql_prevp
-#define HL_NEXT		dq_hashlist.ql_next
-#define MPL_PREVP	dq_mplist.ql_prevp
-#define MPL_NEXT	dq_mplist.ql_next
-
-
-#define _LIST_REMOVE(h, dqp, PVP, NXT)				\
-	{							\
-		 xfs_dquot_t *d;				\
-		 if (((d) = (dqp)->NXT))				\
-			 (d)->PVP = (dqp)->PVP;			\
-		 *((dqp)->PVP) = d;				\
-		 (dqp)->NXT = NULL;				\
-		 (dqp)->PVP = NULL;				\
-		 (h)->qh_version++;				\
-		 (h)->qh_nelems--;				\
-	}
-
-#define _LIST_INSERT(h, dqp, PVP, NXT)				\
-	{							\
-		 xfs_dquot_t *d;				\
-		 if (((d) = (h)->qh_next))			\
-			 (d)->PVP = &((dqp)->NXT);		\
-		 (dqp)->NXT = d;				\
-		 (dqp)->PVP = &((h)->qh_next);			\
-		 (h)->qh_next = dqp;				\
-		 (h)->qh_version++;				\
-		 (h)->qh_nelems++;				\
-	 }
-
-#define FOREACH_DQUOT_IN_MP(dqp, mp) \
-	for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
-
-#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist)	\
-for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
-     (dqp) = (dqp)->dq_flnext)
-
-#define XQM_HASHLIST_INSERT(h, dqp)	\
-	 _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
-
-#define XQM_FREELIST_INSERT(h, dqp)	\
-	 xfs_qm_freelist_append(h, dqp)
-
-#define XQM_MPLIST_INSERT(h, dqp)	\
-	 _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
-
-#define XQM_HASHLIST_REMOVE(h, dqp)	\
-	 _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
-#define XQM_FREELIST_REMOVE(dqp)	\
-	 xfs_qm_freelist_unlink(dqp)
-#define XQM_MPLIST_REMOVE(h, dqp)	\
-	{ _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
-	  XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
-
-#define XFS_DQ_IS_LOGITEM_INITD(dqp)	((dqp)->q_logitem.qli_dquot == (dqp))
-
-#define XFS_QM_DQP_TO_DQACCT(tp, dqp)	(XFS_QM_ISUDQ(dqp) ? \
-					 (tp)->t_dqinfo->dqa_usrdquots : \
-					 (tp)->t_dqinfo->dqa_grpdquots)
-#define XFS_IS_SUSER_DQUOT(dqp)		\
-	(!((dqp)->q_core.d_id))
-
-#define XFS_PURGE_INODE(ip)		\
-	IRELE(ip);
-
-#define DQFLAGTO_TYPESTR(d)	(((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
-				 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
-				 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
-#define DQFLAGTO_DIRTYSTR(d)	(XFS_DQ_IS_DIRTY(d) ? "DIRTY" : "NOTDIRTY")
-
-#endif	/* __XFS_QUOTA_PRIV_H__ */
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
deleted file mode 100644
index b08b3d9345b..00000000000
--- a/fs/xfs/support/debug.c
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "debug.h"
-#include "spin.h"
-#include <asm/page.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-
-static char		message[256];	/* keep it off the stack */
-static DEFINE_SPINLOCK(xfs_err_lock);
-
-/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
-#define XFS_MAX_ERR_LEVEL	7
-#define XFS_ERR_MASK		((1 << 3) - 1)
-static const char * const	err_level[XFS_MAX_ERR_LEVEL+1] =
-					{KERN_EMERG, KERN_ALERT, KERN_CRIT,
-					 KERN_ERR, KERN_WARNING, KERN_NOTICE,
-					 KERN_INFO, KERN_DEBUG};
-
-void
-cmn_err(register int level, char *fmt, ...)
-{
-	char	*fp = fmt;
-	int	len;
-	ulong	flags;
-	va_list	ap;
-
-	level &= XFS_ERR_MASK;
-	if (level > XFS_MAX_ERR_LEVEL)
-		level = XFS_MAX_ERR_LEVEL;
-	spin_lock_irqsave(&xfs_err_lock,flags);
-	va_start(ap, fmt);
-	if (*fmt == '!') fp++;
-	len = vsprintf(message, fp, ap);
-	if (message[len-1] != '\n')
-		strcat(message, "\n");
-	printk("%s%s", err_level[level], message);
-	va_end(ap);
-	spin_unlock_irqrestore(&xfs_err_lock,flags);
-
-	if (level == CE_PANIC)
-		BUG();
-}
-
-void
-icmn_err(register int level, char *fmt, va_list ap)
-{
-	ulong	flags;
-	int	len;
-
-	level &= XFS_ERR_MASK;
-	if(level > XFS_MAX_ERR_LEVEL)
-		level = XFS_MAX_ERR_LEVEL;
-	spin_lock_irqsave(&xfs_err_lock,flags);
-	len = vsprintf(message, fmt, ap);
-	if (message[len-1] != '\n')
-		strcat(message, "\n");
-	spin_unlock_irqrestore(&xfs_err_lock,flags);
-	printk("%s%s", err_level[level], message);
-	if (level == CE_PANIC)
-		BUG();
-}
-
-void
-assfail(char *expr, char *file, int line)
-{
-	printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line);
-	BUG();
-}
-
-#if ((defined(DEBUG) || defined(INDUCE_IO_ERRROR)) && !defined(NO_WANT_RANDOM))
-unsigned long random(void)
-{
-	static unsigned long	RandomValue = 1;
-	/* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */
-	register long	rv = RandomValue;
-	register long	lo;
-	register long	hi;
-
-	hi = rv / 127773;
-	lo = rv % 127773;
-	rv = 16807 * lo - 2836 * hi;
-	if (rv <= 0) rv += 2147483647;
-	return RandomValue = rv;
-}
-#endif /* DEBUG || INDUCE_IO_ERRROR || !NO_WANT_RANDOM */
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
deleted file mode 100644
index e3bf58112e7..00000000000
--- a/fs/xfs/support/debug.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef	__XFS_SUPPORT_DEBUG_H__
-#define	__XFS_SUPPORT_DEBUG_H__
-
-#include <stdarg.h>
-
-#define CE_DEBUG        7               /* debug        */
-#define CE_CONT         6               /* continuation */
-#define CE_NOTE         5               /* notice       */
-#define CE_WARN         4               /* warning      */
-#define CE_ALERT        1               /* alert        */
-#define CE_PANIC        0               /* panic        */
-
-extern void icmn_err(int, char *, va_list)
-	__attribute__ ((format (printf, 2, 0)));
-extern void cmn_err(int, char *, ...)
-	__attribute__ ((format (printf, 2, 3)));
-extern void assfail(char *expr, char *f, int l);
-
-#define prdev(fmt,targ,args...) \
-	printk("Device %s - " fmt "\n", XFS_BUFTARG_NAME(targ), ## args)
-
-#define ASSERT_ALWAYS(expr)	\
-	(unlikely((expr) != 0) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
-
-#ifndef DEBUG
-# define ASSERT(expr)	((void)0)
-#else
-# define ASSERT(expr)	ASSERT_ALWAYS(expr)
-extern unsigned long random(void);
-#endif
-
-#ifndef STATIC
-# define STATIC static
-#endif
-
-#endif  /* __XFS_SUPPORT_DEBUG_H__ */
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
deleted file mode 100644
index 841aa4c15b8..00000000000
--- a/fs/xfs/support/ktrace.c
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include <xfs.h>
-
-static kmem_zone_t *ktrace_hdr_zone;
-static kmem_zone_t *ktrace_ent_zone;
-static int          ktrace_zentries;
-
-void
-ktrace_init(int zentries)
-{
-	ktrace_zentries = zentries;
-
-	ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t),
-					"ktrace_hdr");
-	ASSERT(ktrace_hdr_zone);
-
-	ktrace_ent_zone = kmem_zone_init(ktrace_zentries
-					* sizeof(ktrace_entry_t),
-					"ktrace_ent");
-	ASSERT(ktrace_ent_zone);
-}
-
-void
-ktrace_uninit(void)
-{
-	kmem_cache_destroy(ktrace_hdr_zone);
-	kmem_cache_destroy(ktrace_ent_zone);
-}
-
-/*
- * ktrace_alloc()
- *
- * Allocate a ktrace header and enough buffering for the given
- * number of entries.
- */
-ktrace_t *
-ktrace_alloc(int nentries, unsigned int __nocast sleep)
-{
-	ktrace_t        *ktp;
-	ktrace_entry_t  *ktep;
-
-	ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep);
-
-	if (ktp == (ktrace_t*)NULL) {
-		/*
-		 * KM_SLEEP callers don't expect failure.
-		 */
-		if (sleep & KM_SLEEP)
-			panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
-
-		return NULL;
-	}
-
-	/*
-	 * Special treatment for buffers with the ktrace_zentries entries
-	 */
-	if (nentries == ktrace_zentries) {
-		ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone,
-							    sleep);
-	} else {
-		ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)),
-							    sleep);
-	}
-
-	if (ktep == NULL) {
-		/*
-		 * KM_SLEEP callers don't expect failure.
-		 */
-		if (sleep & KM_SLEEP)
-			panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
-
-		kmem_free(ktp, sizeof(*ktp));
-
-		return NULL;
-	}
-
-	spinlock_init(&(ktp->kt_lock), "kt_lock");
-
-	ktp->kt_entries  = ktep;
-	ktp->kt_nentries = nentries;
-	ktp->kt_index    = 0;
-	ktp->kt_rollover = 0;
-	return ktp;
-}
-
-
-/*
- * ktrace_free()
- *
- * Free up the ktrace header and buffer.  It is up to the caller
- * to ensure that no-one is referencing it.
- */
-void
-ktrace_free(ktrace_t *ktp)
-{
-	int     entries_size;
-
-	if (ktp == (ktrace_t *)NULL)
-		return;
-
-	spinlock_destroy(&ktp->kt_lock);
-
-	/*
-	 * Special treatment for the Vnode trace buffer.
-	 */
-	if (ktp->kt_nentries == ktrace_zentries) {
-		kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
-	} else {
-		entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t));
-
-		kmem_free(ktp->kt_entries, entries_size);
-	}
-
-	kmem_zone_free(ktrace_hdr_zone, ktp);
-}
-
-
-/*
- * Enter the given values into the "next" entry in the trace buffer.
- * kt_index is always the index of the next entry to be filled.
- */
-void
-ktrace_enter(
-	ktrace_t        *ktp,
-	void            *val0,
-	void            *val1,
-	void            *val2,
-	void            *val3,
-	void            *val4,
-	void            *val5,
-	void            *val6,
-	void            *val7,
-	void            *val8,
-	void            *val9,
-	void            *val10,
-	void            *val11,
-	void            *val12,
-	void            *val13,
-	void            *val14,
-	void            *val15)
-{
-	static DEFINE_SPINLOCK(wrap_lock);
-	unsigned long	flags;
-	int             index;
-	ktrace_entry_t  *ktep;
-
-	ASSERT(ktp != NULL);
-
-	/*
-	 * Grab an entry by pushing the index up to the next one.
-	 */
-	spin_lock_irqsave(&wrap_lock, flags);
-	index = ktp->kt_index;
-	if (++ktp->kt_index == ktp->kt_nentries)
-		ktp->kt_index = 0;
-	spin_unlock_irqrestore(&wrap_lock, flags);
-
-	if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
-		ktp->kt_rollover = 1;
-
-	ASSERT((index >= 0) && (index < ktp->kt_nentries));
-
-	ktep = &(ktp->kt_entries[index]);
-
-	ktep->val[0]  = val0;
-	ktep->val[1]  = val1;
-	ktep->val[2]  = val2;
-	ktep->val[3]  = val3;
-	ktep->val[4]  = val4;
-	ktep->val[5]  = val5;
-	ktep->val[6]  = val6;
-	ktep->val[7]  = val7;
-	ktep->val[8]  = val8;
-	ktep->val[9]  = val9;
-	ktep->val[10] = val10;
-	ktep->val[11] = val11;
-	ktep->val[12] = val12;
-	ktep->val[13] = val13;
-	ktep->val[14] = val14;
-	ktep->val[15] = val15;
-}
-
-/*
- * Return the number of entries in the trace buffer.
- */
-int
-ktrace_nentries(
-	ktrace_t        *ktp)
-{
-	if (ktp == NULL) {
-		return 0;
-	}
-
-	return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index);
-}
-
-/*
- * ktrace_first()
- *
- * This is used to find the start of the trace buffer.
- * In conjunction with ktrace_next() it can be used to
- * iterate through the entire trace buffer.  This code does
- * not do any locking because it is assumed that it is called
- * from the debugger.
- *
- * The caller must pass in a pointer to a ktrace_snap
- * structure in which we will keep some state used to
- * iterate through the buffer.  This state must not touched
- * by any code outside of this module.
- */
-ktrace_entry_t *
-ktrace_first(ktrace_t   *ktp, ktrace_snap_t     *ktsp)
-{
-	ktrace_entry_t  *ktep;
-	int             index;
-	int             nentries;
-
-	if (ktp->kt_rollover)
-		index = ktp->kt_index;
-	else
-		index = 0;
-
-	ktsp->ks_start = index;
-	ktep = &(ktp->kt_entries[index]);
-
-	nentries = ktrace_nentries(ktp);
-	index++;
-	if (index < nentries) {
-		ktsp->ks_index = index;
-	} else {
-		ktsp->ks_index = 0;
-		if (index > nentries)
-			ktep = NULL;
-	}
-	return ktep;
-}
-
-/*
- * ktrace_next()
- *
- * This is used to iterate through the entries of the given
- * trace buffer.  The caller must pass in the ktrace_snap_t
- * structure initialized by ktrace_first().  The return value
- * will be either a pointer to the next ktrace_entry or NULL
- * if all of the entries have been traversed.
- */
-ktrace_entry_t *
-ktrace_next(
-	ktrace_t        *ktp,
-	ktrace_snap_t   *ktsp)
-{
-	int             index;
-	ktrace_entry_t  *ktep;
-
-	index = ktsp->ks_index;
-	if (index == ktsp->ks_start) {
-		ktep = NULL;
-	} else {
-		ktep = &ktp->kt_entries[index];
-	}
-
-	index++;
-	if (index == ktrace_nentries(ktp)) {
-		ktsp->ks_index = 0;
-	} else {
-		ktsp->ks_index = index;
-	}
-
-	return ktep;
-}
-
-/*
- * ktrace_skip()
- *
- * Skip the next "count" entries and return the entry after that.
- * Return NULL if this causes us to iterate past the beginning again.
- */
-ktrace_entry_t *
-ktrace_skip(
-	ktrace_t        *ktp,
-	int             count,
-	ktrace_snap_t   *ktsp)
-{
-	int             index;
-	int             new_index;
-	ktrace_entry_t  *ktep;
-	int             nentries = ktrace_nentries(ktp);
-
-	index = ktsp->ks_index;
-	new_index = index + count;
-	while (new_index >= nentries) {
-		new_index -= nentries;
-	}
-	if (index == ktsp->ks_start) {
-		/*
-		 * We've iterated around to the start, so we're done.
-		 */
-		ktep = NULL;
-	} else if ((new_index < index) && (index < ktsp->ks_index)) {
-		/*
-		 * We've skipped past the start again, so we're done.
-		 */
-		ktep = NULL;
-		ktsp->ks_index = ktsp->ks_start;
-	} else {
-		ktep = &(ktp->kt_entries[new_index]);
-		new_index++;
-		if (new_index == nentries) {
-			ktsp->ks_index = 0;
-		} else {
-			ktsp->ks_index = new_index;
-		}
-	}
-	return ktep;
-}
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
deleted file mode 100644
index 0d73216287c..00000000000
--- a/fs/xfs/support/ktrace.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_KTRACE_H__
-#define __XFS_SUPPORT_KTRACE_H__
-
-#include <spin.h>
-
-/*
- * Trace buffer entry structure.
- */
-typedef struct ktrace_entry {
-	void	*val[16];
-} ktrace_entry_t;
-
-/*
- * Trace buffer header structure.
- */
-typedef struct ktrace {
-	lock_t		kt_lock;	/* mutex to guard counters */
-	int		kt_nentries;	/* number of entries in trace buf */
-	int		kt_index;	/* current index in entries */
-	int		kt_rollover;
-	ktrace_entry_t	*kt_entries;	/* buffer of entries */
-} ktrace_t;
-
-/*
- * Trace buffer snapshot structure.
- */
-typedef struct ktrace_snap {
-	int		ks_start;	/* kt_index at time of snap */
-	int		ks_index;	/* current index */
-} ktrace_snap_t;
-
-
-#ifdef CONFIG_XFS_TRACE
-
-extern void ktrace_init(int zentries);
-extern void ktrace_uninit(void);
-
-extern ktrace_t *ktrace_alloc(int, unsigned int __nocast);
-extern void ktrace_free(ktrace_t *);
-
-extern void ktrace_enter(
-	ktrace_t	*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*,
-	void		*);
-
-extern ktrace_entry_t   *ktrace_first(ktrace_t *, ktrace_snap_t *);
-extern int              ktrace_nentries(ktrace_t *);
-extern ktrace_entry_t   *ktrace_next(ktrace_t *, ktrace_snap_t *);
-extern ktrace_entry_t   *ktrace_skip(ktrace_t *, int, ktrace_snap_t *);
-
-#else
-#define ktrace_init(x)	do { } while (0)
-#define ktrace_uninit()	do { } while (0)
-#endif	/* CONFIG_XFS_TRACE */
-
-#endif	/* __XFS_SUPPORT_KTRACE_H__ */
diff --git a/fs/xfs/support/move.c b/fs/xfs/support/move.c
deleted file mode 100644
index caefa17b80f..00000000000
--- a/fs/xfs/support/move.c
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include <xfs.h>
-
-/* Read from kernel buffer at src to user/kernel buffer defined
- * by the uio structure. Advance the pointer in the uio struct
- * as we go.
- */
-int
-uio_read(caddr_t src, size_t len, struct uio *uio)
-{
-	size_t	count;
-
-	if (!len || !uio->uio_resid)
-		return 0;
-
-	count = uio->uio_iov->iov_len;
-	if (!count)
-		return 0;
-	if (count > len)
-		count = len;
-
-	if (uio->uio_segflg == UIO_USERSPACE) {
-		if (copy_to_user(uio->uio_iov->iov_base, src, count))
-			return EFAULT;
-	} else {
-		ASSERT(uio->uio_segflg == UIO_SYSSPACE);
-		memcpy(uio->uio_iov->iov_base, src, count);
-	}
-
-	uio->uio_iov->iov_base = (void*)((char*)uio->uio_iov->iov_base + count);
-	uio->uio_iov->iov_len -= count;
-	uio->uio_offset += count;
-	uio->uio_resid -= count;
-	return 0;
-}
diff --git a/fs/xfs/support/move.h b/fs/xfs/support/move.h
deleted file mode 100644
index 97a2498d2da..00000000000
--- a/fs/xfs/support/move.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Portions Copyright (c) 1982, 1986, 1993, 1994
- *	The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-#ifndef __XFS_SUPPORT_MOVE_H__
-#define __XFS_SUPPORT_MOVE_H__
-
-#include <linux/uio.h>
-#include <asm/uaccess.h>
-
-/* Segment flag values. */
-enum uio_seg {
-	UIO_USERSPACE,          /* from user data space */
-	UIO_SYSSPACE,           /* from system space */
-};
-
-struct uio {
-	struct iovec	*uio_iov;   /* pointer to array of iovecs */
-	int		uio_iovcnt; /* number of iovecs in array */
-	xfs_off_t	uio_offset; /* offset in file this uio corresponds to */
-	int		uio_resid;  /* residual i/o count */
-	enum uio_seg	uio_segflg; /* see above */
-};
-
-typedef struct uio uio_t;
-typedef struct iovec iovec_t;
-
-extern int	uio_read (caddr_t, size_t, uio_t *);
-
-#endif  /* __XFS_SUPPORT_MOVE_H__ */
diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/time.h
index 387e695a184..387e695a184 100644
--- a/fs/xfs/linux-2.6/time.h
+++ b/fs/xfs/time.h
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/uuid.c
index a3d565a6773..b83f76b6d41 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/uuid.c
@@ -17,17 +17,6 @@
  */
 #include <xfs.h>
 
-static mutex_t	uuid_monitor;
-static int	uuid_table_size;
-static uuid_t	*uuid_table;
-
-void
-uuid_init(void)
-{
-	mutex_init(&uuid_monitor);
-}
-
-
 /* IRIX interpretation of an uuid_t */
 typedef struct {
 	__be32	uu_timelow;
@@ -50,13 +39,7 @@ uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
 
 	fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) |
 		   be16_to_cpu(uup->uu_timemid);
-	fsid[1] = be16_to_cpu(uup->uu_timelow);
-}
-
-void
-uuid_create_nil(uuid_t *uuid)
-{
-	memset(uuid, 0, sizeof(*uuid));
+	fsid[1] = be32_to_cpu(uup->uu_timelow);
 }
 
 int
@@ -78,64 +61,3 @@ uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
 {
 	return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
 }
-
-/*
- * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom
- * 64-bit words.  NOTE: This function can not be changed EVER.  Although
- * brain-dead, some applications depend on this 64-bit value remaining
- * persistent.  Specifically, DMI vendors store the value as a persistent
- * filehandle.
- */
-__uint64_t
-uuid_hash64(uuid_t *uuid)
-{
-	__uint64_t	*sp = (__uint64_t *)uuid;
-
-	return sp[0] + sp[1];
-}
-
-int
-uuid_table_insert(uuid_t *uuid)
-{
-	int	i, hole;
-
-	mutex_lock(&uuid_monitor);
-	for (i = 0, hole = -1; i < uuid_table_size; i++) {
-		if (uuid_is_nil(&uuid_table[i])) {
-			hole = i;
-			continue;
-		}
-		if (uuid_equal(uuid, &uuid_table[i])) {
-			mutex_unlock(&uuid_monitor);
-			return 0;
-		}
-	}
-	if (hole < 0) {
-		uuid_table = kmem_realloc(uuid_table,
-			(uuid_table_size + 1) * sizeof(*uuid_table),
-			uuid_table_size  * sizeof(*uuid_table),
-			KM_SLEEP);
-		hole = uuid_table_size++;
-	}
-	uuid_table[hole] = *uuid;
-	mutex_unlock(&uuid_monitor);
-	return 1;
-}
-
-void
-uuid_table_remove(uuid_t *uuid)
-{
-	int	i;
-
-	mutex_lock(&uuid_monitor);
-	for (i = 0; i < uuid_table_size; i++) {
-		if (uuid_is_nil(&uuid_table[i]))
-			continue;
-		if (!uuid_equal(uuid, &uuid_table[i]))
-			continue;
-		uuid_create_nil(&uuid_table[i]);
-		break;
-	}
-	ASSERT(i < uuid_table_size);
-	mutex_unlock(&uuid_monitor);
-}
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/uuid.h
index b6f5922199b..104db0f3bed 100644
--- a/fs/xfs/support/uuid.h
+++ b/fs/xfs/uuid.h
@@ -22,13 +22,14 @@ typedef struct {
 	unsigned char	__u_bits[16];
 } uuid_t;
 
-extern void uuid_init(void);
-extern void uuid_create_nil(uuid_t *uuid);
 extern int uuid_is_nil(uuid_t *uuid);
 extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
 extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
-extern __uint64_t uuid_hash64(uuid_t *uuid);
-extern int uuid_table_insert(uuid_t *uuid);
-extern void uuid_table_remove(uuid_t *uuid);
+
+static inline void
+uuid_copy(uuid_t *dst, uuid_t *src)
+{
+	memcpy(dst, src, sizeof(uuid_t));
+}
 
 #endif	/* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 1a48dbb902a..a742c47f7d5 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -17,5 +17,18 @@
  */
 #ifndef __XFS_H__
 #define __XFS_H__
-#include <linux-2.6/xfs_linux.h>
+
+#ifdef CONFIG_XFS_DEBUG
+#define STATIC
+#define DEBUG 1
+#define XFS_BUF_LOCK_TRACKING 1
+#endif
+
+#ifdef CONFIG_XFS_WARN
+#define XFS_WARN 1
+#endif
+
+
+#include "xfs_linux.h"
+
 #endif	/* __XFS_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 4ff0f4e41c6..6888ad886ff 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2008, Christoph Hellwig
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -16,912 +16,291 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_inum.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
 #include "xfs_inode.h"
-#include "xfs_btree.h"
 #include "xfs_acl.h"
-#include "xfs_mac.h"
 #include "xfs_attr.h"
-
-#include <linux/capability.h>
+#include "xfs_trace.h"
+#include <linux/slab.h>
+#include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
 
-STATIC int	xfs_acl_setmode(vnode_t *, xfs_acl_t *, int *);
-STATIC void     xfs_acl_filter_mode(mode_t, xfs_acl_t *);
-STATIC void	xfs_acl_get_endian(xfs_acl_t *);
-STATIC int	xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
-STATIC int	xfs_acl_invalid(xfs_acl_t *);
-STATIC void	xfs_acl_sync_mode(mode_t, xfs_acl_t *);
-STATIC void	xfs_acl_get_attr(vnode_t *, xfs_acl_t *, int, int, int *);
-STATIC void	xfs_acl_set_attr(vnode_t *, xfs_acl_t *, int, int *);
-STATIC int	xfs_acl_allow_set(vnode_t *, int);
-
-kmem_zone_t *xfs_acl_zone;
-
-
-/*
- * Test for existence of access ACL attribute as efficiently as possible.
- */
-int
-xfs_acl_vhasacl_access(
-	vnode_t		*vp)
-{
-	int		error;
-
-	xfs_acl_get_attr(vp, NULL, _ACL_TYPE_ACCESS, ATTR_KERNOVAL, &error);
-	return (error == 0);
-}
 
 /*
- * Test for existence of default ACL attribute as efficiently as possible.
+ * Locking scheme:
+ *  - all ACL updates are protected by inode->i_mutex, which is taken before
+ *    calling into this file.
  */
-int
-xfs_acl_vhasacl_default(
-	vnode_t		*vp)
-{
-	int		error;
 
-	if (!VN_ISDIR(vp))
-		return 0;
-	xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error);
-	return (error == 0);
-}
-
-/*
- * Convert from extended attribute representation to in-memory for XFS.
- */
-STATIC int
-posix_acl_xattr_to_xfs(
-	posix_acl_xattr_header	*src,
-	size_t			size,
-	xfs_acl_t		*dest)
+STATIC struct posix_acl *
+xfs_acl_from_disk(
+	struct xfs_acl	*aclp,
+	int		max_entries)
 {
-	posix_acl_xattr_entry	*src_entry;
-	xfs_acl_entry_t		*dest_entry;
-	int			n;
-
-	if (!src || !dest)
-		return EINVAL;
-
-	if (size < sizeof(posix_acl_xattr_header))
-		return EINVAL;
+	struct posix_acl_entry *acl_e;
+	struct posix_acl *acl;
+	struct xfs_acl_entry *ace;
+	unsigned int count, i;
 
-	if (src->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
-		return EOPNOTSUPP;
+	count = be32_to_cpu(aclp->acl_cnt);
+	if (count > max_entries)
+		return ERR_PTR(-EFSCORRUPTED);
 
-	memset(dest, 0, sizeof(xfs_acl_t));
-	dest->acl_cnt = posix_acl_xattr_count(size);
-	if (dest->acl_cnt < 0 || dest->acl_cnt > XFS_ACL_MAX_ENTRIES)
-		return EINVAL;
+	acl = posix_acl_alloc(count, GFP_KERNEL);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
 
-	/*
-	 * acl_set_file(3) may request that we set default ACLs with
-	 * zero length -- defend (gracefully) against that here.
-	 */
-	if (!dest->acl_cnt)
-		return 0;
+	for (i = 0; i < count; i++) {
+		acl_e = &acl->a_entries[i];
+		ace = &aclp->acl_entry[i];
 
-	src_entry = (posix_acl_xattr_entry *)((char *)src + sizeof(*src));
-	dest_entry = &dest->acl_entry[0];
+		/*
+		 * The tag is 32 bits on disk and 16 bits in core.
+		 *
+		 * Because every access to it goes through the core
+		 * format first this is not a problem.
+		 */
+		acl_e->e_tag = be32_to_cpu(ace->ae_tag);
+		acl_e->e_perm = be16_to_cpu(ace->ae_perm);
 
-	for (n = 0; n < dest->acl_cnt; n++, src_entry++, dest_entry++) {
-		dest_entry->ae_perm = le16_to_cpu(src_entry->e_perm);
-		if (_ACL_PERM_INVALID(dest_entry->ae_perm))
-			return EINVAL;
-		dest_entry->ae_tag  = le16_to_cpu(src_entry->e_tag);
-		switch(dest_entry->ae_tag) {
+		switch (acl_e->e_tag) {
 		case ACL_USER:
+			acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
+			break;
 		case ACL_GROUP:
-			dest_entry->ae_id = le32_to_cpu(src_entry->e_id);
+			acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));
 			break;
 		case ACL_USER_OBJ:
 		case ACL_GROUP_OBJ:
 		case ACL_MASK:
 		case ACL_OTHER:
-			dest_entry->ae_id = ACL_UNDEFINED_ID;
 			break;
 		default:
-			return EINVAL;
+			goto fail;
 		}
 	}
-	if (xfs_acl_invalid(dest))
-		return EINVAL;
+	return acl;
 
-	return 0;
+fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
 }
 
-/*
- * Comparison function called from xfs_sort().
- * Primary key is ae_tag, secondary key is ae_id.
- */
-STATIC int
-xfs_acl_entry_compare(
-	const void	*va,
-	const void	*vb)
+STATIC void
+xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
 {
-	xfs_acl_entry_t	*a = (xfs_acl_entry_t *)va,
-			*b = (xfs_acl_entry_t *)vb;
+	const struct posix_acl_entry *acl_e;
+	struct xfs_acl_entry *ace;
+	int i;
 
-	if (a->ae_tag == b->ae_tag)
-		return (a->ae_id - b->ae_id);
-	return (a->ae_tag - b->ae_tag);
-}
+	aclp->acl_cnt = cpu_to_be32(acl->a_count);
+	for (i = 0; i < acl->a_count; i++) {
+		ace = &aclp->acl_entry[i];
+		acl_e = &acl->a_entries[i];
 
-/*
- * Convert from in-memory XFS to extended attribute representation.
- */
-STATIC int
-posix_acl_xfs_to_xattr(
-	xfs_acl_t		*src,
-	posix_acl_xattr_header	*dest,
-	size_t			size)
-{
-	int			n;
-	size_t			new_size = posix_acl_xattr_size(src->acl_cnt);
-	posix_acl_xattr_entry	*dest_entry;
-	xfs_acl_entry_t		*src_entry;
-
-	if (size < new_size)
-		return -ERANGE;
-
-	/* Need to sort src XFS ACL by <ae_tag,ae_id> */
-	xfs_sort(src->acl_entry, src->acl_cnt, sizeof(src->acl_entry[0]),
-		 xfs_acl_entry_compare);
-
-	dest->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
-	dest_entry = &dest->a_entries[0];
-	src_entry = &src->acl_entry[0];
-	for (n = 0; n < src->acl_cnt; n++, dest_entry++, src_entry++) {
-		dest_entry->e_perm = cpu_to_le16(src_entry->ae_perm);
-		if (_ACL_PERM_INVALID(src_entry->ae_perm))
-			return -EINVAL;
-		dest_entry->e_tag  = cpu_to_le16(src_entry->ae_tag);
-		switch (src_entry->ae_tag) {
+		ace->ae_tag = cpu_to_be32(acl_e->e_tag);
+		switch (acl_e->e_tag) {
 		case ACL_USER:
+			ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid));
+			break;
 		case ACL_GROUP:
-			dest_entry->e_id = cpu_to_le32(src_entry->ae_id);
-				break;
-		case ACL_USER_OBJ:
-		case ACL_GROUP_OBJ:
-		case ACL_MASK:
-		case ACL_OTHER:
-			dest_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+			ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid));
 			break;
 		default:
-			return -EINVAL;
-		}
-	}
-	return new_size;
-}
-
-int
-xfs_acl_vget(
-	vnode_t		*vp,
-	void		*acl,
-	size_t		size,
-	int		kind)
-{
-	int			error;
-	xfs_acl_t		*xfs_acl = NULL;
-	posix_acl_xattr_header	*ext_acl = acl;
-	int			flags = 0;
-
-	VN_HOLD(vp);
-	if(size) {
-		if (!(_ACL_ALLOC(xfs_acl))) {
-			error = ENOMEM;
-			goto out;
+			ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
+			break;
 		}
-		memset(xfs_acl, 0, sizeof(xfs_acl_t));
-	} else
-		flags = ATTR_KERNOVAL;
 
-	xfs_acl_get_attr(vp, xfs_acl, kind, flags, &error);
-	if (error)
-		goto out;
-
-	if (!size) {
-		error = -posix_acl_xattr_size(XFS_ACL_MAX_ENTRIES);
-	} else {
-		if (xfs_acl_invalid(xfs_acl)) {
-			error = EINVAL;
-			goto out;
-		}
-		if (kind == _ACL_TYPE_ACCESS) {
-			vattr_t	va;
-
-			va.va_mask = XFS_AT_MODE;
-			VOP_GETATTR(vp, &va, 0, sys_cred, error);
-			if (error)
-				goto out;
-			xfs_acl_sync_mode(va.va_mode, xfs_acl);
-		}
-		error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
+		ace->ae_perm = cpu_to_be16(acl_e->e_perm);
 	}
-out:
-	VN_RELE(vp);
-	if(xfs_acl)
-		_ACL_FREE(xfs_acl);
-	return -error;
 }
 
-int
-xfs_acl_vremove(
-	vnode_t		*vp,
-	int		kind)
+struct posix_acl *
+xfs_get_acl(struct inode *inode, int type)
 {
-	int		error;
-
-	VN_HOLD(vp);
-	error = xfs_acl_allow_set(vp, kind);
-	if (!error) {
-		VOP_ATTR_REMOVE(vp, kind == _ACL_TYPE_DEFAULT?
-				SGI_ACL_DEFAULT: SGI_ACL_FILE,
-				ATTR_ROOT, sys_cred, error);
-		if (error == ENOATTR)
-			error = 0;	/* 'scool */
+	struct xfs_inode *ip = XFS_I(inode);
+	struct posix_acl *acl = NULL;
+	struct xfs_acl *xfs_acl;
+	unsigned char *ea_name;
+	int error;
+	int len;
+
+	trace_xfs_get_acl(ip);
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		ea_name = SGI_ACL_FILE;
+		break;
+	case ACL_TYPE_DEFAULT:
+		ea_name = SGI_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
 	}
-	VN_RELE(vp);
-	return -error;
-}
 
-int
-xfs_acl_vset(
-	vnode_t			*vp,
-	void			*acl,
-	size_t			size,
-	int			kind)
-{
-	posix_acl_xattr_header	*ext_acl = acl;
-	xfs_acl_t		*xfs_acl;
-	int			error;
-	int			basicperms = 0; /* more than std unix perms? */
-
-	if (!acl)
-		return -EINVAL;
-
-	if (!(_ACL_ALLOC(xfs_acl)))
-		return -ENOMEM;
+	/*
+	 * If we have a cached ACLs value just return it, not need to
+	 * go out to the disk.
+	 */
+	len = XFS_ACL_MAX_SIZE(ip->i_mount);
+	xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
+	if (!xfs_acl)
+		return ERR_PTR(-ENOMEM);
 
-	error = posix_acl_xattr_to_xfs(ext_acl, size, xfs_acl);
+	error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
+							&len, ATTR_ROOT);
 	if (error) {
-		_ACL_FREE(xfs_acl);
-		return -error;
-	}
-	if (!xfs_acl->acl_cnt) {
-		_ACL_FREE(xfs_acl);
-		return 0;
+		/*
+		 * If the attribute doesn't exist make sure we have a negative
+		 * cache entry, for any other error assume it is transient and
+		 * leave the cache entry as ACL_NOT_CACHED.
+		 */
+		if (error == -ENOATTR)
+			goto out_update_cache;
+		goto out;
 	}
 
-	VN_HOLD(vp);
-	error = xfs_acl_allow_set(vp, kind);
-	if (error)
+	acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
+	if (IS_ERR(acl))
 		goto out;
 
-	/* Incoming ACL exists, set file mode based on its value */
-	if (kind == _ACL_TYPE_ACCESS)
-		xfs_acl_setmode(vp, xfs_acl, &basicperms);
-
-	/*
-	 * If we have more than std unix permissions, set up the actual attr.
-	 * Otherwise, delete any existing attr.  This prevents us from
-	 * having actual attrs for permissions that can be stored in the
-	 * standard permission bits.
-	 */
-	if (!basicperms) {
-		xfs_acl_set_attr(vp, xfs_acl, kind, &error);
-	} else {
-		xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
-	}
-
+out_update_cache:
+	set_cached_acl(inode, type, acl);
 out:
-	VN_RELE(vp);
-	_ACL_FREE(xfs_acl);
-	return -error;
+	kmem_free(xfs_acl);
+	return acl;
 }
 
-int
-xfs_acl_iaccess(
-	xfs_inode_t	*ip,
-	mode_t		mode,
-	cred_t		*cr)
+STATIC int
+__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 {
-	xfs_acl_t	*acl;
-	int		rval;
-
-	if (!(_ACL_ALLOC(acl)))
-		return -1;
-
-	/* If the file has no ACL return -1. */
-	rval = sizeof(xfs_acl_t);
-	if (xfs_attr_fetch(ip, SGI_ACL_FILE, SGI_ACL_FILE_SIZE,
-			(char *)acl, &rval, ATTR_ROOT | ATTR_KERNACCESS, cr)) {
-		_ACL_FREE(acl);
-		return -1;
-	}
-	xfs_acl_get_endian(acl);
+	struct xfs_inode *ip = XFS_I(inode);
+	unsigned char *ea_name;
+	int error;
 
-	/* If the file has an empty ACL return -1. */
-	if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) {
-		_ACL_FREE(acl);
-		return -1;
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		ea_name = SGI_ACL_FILE;
+		break;
+	case ACL_TYPE_DEFAULT:
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		ea_name = SGI_ACL_DEFAULT;
+		break;
+	default:
+		return -EINVAL;
 	}
 
-	/* Synchronize ACL with mode bits */
-	xfs_acl_sync_mode(ip->i_d.di_mode, acl);
+	if (acl) {
+		struct xfs_acl *xfs_acl;
+		int len = XFS_ACL_MAX_SIZE(ip->i_mount);
 
-	rval = xfs_acl_access(ip->i_d.di_uid, ip->i_d.di_gid, acl, mode, cr);
-	_ACL_FREE(acl);
-	return rval;
-}
+		xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
+		if (!xfs_acl)
+			return -ENOMEM;
 
-STATIC int
-xfs_acl_allow_set(
-	vnode_t		*vp,
-	int		kind)
-{
-	vattr_t		va;
-	int		error;
-
-	if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
-		return EPERM;
-	if (kind == _ACL_TYPE_DEFAULT && !VN_ISDIR(vp))
-		return ENOTDIR;
-	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
-		return EROFS;
-	va.va_mask = XFS_AT_UID;
-	VOP_GETATTR(vp, &va, 0, NULL, error);
-	if (error)
-		return error;
-	if (va.va_uid != current->fsuid && !capable(CAP_FOWNER))
-		return EPERM;
-	return error;
-}
-
-/*
- * The access control process to determine the access permission:
- *	if uid == file owner id, use the file owner bits.
- *	if gid == file owner group id, use the file group bits.
- *	scan ACL for a maching user or group, and use matched entry
- *	permission. Use total permissions of all matching group entries,
- *	until all acl entries are exhausted. The final permission produced
- *	by matching acl entry or entries needs to be & with group permission.
- *	if not owner, owning group, or matching entry in ACL, use file
- *	other bits.  
- */
-STATIC int
-xfs_acl_capability_check(
-	mode_t		mode,
-	cred_t		*cr)
-{
-	if ((mode & ACL_READ) && !capable_cred(cr, CAP_DAC_READ_SEARCH))
-		return EACCES;
-	if ((mode & ACL_WRITE) && !capable_cred(cr, CAP_DAC_OVERRIDE))
-		return EACCES;
-	if ((mode & ACL_EXECUTE) && !capable_cred(cr, CAP_DAC_OVERRIDE))
-		return EACCES;
-
-	return 0;
-}
+		xfs_acl_to_disk(xfs_acl, acl);
 
-/*
- * Note: cr is only used here for the capability check if the ACL test fails.
- *       It is not used to find out the credentials uid or groups etc, as was
- *       done in IRIX. It is assumed that the uid and groups for the current
- *       thread are taken from "current" instead of the cr parameter.
- */
-STATIC int
-xfs_acl_access(
-	uid_t		fuid,
-	gid_t		fgid,
-	xfs_acl_t	*fap,
-	mode_t		md,
-	cred_t		*cr)
-{
-	xfs_acl_entry_t	matched;
-	int		i, allows;
-	int		maskallows = -1;	/* true, but not 1, either */
-	int		seen_userobj = 0;
+		/* subtract away the unused acl entries */
+		len -= sizeof(struct xfs_acl_entry) *
+			 (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
 
-	matched.ae_tag = 0;	/* Invalid type */
-	matched.ae_perm = 0;
-	md >>= 6;	/* Normalize the bits for comparison */
+		error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
+				len, ATTR_ROOT);
 
-	for (i = 0; i < fap->acl_cnt; i++) {
+		kmem_free(xfs_acl);
+	} else {
 		/*
-		 * Break out if we've got a user_obj entry or
-		 * a user entry and the mask (and have processed USER_OBJ)
+		 * A NULL ACL argument means we want to remove the ACL.
 		 */
-		if (matched.ae_tag == ACL_USER_OBJ)
-			break;
-		if (matched.ae_tag == ACL_USER) {
-			if (maskallows != -1 && seen_userobj)
-				break;
-			if (fap->acl_entry[i].ae_tag != ACL_MASK &&
-			    fap->acl_entry[i].ae_tag != ACL_USER_OBJ)
-				continue;
-		}
-		/* True if this entry allows the requested access */
-		allows = ((fap->acl_entry[i].ae_perm & md) == md);
+		error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT);
 
-		switch (fap->acl_entry[i].ae_tag) {
-		case ACL_USER_OBJ:
-			seen_userobj = 1;
-			if (fuid != current->fsuid)
-				continue;
-			matched.ae_tag = ACL_USER_OBJ;
-			matched.ae_perm = allows;
-			break;
-		case ACL_USER:
-			if (fap->acl_entry[i].ae_id != current->fsuid)
-				continue;
-			matched.ae_tag = ACL_USER;
-			matched.ae_perm = allows;
-			break;
-		case ACL_GROUP_OBJ:
-			if ((matched.ae_tag == ACL_GROUP_OBJ ||
-			    matched.ae_tag == ACL_GROUP) && !allows)
-				continue;
-			if (!in_group_p(fgid))
-				continue;
-			matched.ae_tag = ACL_GROUP_OBJ;
-			matched.ae_perm = allows;
-			break;
-		case ACL_GROUP:
-			if ((matched.ae_tag == ACL_GROUP_OBJ ||
-			    matched.ae_tag == ACL_GROUP) && !allows)
-				continue;
-			if (!in_group_p(fap->acl_entry[i].ae_id))
-				continue;
-			matched.ae_tag = ACL_GROUP;
-			matched.ae_perm = allows;
-			break;
-		case ACL_MASK:
-			maskallows = allows;
-			break;
-		case ACL_OTHER:
-			if (matched.ae_tag != 0)
-				continue;
-			matched.ae_tag = ACL_OTHER;
-			matched.ae_perm = allows;
-			break;
-		}
-	}
-	/*
-	 * First possibility is that no matched entry allows access.
-	 * The capability to override DAC may exist, so check for it.
-	 */
-	switch (matched.ae_tag) {
-	case ACL_OTHER:
-	case ACL_USER_OBJ:
-		if (matched.ae_perm)
-			return 0;
-		break;
-	case ACL_USER:
-	case ACL_GROUP_OBJ:
-	case ACL_GROUP:
-		if (maskallows && matched.ae_perm)
-			return 0;
-		break;
-	case 0:
-		break;
+		/*
+		 * If the attribute didn't exist to start with that's fine.
+		 */
+		if (error == -ENOATTR)
+			error = 0;
 	}
 
-	return xfs_acl_capability_check(md, cr);
+	if (!error)
+		set_cached_acl(inode, type, acl);
+	return error;
 }
 
-/*
- * ACL validity checker.
- *   This acl validation routine checks each ACL entry read in makes sense.
- */
-STATIC int
-xfs_acl_invalid(
-	xfs_acl_t	*aclp)
+static int
+xfs_set_mode(struct inode *inode, umode_t mode)
 {
-	xfs_acl_entry_t	*entry, *e;
-	int		user = 0, group = 0, other = 0, mask = 0;
-	int		mask_required = 0;
-	int		i, j;
+	int error = 0;
 
-	if (!aclp)
-		goto acl_invalid;
+	if (mode != inode->i_mode) {
+		struct iattr iattr;
 
-	if (aclp->acl_cnt > XFS_ACL_MAX_ENTRIES)
-		goto acl_invalid;
-
-	for (i = 0; i < aclp->acl_cnt; i++) {
-		entry = &aclp->acl_entry[i];
-		switch (entry->ae_tag) {
-		case ACL_USER_OBJ:
-			if (user++)
-				goto acl_invalid;
-			break;
-		case ACL_GROUP_OBJ:
-			if (group++)
-				goto acl_invalid;
-			break;
-		case ACL_OTHER:
-			if (other++)
-				goto acl_invalid;
-			break;
-		case ACL_USER:
-		case ACL_GROUP:
-			for (j = i + 1; j < aclp->acl_cnt; j++) {
-				e = &aclp->acl_entry[j];
-				if (e->ae_id == entry->ae_id &&
-				    e->ae_tag == entry->ae_tag)
-					goto acl_invalid;
-			}
-			mask_required++;
-			break;
-		case ACL_MASK:
-			if (mask++)
-				goto acl_invalid;
-			break;
-		default:
-			goto acl_invalid;
-		}
-	}
-	if (!user || !group || !other || (mask_required && !mask))
-		goto acl_invalid;
-	else
-		return 0;
-acl_invalid:
-	return EINVAL;
-}
+		iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
+		iattr.ia_mode = mode;
+		iattr.ia_ctime = current_fs_time(inode->i_sb);
 
-/*
- * Do ACL endian conversion.
- */
-STATIC void
-xfs_acl_get_endian(
-	xfs_acl_t	*aclp)
-{
-	xfs_acl_entry_t	*ace, *end;
-
-	INT_SET(aclp->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
-	end = &aclp->acl_entry[0]+aclp->acl_cnt;
-	for (ace = &aclp->acl_entry[0]; ace < end; ace++) {
-		INT_SET(ace->ae_tag, ARCH_CONVERT, ace->ae_tag);
-		INT_SET(ace->ae_id, ARCH_CONVERT, ace->ae_id);
-		INT_SET(ace->ae_perm, ARCH_CONVERT, ace->ae_perm);
+		error = -xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
 	}
-}
 
-/*
- * Get the ACL from the EA and do endian conversion.
- */
-STATIC void
-xfs_acl_get_attr(
-	vnode_t		*vp,
-	xfs_acl_t	*aclp,
-	int		kind,
-	int		flags,
-	int		*error)
-{
-	int		len = sizeof(xfs_acl_t);
-
-	ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
-	flags |= ATTR_ROOT;
-	VOP_ATTR_GET(vp,
-		kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT,
-		(char *)aclp, &len, flags, sys_cred, *error);
-	if (*error || (flags & ATTR_KERNOVAL))
-		return;
-	xfs_acl_get_endian(aclp);
+	return error;
 }
 
-/*
- * Set the EA with the ACL and do endian conversion.
- */
-STATIC void
-xfs_acl_set_attr(
-	vnode_t		*vp,
-	xfs_acl_t	*aclp,
-	int		kind,
-	int		*error)
+static int
+xfs_acl_exists(struct inode *inode, unsigned char *name)
 {
-	xfs_acl_entry_t	*ace, *newace, *end;
-	xfs_acl_t	*newacl;
-	int		len;
+	int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
 
-	if (!(_ACL_ALLOC(newacl))) {
-		*error = ENOMEM;
-		return;
-	}
-
-	len = sizeof(xfs_acl_t) -
-	      (sizeof(xfs_acl_entry_t) * (XFS_ACL_MAX_ENTRIES - aclp->acl_cnt));
-	end = &aclp->acl_entry[0]+aclp->acl_cnt;
-	for (ace = &aclp->acl_entry[0], newace = &newacl->acl_entry[0];
-	     ace < end;
-	     ace++, newace++) {
-		INT_SET(newace->ae_tag, ARCH_CONVERT, ace->ae_tag);
-		INT_SET(newace->ae_id, ARCH_CONVERT, ace->ae_id);
-		INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
-	}
-	INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
-	VOP_ATTR_SET(vp,
-		kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE: SGI_ACL_DEFAULT,
-		(char *)newacl, len, ATTR_ROOT, sys_cred, *error);
-	_ACL_FREE(newacl);
+	return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
+			    ATTR_ROOT|ATTR_KERNOVAL) == 0);
 }
 
 int
-xfs_acl_vtoacl(
-	vnode_t		*vp,
-	xfs_acl_t	*access_acl,
-	xfs_acl_t	*default_acl)
+posix_acl_access_exists(struct inode *inode)
 {
-	vattr_t		va;
-	int		error = 0;
-
-	if (access_acl) {
-		/*
-		 * Get the Access ACL and the mode.  If either cannot
-		 * be obtained for some reason, invalidate the access ACL.
-		 */
-		xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error);
-		if (!error) {
-			/* Got the ACL, need the mode... */
-			va.va_mask = XFS_AT_MODE;
-			VOP_GETATTR(vp, &va, 0, sys_cred, error);
-		}
-
-		if (error)
-			access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
-		else /* We have a good ACL and the file mode, synchronize. */
-			xfs_acl_sync_mode(va.va_mode, access_acl);
-	}
-
-	if (default_acl) {
-		xfs_acl_get_attr(vp, default_acl, _ACL_TYPE_DEFAULT, 0, &error);
-		if (error)
-			default_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
-	}
-	return error;
+	return xfs_acl_exists(inode, SGI_ACL_FILE);
 }
 
-/*
- * This function retrieves the parent directory's acl, processes it
- * and lets the child inherit the acl(s) that it should.
- */
 int
-xfs_acl_inherit(
-	vnode_t		*vp,
-	vattr_t		*vap,
-	xfs_acl_t	*pdaclp)
+posix_acl_default_exists(struct inode *inode)
 {
-	xfs_acl_t	*cacl;
-	int		error = 0;
-	int		basicperms = 0;
-
-	/*
-	 * If the parent does not have a default ACL, or it's an
-	 * invalid ACL, we're done.
-	 */
-	if (!vp)
-		return 0;
-	if (!pdaclp || xfs_acl_invalid(pdaclp))
+	if (!S_ISDIR(inode->i_mode))
 		return 0;
-
-	/*
-	 * Copy the default ACL of the containing directory to
-	 * the access ACL of the new file and use the mode that
-	 * was passed in to set up the correct initial values for
-	 * the u::,g::[m::], and o:: entries.  This is what makes
-	 * umask() "work" with ACL's.
-	 */
-
-	if (!(_ACL_ALLOC(cacl)))
-		return ENOMEM;
-
-	memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
-	xfs_acl_filter_mode(vap->va_mode, cacl);
-	xfs_acl_setmode(vp, cacl, &basicperms);
-
-	/*
-	 * Set the Default and Access ACL on the file.  The mode is already
-	 * set on the file, so we don't need to worry about that.
-	 *
-	 * If the new file is a directory, its default ACL is a copy of
-	 * the containing directory's default ACL.
-	 */
-	if (VN_ISDIR(vp))
-		xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
-	if (!error && !basicperms)
-		xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
-	_ACL_FREE(cacl);
-	return error;
+	return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
 }
 
-/*
- * Set up the correct mode on the file based on the supplied ACL.  This
- * makes sure that the mode on the file reflects the state of the
- * u::,g::[m::], and o:: entries in the ACL.  Since the mode is where
- * the ACL is going to get the permissions for these entries, we must
- * synchronize the mode whenever we set the ACL on a file.
- */
-STATIC int
-xfs_acl_setmode(
-	vnode_t		*vp,
-	xfs_acl_t	*acl,
-	int		*basicperms)
+int
+xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
-	vattr_t		va;
-	xfs_acl_entry_t	*ap;
-	xfs_acl_entry_t	*gap = NULL;
-	int		i, error, nomask = 1;
+	int error = 0;
 
-	*basicperms = 1;
-
-	if (acl->acl_cnt == XFS_ACL_NOT_PRESENT)
-		return 0;
+	if (!acl)
+		goto set_acl;
 
-	/*
-	 * Copy the u::, g::, o::, and m:: bits from the ACL into the
-	 * mode.  The m:: bits take precedence over the g:: bits.
-	 */
-	va.va_mask = XFS_AT_MODE;
-	VOP_GETATTR(vp, &va, 0, sys_cred, error);
-	if (error)
+	error = -E2BIG;
+	if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
 		return error;
 
-	va.va_mask = XFS_AT_MODE;
-	va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
-	ap = acl->acl_entry;
-	for (i = 0; i < acl->acl_cnt; ++i) {
-		switch (ap->ae_tag) {
-		case ACL_USER_OBJ:
-			va.va_mode |= ap->ae_perm << 6;
-			break;
-		case ACL_GROUP_OBJ:
-			gap = ap;
-			break;
-		case ACL_MASK:	/* more than just standard modes */
-			nomask = 0;
-			va.va_mode |= ap->ae_perm << 3;
-			*basicperms = 0;
-			break;
-		case ACL_OTHER:
-			va.va_mode |= ap->ae_perm;
-			break;
-		default:	/* more than just standard modes */
-			*basicperms = 0;
-			break;
-		}
-		ap++;
-	}
-
-	/* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */
-	if (gap && nomask)
-		va.va_mode |= gap->ae_perm << 3;
+	if (type == ACL_TYPE_ACCESS) {
+		umode_t mode = inode->i_mode;
+		error = posix_acl_equiv_mode(acl, &mode);
 
-	VOP_SETATTR(vp, &va, 0, sys_cred, error);
-	return error;
-}
+		if (error <= 0) {
+			acl = NULL;
 
-/*
- * The permissions for the special ACL entries (u::, g::[m::], o::) are
- * actually stored in the file mode (if there is both a group and a mask,
- * the group is stored in the ACL entry and the mask is stored on the file).
- * This allows the mode to remain automatically in sync with the ACL without
- * the need for a call-back to the ACL system at every point where the mode
- * could change.  This function takes the permissions from the specified mode
- * and places it in the supplied ACL.
- *
- * This implementation draws its validity from the fact that, when the ACL
- * was assigned, the mode was copied from the ACL.
- * If the mode did not change, therefore, the mode remains exactly what was
- * taken from the special ACL entries at assignment.
- * If a subsequent chmod() was done, the POSIX spec says that the change in
- * mode must cause an update to the ACL seen at user level and used for
- * access checks.  Before and after a mode change, therefore, the file mode
- * most accurately reflects what the special ACL entries should permit/deny.
- *
- * CAVEAT: If someone sets the SGI_ACL_FILE attribute directly,
- *         the existing mode bits will override whatever is in the
- *         ACL. Similarly, if there is a pre-existing ACL that was
- *         never in sync with its mode (owing to a bug in 6.5 and
- *         before), it will now magically (or mystically) be
- *         synchronized.  This could cause slight astonishment, but
- *         it is better than inconsistent permissions.
- *
- * The supplied ACL is a template that may contain any combination
- * of special entries.  These are treated as place holders when we fill
- * out the ACL.  This routine does not add or remove special entries, it
- * simply unites each special entry with its associated set of permissions.
- */
-STATIC void
-xfs_acl_sync_mode(
-	mode_t		mode,
-	xfs_acl_t	*acl)
-{
-	int		i, nomask = 1;
-	xfs_acl_entry_t	*ap;
-	xfs_acl_entry_t	*gap = NULL;
-
-	/*
-	 * Set ACL entries. POSIX1003.1eD16 requires that the MASK
-	 * be set instead of the GROUP entry, if there is a MASK.
-	 */
-	for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
-		switch (ap->ae_tag) {
-		case ACL_USER_OBJ:
-			ap->ae_perm = (mode >> 6) & 0x7;
-			break;
-		case ACL_GROUP_OBJ:
-			gap = ap;
-			break;
-		case ACL_MASK:
-			nomask = 0;
-			ap->ae_perm = (mode >> 3) & 0x7;
-			break;
-		case ACL_OTHER:
-			ap->ae_perm = mode & 0x7;
-			break;
-		default:
-			break;
+			if (error < 0)
+				return error;
 		}
-	}
-	/* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
-	if (gap && nomask)
-		gap->ae_perm = (mode >> 3) & 0x7;
-}
-
-/*
- * When inheriting an Access ACL from a directory Default ACL,
- * the ACL bits are set to the intersection of the ACL default
- * permission bits and the file permission bits in mode. If there
- * are no permission bits on the file then we must not give them
- * the ACL. This is what what makes umask() work with ACLs.
- */
-STATIC void
-xfs_acl_filter_mode(
-	mode_t		mode,
-	xfs_acl_t	*acl)
-{
-	int		i, nomask = 1;
-	xfs_acl_entry_t	*ap;
-	xfs_acl_entry_t	*gap = NULL;
 
-	/*
-	 * Set ACL entries. POSIX1003.1eD16 requires that the MASK
-	 * be merged with GROUP entry, if there is a MASK.
-	 */
-	for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
-		switch (ap->ae_tag) {
-		case ACL_USER_OBJ:
-			ap->ae_perm &= (mode >> 6) & 0x7;
-			break;
-		case ACL_GROUP_OBJ:
-			gap = ap;
-			break;
-		case ACL_MASK:
-			nomask = 0;
-			ap->ae_perm &= (mode >> 3) & 0x7;
-			break;
-		case ACL_OTHER:
-			ap->ae_perm &= mode & 0x7;
-			break;
-		default:
-			break;
-		}
+		error = xfs_set_mode(inode, mode);
+		if (error)
+			return error;
 	}
-	/* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
-	if (gap && nomask)
-		gap->ae_perm &= (mode >> 3) & 0x7;
+
+ set_acl:
+	return __xfs_set_acl(inode, type, acl);
 }
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index f9315bc960c..5dc16374451 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -18,85 +18,58 @@
 #ifndef __XFS_ACL_H__
 #define __XFS_ACL_H__
 
-/*
- * Access Control Lists
- */
-typedef __uint16_t	xfs_acl_perm_t;
-typedef __int32_t	xfs_acl_type_t;
-typedef __int32_t	xfs_acl_tag_t;
-typedef __int32_t	xfs_acl_id_t;
+struct inode;
+struct posix_acl;
+struct xfs_inode;
 
-#define XFS_ACL_MAX_ENTRIES 25
 #define XFS_ACL_NOT_PRESENT (-1)
 
-typedef struct xfs_acl_entry {
-	xfs_acl_tag_t	ae_tag;
-	xfs_acl_id_t	ae_id;
-	xfs_acl_perm_t	ae_perm;
-} xfs_acl_entry_t;
+/* On-disk XFS access control list structure */
+struct xfs_acl_entry {
+	__be32	ae_tag;
+	__be32	ae_id;
+	__be16	ae_perm;
+	__be16	ae_pad;		/* fill the implicit hole in the structure */
+};
 
-typedef struct xfs_acl {
-	__int32_t	acl_cnt;
-	xfs_acl_entry_t	acl_entry[XFS_ACL_MAX_ENTRIES];
-} xfs_acl_t;
+struct xfs_acl {
+	__be32			acl_cnt;
+	struct xfs_acl_entry	acl_entry[0];
+};
+
+/*
+ * The number of ACL entries allowed is defined by the on-disk format.
+ * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is
+ * limited only by the maximum size of the xattr that stores the information.
+ */
+#define XFS_ACL_MAX_ENTRIES(mp)	\
+	(xfs_sb_version_hascrc(&mp->m_sb) \
+		?  (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+						sizeof(struct xfs_acl_entry) \
+		: 25)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+	(sizeof(struct xfs_acl) + \
+		sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
 
 /* On-disk XFS extended attribute names */
-#define SGI_ACL_FILE	"SGI_ACL_FILE"
-#define SGI_ACL_DEFAULT	"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE		(unsigned char *)"SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT		(unsigned char *)"SGI_ACL_DEFAULT"
 #define SGI_ACL_FILE_SIZE	(sizeof(SGI_ACL_FILE)-1)
 #define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
 
-
 #ifdef CONFIG_XFS_POSIX_ACL
-
-struct vattr;
-struct vnode;
-struct xfs_inode;
-
-extern struct kmem_zone *xfs_acl_zone;
-#define xfs_acl_zone_init(zone, name)	\
-		(zone) = kmem_zone_init(sizeof(xfs_acl_t), name)
-#define xfs_acl_zone_destroy(zone)	kmem_cache_destroy(zone)
-
-extern int xfs_acl_inherit(struct vnode *, struct vattr *, xfs_acl_t *);
-extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
-extern int xfs_acl_vtoacl(struct vnode *, xfs_acl_t *, xfs_acl_t *);
-extern int xfs_acl_vhasacl_access(struct vnode *);
-extern int xfs_acl_vhasacl_default(struct vnode *);
-extern int xfs_acl_vset(struct vnode *, void *, size_t, int);
-extern int xfs_acl_vget(struct vnode *, void *, size_t, int);
-extern int xfs_acl_vremove(struct vnode *vp, int);
-
-#define _ACL_TYPE_ACCESS	1
-#define _ACL_TYPE_DEFAULT	2
-#define _ACL_PERM_INVALID(perm)	((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
-
-#define _ACL_INHERIT(c,v,d)	(xfs_acl_inherit(c,v,d))
-#define _ACL_GET_ACCESS(pv,pa)	(xfs_acl_vtoacl(pv,pa,NULL) == 0)
-#define _ACL_GET_DEFAULT(pv,pd)	(xfs_acl_vtoacl(pv,NULL,pd) == 0)
-#define _ACL_ACCESS_EXISTS	xfs_acl_vhasacl_access
-#define _ACL_DEFAULT_EXISTS	xfs_acl_vhasacl_default
-#define _ACL_XFS_IACCESS(i,m,c) (XFS_IFORK_Q(i) ? xfs_acl_iaccess(i,m,c) : -1)
-
-#define _ACL_ALLOC(a)		((a) = kmem_zone_alloc(xfs_acl_zone, KM_SLEEP))
-#define _ACL_FREE(a)		((a)? kmem_zone_free(xfs_acl_zone, (a)):(void)0)
-
+extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
+extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int posix_acl_access_exists(struct inode *inode);
+extern int posix_acl_default_exists(struct inode *inode);
 #else
-#define xfs_acl_zone_init(zone,name)
-#define xfs_acl_zone_destroy(zone)
-#define xfs_acl_vset(v,p,sz,t)	(-EOPNOTSUPP)
-#define xfs_acl_vget(v,p,sz,t)	(-EOPNOTSUPP)
-#define xfs_acl_vremove(v,t)	(-EOPNOTSUPP)
-#define xfs_acl_vhasacl_access(v)	(0)
-#define xfs_acl_vhasacl_default(v)	(0)
-#define _ACL_ALLOC(a)		(1)	/* successfully allocate nothing */
-#define _ACL_FREE(a)		((void)0)
-#define _ACL_INHERIT(c,v,d)	(0)
-#define _ACL_GET_ACCESS(pv,pa)	(0)
-#define _ACL_GET_DEFAULT(pv,pd)	(0)
-#define _ACL_ACCESS_EXISTS	(NULL)
-#define _ACL_DEFAULT_EXISTS	(NULL)
-#define _ACL_XFS_IACCESS(i,m,c) (-1)
-#endif
-
+static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
+{
+	return NULL;
+}
+# define xfs_set_acl					NULL
+# define posix_acl_access_exists(inode)			0
+# define posix_acl_default_exists(inode)		0
+#endif /* CONFIG_XFS_POSIX_ACL */
 #endif	/* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index a96e2ffce0c..6e247a99f5d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -30,6 +30,7 @@ struct xfs_trans;
 
 #define	XFS_AGF_MAGIC	0x58414746	/* 'XAGF' */
 #define	XFS_AGI_MAGIC	0x58414749	/* 'XAGI' */
+#define	XFS_AGFL_MAGIC	0x5841464c	/* 'XAFL' */
 #define	XFS_AGF_VERSION	1
 #define	XFS_AGI_VERSION	1
 
@@ -63,13 +64,33 @@ typedef struct xfs_agf {
 	__be32		agf_spare0;	/* spare field */
 	__be32		agf_levels[XFS_BTNUM_AGF];	/* btree levels */
 	__be32		agf_spare1;	/* spare field */
+
 	__be32		agf_flfirst;	/* first freelist block's index */
 	__be32		agf_fllast;	/* last freelist block's index */
 	__be32		agf_flcount;	/* count of blocks in freelist */
 	__be32		agf_freeblks;	/* total free blocks */
+
 	__be32		agf_longest;	/* longest free space */
+	__be32		agf_btreeblks;	/* # of blocks held in AGF btrees */
+	uuid_t		agf_uuid;	/* uuid of filesystem */
+
+	/*
+	 * reserve some contiguous space for future logged fields before we add
+	 * the unlogged fields. This makes the range logging via flags and
+	 * structure offsets much simpler.
+	 */
+	__be64		agf_spare64[16];
+
+	/* unlogged fields, written during buffer writeback. */
+	__be64		agf_lsn;	/* last write sequence */
+	__be32		agf_crc;	/* crc of agf sector */
+	__be32		agf_spare2;
+
+	/* structure must be padded to 64 bit alignment */
 } xfs_agf_t;
 
+#define XFS_AGF_CRC_OFF		offsetof(struct xfs_agf, agf_crc)
+
 #define	XFS_AGF_MAGICNUM	0x00000001
 #define	XFS_AGF_VERSIONNUM	0x00000002
 #define	XFS_AGF_SEQNO		0x00000004
@@ -81,14 +102,33 @@ typedef struct xfs_agf {
 #define	XFS_AGF_FLCOUNT		0x00000100
 #define	XFS_AGF_FREEBLKS	0x00000200
 #define	XFS_AGF_LONGEST		0x00000400
-#define	XFS_AGF_NUM_BITS	11
+#define	XFS_AGF_BTREEBLKS	0x00000800
+#define	XFS_AGF_UUID		0x00001000
+#define	XFS_AGF_NUM_BITS	13
 #define	XFS_AGF_ALL_BITS	((1 << XFS_AGF_NUM_BITS) - 1)
 
+#define XFS_AGF_FLAGS \
+	{ XFS_AGF_MAGICNUM,	"MAGICNUM" }, \
+	{ XFS_AGF_VERSIONNUM,	"VERSIONNUM" }, \
+	{ XFS_AGF_SEQNO,	"SEQNO" }, \
+	{ XFS_AGF_LENGTH,	"LENGTH" }, \
+	{ XFS_AGF_ROOTS,	"ROOTS" }, \
+	{ XFS_AGF_LEVELS,	"LEVELS" }, \
+	{ XFS_AGF_FLFIRST,	"FLFIRST" }, \
+	{ XFS_AGF_FLLAST,	"FLLAST" }, \
+	{ XFS_AGF_FLCOUNT,	"FLCOUNT" }, \
+	{ XFS_AGF_FREEBLKS,	"FREEBLKS" }, \
+	{ XFS_AGF_LONGEST,	"LONGEST" }, \
+	{ XFS_AGF_BTREEBLKS,	"BTREEBLKS" }, \
+	{ XFS_AGF_UUID,		"UUID" }
+
 /* disk block (xfs_daddr_t) in the AG */
 #define XFS_AGF_DADDR(mp)	((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
 #define	XFS_AGF_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
-#define	XFS_BUF_TO_AGF(bp)	((xfs_agf_t *)XFS_BUF_PTR(bp))
+#define	XFS_BUF_TO_AGF(bp)	((xfs_agf_t *)((bp)->b_addr))
 
+extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
+			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
 
 /*
  * Size of the unlinked inode hash table in the agi.
@@ -112,6 +152,7 @@ typedef struct xfs_agi {
 	__be32		agi_root;	/* root of inode btree */
 	__be32		agi_level;	/* levels in inode btree */
 	__be32		agi_freecount;	/* number of free inodes */
+
 	__be32		agi_newino;	/* new inode just allocated */
 	__be32		agi_dirino;	/* last directory inode chunk */
 	/*
@@ -119,26 +160,46 @@ typedef struct xfs_agi {
 	 * still being referenced.
 	 */
 	__be32		agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+	/*
+	 * This marks the end of logging region 1 and start of logging region 2.
+	 */
+	uuid_t		agi_uuid;	/* uuid of filesystem */
+	__be32		agi_crc;	/* crc of agi sector */
+	__be32		agi_pad32;
+	__be64		agi_lsn;	/* last write sequence */
+
+	__be32		agi_free_root; /* root of the free inode btree */
+	__be32		agi_free_level;/* levels in free inode btree */
+
+	/* structure must be padded to 64 bit alignment */
 } xfs_agi_t;
 
-#define	XFS_AGI_MAGICNUM	0x00000001
-#define	XFS_AGI_VERSIONNUM	0x00000002
-#define	XFS_AGI_SEQNO		0x00000004
-#define	XFS_AGI_LENGTH		0x00000008
-#define	XFS_AGI_COUNT		0x00000010
-#define	XFS_AGI_ROOT		0x00000020
-#define	XFS_AGI_LEVEL		0x00000040
-#define	XFS_AGI_FREECOUNT	0x00000080
-#define	XFS_AGI_NEWINO		0x00000100
-#define	XFS_AGI_DIRINO		0x00000200
-#define	XFS_AGI_UNLINKED	0x00000400
-#define	XFS_AGI_NUM_BITS	11
-#define	XFS_AGI_ALL_BITS	((1 << XFS_AGI_NUM_BITS) - 1)
+#define XFS_AGI_CRC_OFF		offsetof(struct xfs_agi, agi_crc)
+
+#define	XFS_AGI_MAGICNUM	(1 << 0)
+#define	XFS_AGI_VERSIONNUM	(1 << 1)
+#define	XFS_AGI_SEQNO		(1 << 2)
+#define	XFS_AGI_LENGTH		(1 << 3)
+#define	XFS_AGI_COUNT		(1 << 4)
+#define	XFS_AGI_ROOT		(1 << 5)
+#define	XFS_AGI_LEVEL		(1 << 6)
+#define	XFS_AGI_FREECOUNT	(1 << 7)
+#define	XFS_AGI_NEWINO		(1 << 8)
+#define	XFS_AGI_DIRINO		(1 << 9)
+#define	XFS_AGI_UNLINKED	(1 << 10)
+#define	XFS_AGI_NUM_BITS_R1	11	/* end of the 1st agi logging region */
+#define	XFS_AGI_ALL_BITS_R1	((1 << XFS_AGI_NUM_BITS_R1) - 1)
+#define	XFS_AGI_FREE_ROOT	(1 << 11)
+#define	XFS_AGI_FREE_LEVEL	(1 << 12)
+#define	XFS_AGI_NUM_BITS_R2	13
 
 /* disk block (xfs_daddr_t) in the AG */
 #define XFS_AGI_DADDR(mp)	((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
 #define	XFS_AGI_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
-#define	XFS_BUF_TO_AGI(bp)	((xfs_agi_t *)XFS_BUF_PTR(bp))
+#define	XFS_BUF_TO_AGI(bp)	((xfs_agi_t *)((bp)->b_addr))
+
+extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
+				xfs_agnumber_t agno, struct xfs_buf **bpp);
 
 /*
  * The third a.g. block contains the a.g. freelist, an array
@@ -146,53 +207,42 @@ typedef struct xfs_agi {
  */
 #define XFS_AGFL_DADDR(mp)	((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
 #define	XFS_AGFL_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
-#define XFS_AGFL_SIZE(mp)	((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t))
-#define	XFS_BUF_TO_AGFL(bp)	((xfs_agfl_t *)XFS_BUF_PTR(bp))
+#define	XFS_BUF_TO_AGFL(bp)	((xfs_agfl_t *)((bp)->b_addr))
 
-typedef struct xfs_agfl {
-	xfs_agblock_t	agfl_bno[1];	/* actually XFS_AGFL_SIZE(mp) */
-} xfs_agfl_t;
+#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
+	(xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+		&(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
+		(__be32 *)(bp)->b_addr)
 
 /*
- * Busy block/extent entry.  Used in perag to mark blocks that have been freed
- * but whose transactions aren't committed to disk yet.
+ * Size of the AGFL.  For CRC-enabled filesystes we steal a couple of
+ * slots in the beginning of the block for a proper header with the
+ * location information and CRC.
  */
-typedef struct xfs_perag_busy {
-	xfs_agblock_t	busy_start;
-	xfs_extlen_t	busy_length;
-	struct xfs_trans *busy_tp;	/* transaction that did the free */
-} xfs_perag_busy_t;
+#define XFS_AGFL_SIZE(mp) \
+	(((mp)->m_sb.sb_sectsize - \
+	 (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+		sizeof(struct xfs_agfl) : 0)) / \
+	  sizeof(xfs_agblock_t))
+
+typedef struct xfs_agfl {
+	__be32		agfl_magicnum;
+	__be32		agfl_seqno;
+	uuid_t		agfl_uuid;
+	__be64		agfl_lsn;
+	__be32		agfl_crc;
+	__be32		agfl_bno[];	/* actually XFS_AGFL_SIZE(mp) */
+} xfs_agfl_t;
+
+#define XFS_AGFL_CRC_OFF	offsetof(struct xfs_agfl, agfl_crc)
 
 /*
- * Per-ag incore structure, copies of information in agf and agi,
- * to improve the performance of allocation group selection.
- *
- * pick sizes which fit in allocation buckets well
+ * tags for inode radix tree
  */
-#if (BITS_PER_LONG == 32)
-#define XFS_PAGB_NUM_SLOTS	84
-#elif (BITS_PER_LONG == 64)
-#define XFS_PAGB_NUM_SLOTS	128
-#endif
-
-typedef struct xfs_perag
-{
-	char		pagf_init;	/* this agf's entry is initialized */
-	char		pagi_init;	/* this agi's entry is initialized */
-	char		pagf_metadata;	/* the agf is prefered to be metadata */
-	char		pagi_inodeok;	/* The agi is ok for inodes */
-	__uint8_t	pagf_levels[XFS_BTNUM_AGF];
-					/* # of levels in bno & cnt btree */
-	__uint32_t	pagf_flcount;	/* count of blocks in freelist */
-	xfs_extlen_t	pagf_freeblks;	/* total free blocks */
-	xfs_extlen_t	pagf_longest;	/* longest free space */
-	xfs_agino_t	pagi_freecount;	/* number of free inodes */
-#ifdef __KERNEL__
-	lock_t		pagb_lock;	/* lock for pagb_list */
-#endif
-	int		pagb_count;	/* pagb slots in use */
-	xfs_perag_busy_t *pagb_list;	/* unstable blocks */
-} xfs_perag_t;
+#define XFS_ICI_NO_TAG		(-1)	/* special flag for an untagged lookup
+					   in xfs_inode_ag_iterator */
+#define XFS_ICI_RECLAIM_TAG	0	/* inode is to be reclaimed */
+#define XFS_ICI_EOFBLOCKS_TAG	1	/* inode has blocks beyond EOF */
 
 #define	XFS_AG_MAXLEVELS(mp)		((mp)->m_ag_maxlevels)
 #define	XFS_MIN_FREELIST_RAW(bl,cl,mp)	\
@@ -203,15 +253,15 @@ typedef struct xfs_perag
 		be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
 #define	XFS_MIN_FREELIST_PAG(pag,mp)	\
 	(XFS_MIN_FREELIST_RAW(		\
-		(uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
-		(uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
+		(unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
+		(unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
 
 #define XFS_AGB_TO_FSB(mp,agno,agbno)	\
 	(((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
 #define	XFS_FSB_TO_AGNO(mp,fsbno)	\
 	((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
 #define	XFS_FSB_TO_AGBNO(mp,fsbno)	\
-	((xfs_agblock_t)((fsbno) & XFS_MASK32LO((mp)->m_sb.sb_agblklog)))
+	((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
 #define	XFS_AGB_TO_DADDR(mp,agno,agbno)	\
 	((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
 		(xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
@@ -224,8 +274,8 @@ typedef struct xfs_perag
 #define	XFS_AG_CHECK_DADDR(mp,d,len)	\
 	((len) == 1 ? \
 	    ASSERT((d) == XFS_SB_DADDR || \
-		   XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \
-	    ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \
-		   XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1)))
+		   xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
+	    ASSERT(xfs_daddr_to_agno(mp, d) == \
+		   xfs_daddr_to_agno(mp, (d) + (len) - 1)))
 
 #endif	/* __XFS_AG_H__ */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index f4328e1e2a7..d43813267a8 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -17,108 +17,153 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
 #include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_ialloc.h"
+#include "xfs_alloc_btree.h"
 #include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
 #include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
 
+struct workqueue_struct *xfs_alloc_wq;
 
 #define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
 
 #define	XFSA_FIXUP_BNO_OK	1
 #define	XFSA_FIXUP_CNT_OK	2
 
-STATIC int
-xfs_alloc_search_busy(xfs_trans_t *tp,
-		    xfs_agnumber_t agno,
-		    xfs_agblock_t bno,
-		    xfs_extlen_t len);
-
-#if defined(XFS_ALLOC_TRACE)
-ktrace_t *xfs_alloc_trace_buf;
-
-#define	TRACE_ALLOC(s,a)	\
-	xfs_alloc_trace_alloc(fname, s, a, __LINE__)
-#define	TRACE_FREE(s,a,b,x,f)	\
-	xfs_alloc_trace_free(fname, s, mp, a, b, x, f, __LINE__)
-#define	TRACE_MODAGF(s,a,f)	\
-	xfs_alloc_trace_modagf(fname, s, mp, a, f, __LINE__)
-#define	TRACE_BUSY(fname,s,ag,agb,l,sl,tp)	\
-	xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
-#define	TRACE_UNBUSY(fname,s,ag,sl,tp)	\
-	xfs_alloc_trace_busy(fname, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
-#define	TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp)	\
-	xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
-#else
-#define	TRACE_ALLOC(s,a)
-#define	TRACE_FREE(s,a,b,x,f)
-#define	TRACE_MODAGF(s,a,f)
-#define	TRACE_BUSY(s,a,ag,agb,l,sl,tp)
-#define	TRACE_UNBUSY(fname,s,ag,sl,tp)
-#define	TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp)
-#endif	/* XFS_ALLOC_TRACE */
-
-/*
- * Prototypes for per-ag allocation routines
- */
-
 STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
-	xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+		xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+
+/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ */
+STATIC int				/* error */
+xfs_alloc_lookup_eq(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int				/* error */
+xfs_alloc_lookup_ge(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
 
 /*
- * Internal functions.
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
  */
+int					/* error */
+xfs_alloc_lookup_le(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int				/* error */
+xfs_alloc_update(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len)	/* length of extent */
+{
+	union xfs_btree_rec	rec;
+
+	rec.alloc.ar_startblock = cpu_to_be32(bno);
+	rec.alloc.ar_blockcount = cpu_to_be32(len);
+	return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_alloc_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		*bno,	/* output: starting block of extent */
+	xfs_extlen_t		*len,	/* output: length of extent */
+	int			*stat)	/* output: success/failure */
+{
+	union xfs_btree_rec	*rec;
+	int			error;
+
+	error = xfs_btree_get_rec(cur, &rec, stat);
+	if (!error && *stat == 1) {
+		*bno = be32_to_cpu(rec->alloc.ar_startblock);
+		*len = be32_to_cpu(rec->alloc.ar_blockcount);
+	}
+	return error;
+}
 
 /*
  * Compute aligned version of the found extent.
  * Takes alignment and min length into account.
  */
-STATIC int				/* success (>= minlen) */
+STATIC void
 xfs_alloc_compute_aligned(
+	xfs_alloc_arg_t	*args,		/* allocation argument structure */
 	xfs_agblock_t	foundbno,	/* starting block in found extent */
 	xfs_extlen_t	foundlen,	/* length in found extent */
-	xfs_extlen_t	alignment,	/* alignment for allocation */
-	xfs_extlen_t	minlen,		/* minimum length for allocation */
 	xfs_agblock_t	*resbno,	/* result block number */
 	xfs_extlen_t	*reslen)	/* result length */
 {
 	xfs_agblock_t	bno;
-	xfs_extlen_t	diff;
 	xfs_extlen_t	len;
 
-	if (alignment > 1 && foundlen >= minlen) {
-		bno = roundup(foundbno, alignment);
-		diff = bno - foundbno;
-		len = diff >= foundlen ? 0 : foundlen - diff;
+	/* Trim busy sections out of found extent */
+	xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+
+	if (args->alignment > 1 && len >= args->minlen) {
+		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
+		xfs_extlen_t	diff = aligned_bno - bno;
+
+		*resbno = aligned_bno;
+		*reslen = diff >= len ? 0 : len - diff;
 	} else {
-		bno = foundbno;
-		len = foundlen;
+		*resbno = bno;
+		*reslen = len;
 	}
-	*resbno = bno;
-	*reslen = len;
-	return len >= minlen;
 }
 
 /*
@@ -130,6 +175,7 @@ xfs_alloc_compute_diff(
 	xfs_agblock_t	wantbno,	/* target starting block */
 	xfs_extlen_t	wantlen,	/* target length */
 	xfs_extlen_t	alignment,	/* target alignment */
+	char		userdata,	/* are we allocating data? */
 	xfs_agblock_t	freebno,	/* freespace's starting block */
 	xfs_extlen_t	freelen,	/* freespace's length */
 	xfs_agblock_t	*newbnop)	/* result: best start block from free */
@@ -144,7 +190,14 @@ xfs_alloc_compute_diff(
 	ASSERT(freelen >= wantlen);
 	freeend = freebno + freelen;
 	wantend = wantbno + wantlen;
-	if (freebno >= wantbno) {
+	/*
+	 * We want to allocate from the start of a free extent if it is past
+	 * the desired block or if we are allocating user data and the free
+	 * extent is before desired block. The second case is there to allow
+	 * for contiguous allocation from the remaining free space if the file
+	 * grows in the short term.
+	 */
+	if (freebno >= wantbno || (userdata && freeend < wantend)) {
 		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
 			newbno1 = NULLAGBLOCK;
 	} else if (freeend >= wantend && alignment > 1) {
@@ -204,16 +257,14 @@ xfs_alloc_fix_len(
 	k = rlen % args->prod;
 	if (k == args->mod)
 		return;
-	if (k > args->mod) {
-		if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen)
-			return;
-	} else {
-		if ((int)(rlen = rlen - args->prod - (args->mod - k)) <
-		    (int)args->minlen)
-			return;
-	}
-	ASSERT(rlen >= args->minlen);
-	ASSERT(rlen <= args->maxlen);
+	if (k > args->mod)
+		rlen = rlen - (k - args->mod);
+	else
+		rlen = rlen - args->prod + (args->mod - k);
+	if ((int)rlen < (int)args->minlen)
+		return;
+	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
+	ASSERT(rlen % args->prod == args->mod);
 	args->len = rlen;
 }
 
@@ -232,7 +283,6 @@ xfs_alloc_fix_minleft(
 		return 1;
 	agf = XFS_BUF_TO_AGF(args->agbp);
 	diff = be32_to_cpu(agf->agf_freeblks)
-		+ be32_to_cpu(agf->agf_flcount)
 		- args->len - args->minleft;
 	if (diff >= 0)
 		return 1;
@@ -297,21 +347,20 @@ xfs_alloc_fixup_trees(
 			return error;
 		XFS_WANT_CORRUPTED_RETURN(i == 1);
 	}
+
 #ifdef DEBUG
-	{
-		xfs_alloc_block_t	*bnoblock;
-		xfs_alloc_block_t	*cntblock;
-
-		if (bno_cur->bc_nlevels == 1 &&
-		    cnt_cur->bc_nlevels == 1) {
-			bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]);
-			cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]);
-			XFS_WANT_CORRUPTED_RETURN(
-				be16_to_cpu(bnoblock->bb_numrecs) ==
-				be16_to_cpu(cntblock->bb_numrecs));
-		}
+	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
+		struct xfs_btree_block	*bnoblock;
+		struct xfs_btree_block	*cntblock;
+
+		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
+		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+
+		XFS_WANT_CORRUPTED_RETURN(
+			bnoblock->bb_numrecs == cntblock->bb_numrecs);
 	}
 #endif
+
 	/*
 	 * Deal with all four cases: the allocated record is contained
 	 * within the freespace record, so we can have new freespace
@@ -336,7 +385,7 @@ xfs_alloc_fixup_trees(
 	/*
 	 * Delete the entry from the by-size btree.
 	 */
-	if ((error = xfs_alloc_delete(cnt_cur, &i)))
+	if ((error = xfs_btree_delete(cnt_cur, &i)))
 		return error;
 	XFS_WANT_CORRUPTED_RETURN(i == 1);
 	/*
@@ -346,7 +395,7 @@ xfs_alloc_fixup_trees(
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
 			return error;
 		XFS_WANT_CORRUPTED_RETURN(i == 0);
-		if ((error = xfs_alloc_insert(cnt_cur, &i)))
+		if ((error = xfs_btree_insert(cnt_cur, &i)))
 			return error;
 		XFS_WANT_CORRUPTED_RETURN(i == 1);
 	}
@@ -354,7 +403,7 @@ xfs_alloc_fixup_trees(
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
 			return error;
 		XFS_WANT_CORRUPTED_RETURN(i == 0);
-		if ((error = xfs_alloc_insert(cnt_cur, &i)))
+		if ((error = xfs_btree_insert(cnt_cur, &i)))
 			return error;
 		XFS_WANT_CORRUPTED_RETURN(i == 1);
 	}
@@ -365,7 +414,7 @@ xfs_alloc_fixup_trees(
 		/*
 		 * No remaining freespace, just delete the by-block tree entry.
 		 */
-		if ((error = xfs_alloc_delete(bno_cur, &i)))
+		if ((error = xfs_btree_delete(bno_cur, &i)))
 			return error;
 		XFS_WANT_CORRUPTED_RETURN(i == 1);
 	} else {
@@ -382,13 +431,94 @@ xfs_alloc_fixup_trees(
 		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
 			return error;
 		XFS_WANT_CORRUPTED_RETURN(i == 0);
-		if ((error = xfs_alloc_insert(bno_cur, &i)))
+		if ((error = xfs_btree_insert(bno_cur, &i)))
 			return error;
 		XFS_WANT_CORRUPTED_RETURN(i == 1);
 	}
 	return 0;
 }
 
+static bool
+xfs_agfl_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
+	int		i;
+
+	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+		return false;
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+		return false;
+
+	for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+		if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
+		    be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+			return false;
+	}
+	return true;
+}
+
+static void
+xfs_agfl_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	/*
+	 * There is no verification of non-crc AGFLs because mkfs does not
+	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
+	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
+	 * can't verify just those entries are valid.
+	 */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_agfl_verify(bp))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_agfl_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	/* no verification of non-crc AGFLs */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_agfl_verify(bp)) {
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (bip)
+		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+	.verify_read = xfs_agfl_read_verify,
+	.verify_write = xfs_agfl_write_verify,
+};
+
 /*
  * Read in the allocation group free block array.
  */
@@ -406,133 +536,34 @@ xfs_alloc_read_agfl(
 	error = xfs_trans_read_buf(
 			mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0, &bp);
+			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
 	if (error)
 		return error;
-	ASSERT(bp);
-	ASSERT(!XFS_BUF_GETERROR(bp));
-	XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF);
+	xfs_buf_set_ref(bp, XFS_AGFL_REF);
 	*bpp = bp;
 	return 0;
 }
 
-#if defined(XFS_ALLOC_TRACE)
-/*
- * Add an allocation trace entry for an alloc call.
- */
-STATIC void
-xfs_alloc_trace_alloc(
-	char		*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_alloc_arg_t	*args,		/* allocation argument structure */
-	int		line)		/* source line number */
+STATIC int
+xfs_alloc_update_counters(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_buf		*agbp,
+	long			len)
 {
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(XFS_ALLOC_KTRACE_ALLOC | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)args->mp,
-		(void *)(__psunsigned_t)args->agno,
-		(void *)(__psunsigned_t)args->agbno,
-		(void *)(__psunsigned_t)args->minlen,
-		(void *)(__psunsigned_t)args->maxlen,
-		(void *)(__psunsigned_t)args->mod,
-		(void *)(__psunsigned_t)args->prod,
-		(void *)(__psunsigned_t)args->minleft,
-		(void *)(__psunsigned_t)args->total,
-		(void *)(__psunsigned_t)args->alignment,
-		(void *)(__psunsigned_t)args->len,
-		(void *)((((__psint_t)args->type) << 16) |
-			 (__psint_t)args->otype),
-		(void *)(__psint_t)((args->wasdel << 3) |
-				    (args->wasfromfl << 2) |
-				    (args->isfl << 1) |
-				    (args->userdata << 0)));
-}
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
 
-/*
- * Add an allocation trace entry for a free call.
- */
-STATIC void
-xfs_alloc_trace_free(
-	char		*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_agblock_t	agbno,		/* a.g. relative block number */
-	xfs_extlen_t	len,		/* length of extent */
-	int		isfl,		/* set if is freelist allocation/free */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(XFS_ALLOC_KTRACE_FREE | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)mp,
-		(void *)(__psunsigned_t)agno,
-		(void *)(__psunsigned_t)agbno,
-		(void *)(__psunsigned_t)len,
-		(void *)(__psint_t)isfl,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
-}
+	pag->pagf_freeblks += len;
+	be32_add_cpu(&agf->agf_freeblks, len);
 
-/*
- * Add an allocation trace entry for modifying an agf.
- */
-STATIC void
-xfs_alloc_trace_modagf(
-	char		*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_agf_t	*agf,		/* new agf value */
-	int		flags,		/* logging flags for agf */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(XFS_ALLOC_KTRACE_MODAGF | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)mp,
-		(void *)(__psint_t)flags,
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_seqno),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_length),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_flfirst),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_fllast),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_flcount),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_freeblks),
-		(void *)(__psunsigned_t)be32_to_cpu(agf->agf_longest));
-}
+	xfs_trans_agblocks_delta(tp, len);
+	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
+		     be32_to_cpu(agf->agf_length)))
+		return EFSCORRUPTED;
 
-STATIC void
-xfs_alloc_trace_busy(
-	char		*name,		/* function tag string */
-	char		*str,		/* additional string */
-	xfs_mount_t	*mp,		/* file system mount poing */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_agblock_t	agbno,		/* a.g. relative block number */
-	xfs_extlen_t	len,		/* length of extent */
-	int		slot,		/* perag Busy slot */
-	xfs_trans_t	*tp,
-	int		trtype,		/* type: add, delete, search */
-	int		line)		/* source line number */
-{
-	ktrace_enter(xfs_alloc_trace_buf,
-		(void *)(__psint_t)(trtype | (line << 16)),
-		(void *)name,
-		(void *)str,
-		(void *)mp,
-		(void *)(__psunsigned_t)agno,
-		(void *)(__psunsigned_t)agbno,
-		(void *)(__psunsigned_t)len,
-		(void *)(__psint_t)slot,
-		(void *)tp,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
+	return 0;
 }
-#endif	/* XFS_ALLOC_TRACE */
 
 /*
  * Allocation group level functions.
@@ -551,9 +582,6 @@ xfs_alloc_ag_vextent(
 	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
 {
 	int		error=0;
-#ifdef XFS_ALLOC_TRACE
-	static char	fname[] = "xfs_alloc_ag_vextent";
-#endif
 
 	ASSERT(args->minlen > 0);
 	ASSERT(args->maxlen > 0);
@@ -578,46 +606,36 @@ xfs_alloc_ag_vextent(
 		ASSERT(0);
 		/* NOTREACHED */
 	}
-	if (error)
+
+	if (error || args->agbno == NULLAGBLOCK)
 		return error;
-	/*
-	 * If the allocation worked, need to change the agf structure
-	 * (and log it), and the superblock.
-	 */
-	if (args->agbno != NULLAGBLOCK) {
-		xfs_agf_t	*agf;	/* allocation group freelist header */
-#ifdef XFS_ALLOC_TRACE
-		xfs_mount_t	*mp = args->mp;
-#endif
-		long		slen = (long)args->len;
 
-		ASSERT(args->len >= args->minlen && args->len <= args->maxlen);
-		ASSERT(!(args->wasfromfl) || !args->isfl);
-		ASSERT(args->agbno % args->alignment == 0);
-		if (!(args->wasfromfl)) {
-
-			agf = XFS_BUF_TO_AGF(args->agbp);
-			be32_add(&agf->agf_freeblks, -(args->len));
-			xfs_trans_agblocks_delta(args->tp,
-						 -((long)(args->len)));
-			args->pag->pagf_freeblks -= args->len;
-			ASSERT(be32_to_cpu(agf->agf_freeblks) <=
-				be32_to_cpu(agf->agf_length));
-			TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
-			xfs_alloc_log_agf(args->tp, args->agbp,
-						XFS_AGF_FREEBLKS);
-			/* search the busylist for these blocks */
-			xfs_alloc_search_busy(args->tp, args->agno,
-					args->agbno, args->len);
-		}
-		if (!args->isfl)
-			xfs_trans_mod_sb(args->tp,
-				args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
-					XFS_TRANS_SB_FDBLOCKS, -slen);
-		XFS_STATS_INC(xs_allocx);
-		XFS_STATS_ADD(xs_allocb, args->len);
+	ASSERT(args->len >= args->minlen);
+	ASSERT(args->len <= args->maxlen);
+	ASSERT(!args->wasfromfl || !args->isfl);
+	ASSERT(args->agbno % args->alignment == 0);
+
+	if (!args->wasfromfl) {
+		error = xfs_alloc_update_counters(args->tp, args->pag,
+						  args->agbp,
+						  -((long)(args->len)));
+		if (error)
+			return error;
+
+		ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
+					      args->agbno, args->len));
 	}
-	return 0;
+
+	if (!args->isfl) {
+		xfs_trans_mod_sb(args->tp, args->wasdel ?
+				 XFS_TRANS_SB_RES_FDBLOCKS :
+				 XFS_TRANS_SB_FDBLOCKS,
+				 -((long)(args->len)));
+	}
+
+	XFS_STATS_INC(xs_allocx);
+	XFS_STATS_ADD(xs_allocb, args->len);
+	return error;
 }
 
 /*
@@ -632,97 +650,194 @@ xfs_alloc_ag_vextent_exact(
 {
 	xfs_btree_cur_t	*bno_cur;/* by block-number btree cursor */
 	xfs_btree_cur_t	*cnt_cur;/* by count btree cursor */
-	xfs_agblock_t	end;	/* end of allocated extent */
 	int		error;
 	xfs_agblock_t	fbno;	/* start block of found extent */
-	xfs_agblock_t	fend;	/* end block of found extent */
 	xfs_extlen_t	flen;	/* length of found extent */
-#ifdef XFS_ALLOC_TRACE
-	static char	fname[] = "xfs_alloc_ag_vextent_exact";
-#endif
+	xfs_agblock_t	tbno;	/* start block of trimmed extent */
+	xfs_extlen_t	tlen;	/* length of trimmed extent */
+	xfs_agblock_t	tend;	/* end block of trimmed extent */
 	int		i;	/* success/failure of operation */
-	xfs_agblock_t	maxend;	/* end of maximal extent */
-	xfs_agblock_t	minend;	/* end of minimal extent */
-	xfs_extlen_t	rlen;	/* length of returned extent */
 
 	ASSERT(args->alignment == 1);
+
 	/*
 	 * Allocate/initialize a cursor for the by-number freespace btree.
 	 */
-	bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-		args->agno, XFS_BTNUM_BNO, NULL, 0);
+	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+					  args->agno, XFS_BTNUM_BNO);
+
 	/*
 	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
 	 * Look for the closest free block <= bno, it must contain bno
 	 * if any free block does.
 	 */
-	if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i)))
+	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
+	if (error)
 		goto error0;
-	if (!i) {
-		/*
-		 * Didn't find it, return null.
-		 */
-		xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-		args->agbno = NULLAGBLOCK;
-		return 0;
-	}
+	if (!i)
+		goto not_found;
+
 	/*
 	 * Grab the freespace record.
 	 */
-	if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i)))
+	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
+	if (error)
 		goto error0;
 	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 	ASSERT(fbno <= args->agbno);
-	minend = args->agbno + args->minlen;
-	maxend = args->agbno + args->maxlen;
-	fend = fbno + flen;
+
 	/*
-	 * Give up if the freespace isn't long enough for the minimum request.
+	 * Check for overlapping busy extents.
 	 */
-	if (fend < minend) {
-		xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-		args->agbno = NULLAGBLOCK;
-		return 0;
-	}
+	xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
+
 	/*
-	 * End of extent will be smaller of the freespace end and the
-	 * maximal requested end.
+	 * Give up if the start of the extent is busy, or the freespace isn't
+	 * long enough for the minimum request.
 	 */
-	end = XFS_AGBLOCK_MIN(fend, maxend);
+	if (tbno > args->agbno)
+		goto not_found;
+	if (tlen < args->minlen)
+		goto not_found;
+	tend = tbno + tlen;
+	if (tend < args->agbno + args->minlen)
+		goto not_found;
+
 	/*
+	 * End of extent will be smaller of the freespace end and the
+	 * maximal requested end.
+	 *
 	 * Fix the length according to mod and prod if given.
 	 */
-	args->len = end - args->agbno;
+	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
+						- args->agbno;
 	xfs_alloc_fix_len(args);
-	if (!xfs_alloc_fix_minleft(args)) {
-		xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
-		return 0;
-	}
-	rlen = args->len;
-	ASSERT(args->agbno + rlen <= fend);
-	end = args->agbno + rlen;
+	if (!xfs_alloc_fix_minleft(args))
+		goto not_found;
+
+	ASSERT(args->agbno + args->len <= tend);
+
 	/*
-	 * We are allocating agbno for rlen [agbno .. end]
+	 * We are allocating agbno for args->len
 	 * Allocate/initialize a cursor for the by-size btree.
 	 */
-	cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-		args->agno, XFS_BTNUM_CNT, NULL, 0);
+	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT);
 	ASSERT(args->agbno + args->len <=
 		be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
-	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
-			args->agbno, args->len, XFSA_FIXUP_BNO_OK))) {
+	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
+				      args->len, XFSA_FIXUP_BNO_OK);
+	if (error) {
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
 		goto error0;
 	}
+
 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-	TRACE_ALLOC("normal", args);
+
 	args->wasfromfl = 0;
+	trace_xfs_alloc_exact_done(args);
+	return 0;
+
+not_found:
+	/* Didn't find it, return null. */
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	args->agbno = NULLAGBLOCK;
+	trace_xfs_alloc_exact_notfound(args);
 	return 0;
 
 error0:
 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_exact_error(args);
+	return error;
+}
+
+/*
+ * Search the btree in a given direction via the search cursor and compare
+ * the records found against the good extent we've already found.
+ */
+STATIC int
+xfs_alloc_find_best_extent(
+	struct xfs_alloc_arg	*args,	/* allocation argument structure */
+	struct xfs_btree_cur	**gcur,	/* good cursor */
+	struct xfs_btree_cur	**scur,	/* searching cursor */
+	xfs_agblock_t		gdiff,	/* difference for search comparison */
+	xfs_agblock_t		*sbno,	/* extent found by search */
+	xfs_extlen_t		*slen,	/* extent length */
+	xfs_agblock_t		*sbnoa,	/* aligned extent found by search */
+	xfs_extlen_t		*slena,	/* aligned extent length */
+	int			dir)	/* 0 = search right, 1 = search left */
+{
+	xfs_agblock_t		new;
+	xfs_agblock_t		sdiff;
+	int			error;
+	int			i;
+
+	/* The good extent is perfect, no need to  search. */
+	if (!gdiff)
+		goto out_use_good;
+
+	/*
+	 * Look until we find a better one, run out of space or run off the end.
+	 */
+	do {
+		error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
+
+		/*
+		 * The good extent is closer than this one.
+		 */
+		if (!dir) {
+			if (*sbnoa >= args->agbno + gdiff)
+				goto out_use_good;
+		} else {
+			if (*sbnoa <= args->agbno - gdiff)
+				goto out_use_good;
+		}
+
+		/*
+		 * Same distance, compare length and pick the best.
+		 */
+		if (*slena >= args->minlen) {
+			args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
+			xfs_alloc_fix_len(args);
+
+			sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+						       args->alignment,
+						       args->userdata, *sbnoa,
+						       *slena, &new);
+
+			/*
+			 * Choose closer size and invalidate other cursor.
+			 */
+			if (sdiff < gdiff)
+				goto out_use_search;
+			goto out_use_good;
+		}
+
+		if (!dir)
+			error = xfs_btree_increment(*scur, 0, &i);
+		else
+			error = xfs_btree_decrement(*scur, 0, &i);
+		if (error)
+			goto error0;
+	} while (i);
+
+out_use_good:
+	xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
+	*scur = NULL;
+	return 0;
+
+out_use_search:
+	xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
+	*gcur = NULL;
+	return 0;
+
+error0:
+	/* caller invalidates cursors */
 	return error;
 }
 
@@ -739,9 +854,6 @@ xfs_alloc_ag_vextent_near(
 	xfs_btree_cur_t	*bno_cur_gt;	/* cursor for bno btree, right side */
 	xfs_btree_cur_t	*bno_cur_lt;	/* cursor for bno btree, left side */
 	xfs_btree_cur_t	*cnt_cur;	/* cursor for count btree */
-#ifdef XFS_ALLOC_TRACE
-	static char	fname[] = "xfs_alloc_ag_vextent_near";
-#endif
 	xfs_agblock_t	gtbno;		/* start bno of right side entry */
 	xfs_agblock_t	gtbnoa;		/* aligned ... */
 	xfs_extlen_t	gtdiff;		/* difference to right side entry */
@@ -754,27 +866,33 @@ xfs_alloc_ag_vextent_near(
 	xfs_agblock_t	ltbno;		/* start bno of left side entry */
 	xfs_agblock_t	ltbnoa;		/* aligned ... */
 	xfs_extlen_t	ltdiff;		/* difference to left side entry */
-	/*REFERENCED*/
-	xfs_agblock_t	ltend;		/* end bno of left side entry */
 	xfs_extlen_t	ltlen;		/* length of left side entry */
 	xfs_extlen_t	ltlena;		/* aligned ... */
 	xfs_agblock_t	ltnew;		/* useful start bno of left side */
 	xfs_extlen_t	rlen;		/* length of returned extent */
-#if defined(DEBUG) && defined(__KERNEL__)
+	int		forced = 0;
+#ifdef DEBUG
 	/*
 	 * Randomly don't execute the first algorithm.
 	 */
 	int		dofirst;	/* set to do first algorithm */
 
-	dofirst = random() & 1;
+	dofirst = prandom_u32() & 1;
 #endif
+
+restart:
+	bno_cur_lt = NULL;
+	bno_cur_gt = NULL;
+	ltlen = 0;
+	gtlena = 0;
+	ltlena = 0;
+
 	/*
 	 * Get a cursor for the by-size btree.
 	 */
-	cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-		args->agno, XFS_BTNUM_CNT, NULL, 0);
-	ltlen = 0;
-	bno_cur_lt = bno_cur_gt = NULL;
+	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT);
+
 	/*
 	 * See if there are any free extents as big as maxlen.
 	 */
@@ -790,11 +908,13 @@ xfs_alloc_ag_vextent_near(
 			goto error0;
 		if (i == 0 || ltlen == 0) {
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			trace_xfs_alloc_near_noentry(args);
 			return 0;
 		}
 		ASSERT(i == 1);
 	}
 	args->wasfromfl = 0;
+
 	/*
 	 * First algorithm.
 	 * If the requested extent is large wrt the freespaces available
@@ -811,8 +931,8 @@ xfs_alloc_ag_vextent_near(
 		xfs_extlen_t	blen=0;
 		xfs_agblock_t	bnew=0;
 
-#if defined(DEBUG) && defined(__KERNEL__)
-		if (!dofirst)
+#ifdef DEBUG
+		if (dofirst)
 			break;
 #endif
 		/*
@@ -830,7 +950,7 @@ xfs_alloc_ag_vextent_near(
 				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 				if (ltlen >= args->minlen)
 					break;
-				if ((error = xfs_alloc_increment(cnt_cur, 0, &i)))
+				if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
 					goto error0;
 			} while (i);
 			ASSERT(ltlen >= args->minlen);
@@ -840,7 +960,7 @@ xfs_alloc_ag_vextent_near(
 		i = cnt_cur->bc_ptrs[0];
 		for (j = 1, blen = 0, bdiff = 0;
 		     !error && j && (blen < args->maxlen || bdiff > 0);
-		     error = xfs_alloc_increment(cnt_cur, 0, &j)) {
+		     error = xfs_btree_increment(cnt_cur, 0, &j)) {
 			/*
 			 * For each entry, decide if it's better than
 			 * the previous best entry.
@@ -848,9 +968,9 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			if (!xfs_alloc_compute_aligned(ltbno, ltlen,
-					args->alignment, args->minlen,
-					&ltbnoa, &ltlena))
+			xfs_alloc_compute_aligned(args, ltbno, ltlen,
+						  &ltbnoa, &ltlena);
+			if (ltlena < args->minlen)
 				continue;
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 			xfs_alloc_fix_len(args);
@@ -858,7 +978,8 @@ xfs_alloc_ag_vextent_near(
 			if (args->len < blen)
 				continue;
 			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, ltbno, ltlen, &ltnew);
+				args->alignment, args->userdata, ltbnoa,
+				ltlena, &ltnew);
 			if (ltnew != NULLAGBLOCK &&
 			    (args->len > blen || ltdiff < bdiff)) {
 				bdiff = ltdiff;
@@ -880,12 +1001,11 @@ xfs_alloc_ag_vextent_near(
 		if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		ltend = ltbno + ltlen;
-		ASSERT(ltend <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+		ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
 		args->len = blen;
 		if (!xfs_alloc_fix_minleft(args)) {
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-			TRACE_ALLOC("nominleft", args);
+			trace_xfs_alloc_near_nominleft(args);
 			return 0;
 		}
 		blen = args->len;
@@ -894,12 +1014,12 @@ xfs_alloc_ag_vextent_near(
 		 */
 		args->agbno = bnew;
 		ASSERT(bnew >= ltbno);
-		ASSERT(bnew + blen <= ltend);
+		ASSERT(bnew + blen <= ltbno + ltlen);
 		/*
 		 * Set up a cursor for the by-bno tree.
 		 */
-		bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp,
-			args->agbp, args->agno, XFS_BTNUM_BNO, NULL, 0);
+		bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
+			args->agbp, args->agno, XFS_BTNUM_BNO);
 		/*
 		 * Fix up the btree entries.
 		 */
@@ -908,7 +1028,8 @@ xfs_alloc_ag_vextent_near(
 			goto error0;
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-		TRACE_ALLOC("first", args);
+
+		trace_xfs_alloc_near_first(args);
 		return 0;
 	}
 	/*
@@ -926,8 +1047,8 @@ xfs_alloc_ag_vextent_near(
 	/*
 	 * Allocate and initialize the cursor for the leftward search.
 	 */
-	bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-		args->agno, XFS_BTNUM_BNO, NULL, 0);
+	bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_BNO);
 	/*
 	 * Lookup <= bno to find the leftward search's starting point.
 	 */
@@ -950,7 +1071,7 @@ xfs_alloc_ag_vextent_near(
 	 * Increment the cursor, so we will point at the entry just right
 	 * of the leftward entry if any, or to the leftmost entry.
 	 */
-	if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
+	if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
 		goto error0;
 	if (!i) {
 		/*
@@ -969,11 +1090,11 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			if (xfs_alloc_compute_aligned(ltbno, ltlen,
-					args->alignment, args->minlen,
-					&ltbnoa, &ltlena))
+			xfs_alloc_compute_aligned(args, ltbno, ltlen,
+						  &ltbnoa, &ltlena);
+			if (ltlena >= args->minlen)
 				break;
-			if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i)))
+			if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
 				goto error0;
 			if (!i) {
 				xfs_btree_del_cursor(bno_cur_lt,
@@ -985,11 +1106,11 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			if (xfs_alloc_compute_aligned(gtbno, gtlen,
-					args->alignment, args->minlen,
-					&gtbnoa, &gtlena))
+			xfs_alloc_compute_aligned(args, gtbno, gtlen,
+						  &gtbnoa, &gtlena);
+			if (gtlena >= args->minlen)
 				break;
-			if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
+			if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
 				goto error0;
 			if (!i) {
 				xfs_btree_del_cursor(bno_cur_gt,
@@ -998,211 +1119,65 @@ xfs_alloc_ag_vextent_near(
 			}
 		}
 	} while (bno_cur_lt || bno_cur_gt);
+
 	/*
 	 * Got both cursors still active, need to find better entry.
 	 */
 	if (bno_cur_lt && bno_cur_gt) {
-		/*
-		 * Left side is long enough, look for a right side entry.
-		 */
 		if (ltlena >= args->minlen) {
 			/*
-			 * Fix up the length.
+			 * Left side is good, look for a right side entry.
 			 */
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 			xfs_alloc_fix_len(args);
-			rlen = args->len;
-			ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
-				args->alignment, ltbno, ltlen, &ltnew);
-			/*
-			 * Not perfect.
-			 */
-			if (ltdiff) {
-				/*
-				 * Look until we find a better one, run out of
-				 * space, or run off the end.
-				 */
-				while (bno_cur_lt && bno_cur_gt) {
-					if ((error = xfs_alloc_get_rec(
-							bno_cur_gt, &gtbno,
-							&gtlen, &i)))
-						goto error0;
-					XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-					xfs_alloc_compute_aligned(gtbno, gtlen,
-						args->alignment, args->minlen,
-						&gtbnoa, &gtlena);
-					/*
-					 * The left one is clearly better.
-					 */
-					if (gtbnoa >= args->agbno + ltdiff) {
-						xfs_btree_del_cursor(
-							bno_cur_gt,
-							XFS_BTREE_NOERROR);
-						bno_cur_gt = NULL;
-						break;
-					}
-					/*
-					 * If we reach a big enough entry,
-					 * compare the two and pick the best.
-					 */
-					if (gtlena >= args->minlen) {
-						args->len =
-							XFS_EXTLEN_MIN(gtlena,
-								args->maxlen);
-						xfs_alloc_fix_len(args);
-						rlen = args->len;
-						gtdiff = xfs_alloc_compute_diff(
-							args->agbno, rlen,
-							args->alignment,
-							gtbno, gtlen, &gtnew);
-						/*
-						 * Right side is better.
-						 */
-						if (gtdiff < ltdiff) {
-							xfs_btree_del_cursor(
-								bno_cur_lt,
-								XFS_BTREE_NOERROR);
-							bno_cur_lt = NULL;
-						}
-						/*
-						 * Left side is better.
-						 */
-						else {
-							xfs_btree_del_cursor(
-								bno_cur_gt,
-								XFS_BTREE_NOERROR);
-							bno_cur_gt = NULL;
-						}
-						break;
-					}
-					/*
-					 * Fell off the right end.
-					 */
-					if ((error = xfs_alloc_increment(
-							bno_cur_gt, 0, &i)))
-						goto error0;
-					if (!i) {
-						xfs_btree_del_cursor(
-							bno_cur_gt,
-							XFS_BTREE_NOERROR);
-						bno_cur_gt = NULL;
-						break;
-					}
-				}
-			}
-			/*
-			 * The left side is perfect, trash the right side.
-			 */
-			else {
-				xfs_btree_del_cursor(bno_cur_gt,
-						     XFS_BTREE_NOERROR);
-				bno_cur_gt = NULL;
-			}
-		}
-		/*
-		 * It's the right side that was found first, look left.
-		 */
-		else {
+			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+				args->alignment, args->userdata, ltbnoa,
+				ltlena, &ltnew);
+
+			error = xfs_alloc_find_best_extent(args,
+						&bno_cur_lt, &bno_cur_gt,
+						ltdiff, &gtbno, &gtlen,
+						&gtbnoa, &gtlena,
+						0 /* search right */);
+		} else {
+			ASSERT(gtlena >= args->minlen);
+
 			/*
-			 * Fix up the length.
+			 * Right side is good, look for a left side entry.
 			 */
 			args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
 			xfs_alloc_fix_len(args);
-			rlen = args->len;
-			gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
-				args->alignment, gtbno, gtlen, &gtnew);
-			/*
-			 * Right side entry isn't perfect.
-			 */
-			if (gtdiff) {
-				/*
-				 * Look until we find a better one, run out of
-				 * space, or run off the end.
-				 */
-				while (bno_cur_lt && bno_cur_gt) {
-					if ((error = xfs_alloc_get_rec(
-							bno_cur_lt, &ltbno,
-							&ltlen, &i)))
-						goto error0;
-					XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-					xfs_alloc_compute_aligned(ltbno, ltlen,
-						args->alignment, args->minlen,
-						&ltbnoa, &ltlena);
-					/*
-					 * The right one is clearly better.
-					 */
-					if (ltbnoa <= args->agbno - gtdiff) {
-						xfs_btree_del_cursor(
-							bno_cur_lt,
-							XFS_BTREE_NOERROR);
-						bno_cur_lt = NULL;
-						break;
-					}
-					/*
-					 * If we reach a big enough entry,
-					 * compare the two and pick the best.
-					 */
-					if (ltlena >= args->minlen) {
-						args->len = XFS_EXTLEN_MIN(
-							ltlena, args->maxlen);
-						xfs_alloc_fix_len(args);
-						rlen = args->len;
-						ltdiff = xfs_alloc_compute_diff(
-							args->agbno, rlen,
-							args->alignment,
-							ltbno, ltlen, &ltnew);
-						/*
-						 * Left side is better.
-						 */
-						if (ltdiff < gtdiff) {
-							xfs_btree_del_cursor(
-								bno_cur_gt,
-								XFS_BTREE_NOERROR);
-							bno_cur_gt = NULL;
-						}
-						/*
-						 * Right side is better.
-						 */
-						else {
-							xfs_btree_del_cursor(
-								bno_cur_lt,
-								XFS_BTREE_NOERROR);
-							bno_cur_lt = NULL;
-						}
-						break;
-					}
-					/*
-					 * Fell off the left end.
-					 */
-					if ((error = xfs_alloc_decrement(
-							bno_cur_lt, 0, &i)))
-						goto error0;
-					if (!i) {
-						xfs_btree_del_cursor(bno_cur_lt,
-							XFS_BTREE_NOERROR);
-						bno_cur_lt = NULL;
-						break;
-					}
-				}
-			}
-			/*
-			 * The right side is perfect, trash the left side.
-			 */
-			else {
-				xfs_btree_del_cursor(bno_cur_lt,
-					XFS_BTREE_NOERROR);
-				bno_cur_lt = NULL;
-			}
+			gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+				args->alignment, args->userdata, gtbnoa,
+				gtlena, &gtnew);
+
+			error = xfs_alloc_find_best_extent(args,
+						&bno_cur_gt, &bno_cur_lt,
+						gtdiff, &ltbno, &ltlen,
+						&ltbnoa, &ltlena,
+						1 /* search left */);
 		}
+
+		if (error)
+			goto error0;
 	}
+
 	/*
 	 * If we couldn't get anything, give up.
 	 */
 	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
-		TRACE_ALLOC("neither", args);
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+
+		if (!forced++) {
+			trace_xfs_alloc_near_busy(args);
+			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			goto restart;
+		}
+		trace_xfs_alloc_size_neither(args);
 		args->agbno = NULLAGBLOCK;
 		return 0;
 	}
+
 	/*
 	 * At this point we have selected a freespace entry, either to the
 	 * left or to the right.  If it's on the right, copy all the
@@ -1219,35 +1194,41 @@ xfs_alloc_ag_vextent_near(
 		j = 1;
 	} else
 		j = 0;
+
 	/*
 	 * Fix up the length and compute the useful address.
 	 */
-	ltend = ltbno + ltlen;
 	args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 	xfs_alloc_fix_len(args);
 	if (!xfs_alloc_fix_minleft(args)) {
-		TRACE_ALLOC("nominleft", args);
+		trace_xfs_alloc_near_nominleft(args);
 		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 		return 0;
 	}
 	rlen = args->len;
-	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
-		ltlen, &ltnew);
+	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
+				     args->userdata, ltbnoa, ltlena, &ltnew);
 	ASSERT(ltnew >= ltbno);
-	ASSERT(ltnew + rlen <= ltend);
+	ASSERT(ltnew + rlen <= ltbnoa + ltlena);
 	ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
 	args->agbno = ltnew;
+
 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
 			ltnew, rlen, XFSA_FIXUP_BNO_OK)))
 		goto error0;
-	TRACE_ALLOC(j ? "gt" : "lt", args);
+
+	if (j)
+		trace_xfs_alloc_near_greater(args);
+	else
+		trace_xfs_alloc_near_lesser(args);
+
 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 	xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
 	return 0;
 
  error0:
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_near_error(args);
 	if (cnt_cur != NULL)
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
 	if (bno_cur_lt != NULL)
@@ -1272,56 +1253,95 @@ xfs_alloc_ag_vextent_size(
 	int		error;		/* error result */
 	xfs_agblock_t	fbno;		/* start of found freespace */
 	xfs_extlen_t	flen;		/* length of found freespace */
-#ifdef XFS_ALLOC_TRACE
-	static char	fname[] = "xfs_alloc_ag_vextent_size";
-#endif
 	int		i;		/* temp status variable */
 	xfs_agblock_t	rbno;		/* returned block number */
 	xfs_extlen_t	rlen;		/* length of returned extent */
+	int		forced = 0;
 
+restart:
 	/*
 	 * Allocate and initialize a cursor for the by-size btree.
 	 */
-	cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-		args->agno, XFS_BTNUM_CNT, NULL, 0);
+	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT);
 	bno_cur = NULL;
+
 	/*
 	 * Look for an entry >= maxlen+alignment-1 blocks.
 	 */
 	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
 			args->maxlen + args->alignment - 1, &i)))
 		goto error0;
+
 	/*
-	 * If none, then pick up the last entry in the tree unless the
-	 * tree is empty.
+	 * If none or we have busy extents that we cannot allocate from, then
+	 * we have to settle for a smaller extent. In the case that there are
+	 * no large extents, this will return the last entry in the tree unless
+	 * the tree is empty. In the case that there are only busy large
+	 * extents, this will return the largest small extent unless there
+	 * are no smaller extents available.
 	 */
-	if (!i) {
-		if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno,
-				&flen, &i)))
+	if (!i || forced > 1) {
+		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
+						   &fbno, &flen, &i);
+		if (error)
 			goto error0;
 		if (i == 0 || flen == 0) {
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-			TRACE_ALLOC("noentry", args);
+			trace_xfs_alloc_size_noentry(args);
 			return 0;
 		}
 		ASSERT(i == 1);
+		xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+	} else {
+		/*
+		 * Search for a non-busy extent that is large enough.
+		 * If we are at low space, don't check, or if we fall of
+		 * the end of the btree, turn off the busy check and
+		 * restart.
+		 */
+		for (;;) {
+			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
+			if (error)
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+			xfs_alloc_compute_aligned(args, fbno, flen,
+						  &rbno, &rlen);
+
+			if (rlen >= args->maxlen)
+				break;
+
+			error = xfs_btree_increment(cnt_cur, 0, &i);
+			if (error)
+				goto error0;
+			if (i == 0) {
+				/*
+				 * Our only valid extents must have been busy.
+				 * Make it unbusy by forcing the log out and
+				 * retrying. If we've been here before, forcing
+				 * the log isn't making the extents available,
+				 * which means they have probably been freed in
+				 * this transaction.  In that case, we have to
+				 * give up on them and we'll attempt a minlen
+				 * allocation the next time around.
+				 */
+				xfs_btree_del_cursor(cnt_cur,
+						     XFS_BTREE_NOERROR);
+				trace_xfs_alloc_size_busy(args);
+				if (!forced++)
+					xfs_log_force(args->mp, XFS_LOG_SYNC);
+				goto restart;
+			}
+		}
 	}
-	/*
-	 * There's a freespace as big as maxlen+alignment-1, get it.
-	 */
-	else {
-		if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-	}
+
 	/*
 	 * In the first case above, we got the last entry in the
 	 * by-size btree.  Now we check to see if the space hits maxlen
 	 * once aligned; if not, we search left for something better.
 	 * This can't happen in the second case above.
 	 */
-	xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen,
-		&rbno, &rlen);
 	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
 	XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
 			(rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1336,7 +1356,7 @@ xfs_alloc_ag_vextent_size(
 		bestflen = flen;
 		bestfbno = fbno;
 		for (;;) {
-			if ((error = xfs_alloc_decrement(cnt_cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
 				goto error0;
 			if (i == 0)
 				break;
@@ -1346,8 +1366,8 @@ xfs_alloc_ag_vextent_size(
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 			if (flen < bestrlen)
 				break;
-			xfs_alloc_compute_aligned(fbno, flen, args->alignment,
-				args->minlen, &rbno, &rlen);
+			xfs_alloc_compute_aligned(args, fbno, flen,
+						  &rbno, &rlen);
 			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
 			XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
 				(rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1375,20 +1395,26 @@ xfs_alloc_ag_vextent_size(
 	 * Fix up the length.
 	 */
 	args->len = rlen;
-	xfs_alloc_fix_len(args);
-	if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) {
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-		TRACE_ALLOC("nominleft", args);
-		args->agbno = NULLAGBLOCK;
-		return 0;
+	if (rlen < args->minlen) {
+		if (!forced++) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			trace_xfs_alloc_size_busy(args);
+			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			goto restart;
+		}
+		goto out_nominleft;
 	}
+	xfs_alloc_fix_len(args);
+
+	if (!xfs_alloc_fix_minleft(args))
+		goto out_nominleft;
 	rlen = args->len;
 	XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
 	/*
 	 * Allocate and initialize a cursor for the by-block tree.
 	 */
-	bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-		args->agno, XFS_BTNUM_BNO, NULL, 0);
+	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_BNO);
 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
 			rbno, rlen, XFSA_FIXUP_CNT_OK)))
 		goto error0;
@@ -1401,16 +1427,22 @@ xfs_alloc_ag_vextent_size(
 		args->agbno + args->len <=
 			be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
 		error0);
-	TRACE_ALLOC("normal", args);
+	trace_xfs_alloc_size_done(args);
 	return 0;
 
 error0:
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_size_error(args);
 	if (cnt_cur)
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
 	if (bno_cur)
 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
 	return error;
+
+out_nominleft:
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	trace_xfs_alloc_size_nominleft(args);
+	args->agbno = NULLAGBLOCK;
+	return 0;
 }
 
 /*
@@ -1429,12 +1461,9 @@ xfs_alloc_ag_vextent_small(
 	int		error;
 	xfs_agblock_t	fbno;
 	xfs_extlen_t	flen;
-#ifdef XFS_ALLOC_TRACE
-	static char	fname[] = "xfs_alloc_ag_vextent_small";
-#endif
 	int		i;
 
-	if ((error = xfs_alloc_decrement(ccur, 0, &i)))
+	if ((error = xfs_btree_decrement(ccur, 0, &i)))
 		goto error0;
 	if (i) {
 		if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
@@ -1449,9 +1478,13 @@ xfs_alloc_ag_vextent_small(
 	else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
 		 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
 		  > args->minleft)) {
-		if ((error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno)))
+		error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+		if (error)
 			goto error0;
 		if (fbno != NULLAGBLOCK) {
+			xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
+					     args->userdata);
+
 			if (args->userdata) {
 				xfs_buf_t	*bp;
 
@@ -1466,7 +1499,7 @@ xfs_alloc_ag_vextent_small(
 				be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
 				error0);
 			args->wasfromfl = 1;
-			TRACE_ALLOC("freelist", args);
+			trace_xfs_alloc_small_freelist(args);
 			*stat = 0;
 			return 0;
 		}
@@ -1479,24 +1512,26 @@ xfs_alloc_ag_vextent_small(
 	/*
 	 * Can't allocate from the freelist for some reason.
 	 */
-	else
+	else {
+		fbno = NULLAGBLOCK;
 		flen = 0;
+	}
 	/*
 	 * Can't do the allocation, give up.
 	 */
 	if (flen < args->minlen) {
 		args->agbno = NULLAGBLOCK;
-		TRACE_ALLOC("notenough", args);
+		trace_xfs_alloc_small_notenough(args);
 		flen = 0;
 	}
 	*fbnop = fbno;
 	*flenp = flen;
 	*stat = 1;
-	TRACE_ALLOC("normal", args);
+	trace_xfs_alloc_small_done(args);
 	return 0;
 
 error0:
-	TRACE_ALLOC("error", args);
+	trace_xfs_alloc_small_error(args);
 	return error;
 }
 
@@ -1515,9 +1550,6 @@ xfs_free_ag_extent(
 	xfs_btree_cur_t	*bno_cur;	/* cursor for by-block btree */
 	xfs_btree_cur_t	*cnt_cur;	/* cursor for by-size btree */
 	int		error;		/* error return value */
-#ifdef XFS_ALLOC_TRACE
-	static char	fname[] = "xfs_free_ag_extent";
-#endif
 	xfs_agblock_t	gtbno;		/* start of right neighbor block */
 	xfs_extlen_t	gtlen;		/* length of right neighbor block */
 	int		haveleft;	/* have a left neighbor block */
@@ -1528,13 +1560,13 @@ xfs_free_ag_extent(
 	xfs_mount_t	*mp;		/* mount point struct for filesystem */
 	xfs_agblock_t	nbno;		/* new starting block of freespace */
 	xfs_extlen_t	nlen;		/* new length of freespace */
+	xfs_perag_t	*pag;		/* per allocation group data */
 
 	mp = tp->t_mountp;
 	/*
 	 * Allocate and initialize a cursor for the by-block btree.
 	 */
-	bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, NULL,
-		0);
+	bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
 	cnt_cur = NULL;
 	/*
 	 * Look for a neighboring block on the left (lower block numbers)
@@ -1567,7 +1599,7 @@ xfs_free_ag_extent(
 	 * Look for a neighboring block on the right (higher block numbers)
 	 * that is contiguous with this space.
 	 */
-	if ((error = xfs_alloc_increment(bno_cur, 0, &haveright)))
+	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
 		goto error0;
 	if (haveright) {
 		/*
@@ -1593,8 +1625,7 @@ xfs_free_ag_extent(
 	/*
 	 * Now allocate and initialize a cursor for the by-size tree.
 	 */
-	cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, NULL,
-		0);
+	cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
 	/*
 	 * Have both left and right contiguous neighbors.
 	 * Merge all three into a single free block.
@@ -1606,7 +1637,7 @@ xfs_free_ag_extent(
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		if ((error = xfs_alloc_delete(cnt_cur, &i)))
+		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 		/*
@@ -1615,19 +1646,19 @@ xfs_free_ag_extent(
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		if ((error = xfs_alloc_delete(cnt_cur, &i)))
+		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 		/*
 		 * Delete the old by-block entry for the right block.
 		 */
-		if ((error = xfs_alloc_delete(bno_cur, &i)))
+		if ((error = xfs_btree_delete(bno_cur, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 		/*
 		 * Move the by-block cursor back to the left neighbor.
 		 */
-		if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
+		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 #ifdef DEBUG
@@ -1666,14 +1697,14 @@ xfs_free_ag_extent(
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		if ((error = xfs_alloc_delete(cnt_cur, &i)))
+		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 		/*
 		 * Back up the by-block cursor to the left neighbor, and
 		 * update its length.
 		 */
-		if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
+		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 		nbno = ltbno;
@@ -1692,7 +1723,7 @@ xfs_free_ag_extent(
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		if ((error = xfs_alloc_delete(cnt_cur, &i)))
+		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 		/*
@@ -1711,7 +1742,7 @@ xfs_free_ag_extent(
 	else {
 		nbno = bno;
 		nlen = len;
-		if ((error = xfs_alloc_insert(bno_cur, &i)))
+		if ((error = xfs_btree_insert(bno_cur, &i)))
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 	}
@@ -1723,55 +1754,32 @@ xfs_free_ag_extent(
 	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
 		goto error0;
 	XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
-	if ((error = xfs_alloc_insert(cnt_cur, &i)))
+	if ((error = xfs_btree_insert(cnt_cur, &i)))
 		goto error0;
 	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 	cnt_cur = NULL;
+
 	/*
 	 * Update the freespace totals in the ag and superblock.
 	 */
-	{
-		xfs_agf_t	*agf;
-		xfs_perag_t	*pag;		/* per allocation group data */
-
-		agf = XFS_BUF_TO_AGF(agbp);
-		pag = &mp->m_perag[agno];
-		be32_add(&agf->agf_freeblks, len);
-		xfs_trans_agblocks_delta(tp, len);
-		pag->pagf_freeblks += len;
-		XFS_WANT_CORRUPTED_GOTO(
-			be32_to_cpu(agf->agf_freeblks) <=
-			be32_to_cpu(agf->agf_length),
-			error0);
-		TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
-		xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
-		if (!isfl)
-			xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
-		XFS_STATS_INC(xs_freex);
-		XFS_STATS_ADD(xs_freeb, len);
-	}
-	TRACE_FREE(haveleft ?
-			(haveright ? "both" : "left") :
-			(haveright ? "right" : "none"),
-		agno, bno, len, isfl);
-
-	/*
-	 * Since blocks move to the free list without the coordination
-	 * used in xfs_bmap_finish, we can't allow block to be available
-	 * for reallocation and non-transaction writing (user data)
-	 * until we know that the transaction that moved it to the free
-	 * list is permanently on disk.  We track the blocks by declaring
-	 * these blocks as "busy"; the busy list is maintained on a per-ag
-	 * basis and each transaction records which entries should be removed
-	 * when the iclog commits to disk.  If a busy block is allocated,
-	 * the iclog is pushed up to the LSN that freed the block.
-	 */
-	xfs_alloc_mark_busy(tp, agno, bno, len);
+	pag = xfs_perag_get(mp, agno);
+	error = xfs_alloc_update_counters(tp, pag, agbp, len);
+	xfs_perag_put(pag);
+	if (error)
+		goto error0;
+
+	if (!isfl)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
+	XFS_STATS_INC(xs_freex);
+	XFS_STATS_ADD(xs_freeb, len);
+
+	trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
+
 	return 0;
 
  error0:
-	TRACE_FREE("error", agno, bno, len, isfl);
+	trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1);
 	if (bno_cur)
 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
 	if (cnt_cur)
@@ -1807,6 +1815,25 @@ xfs_alloc_compute_maxlevels(
 }
 
 /*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag)
+{
+	xfs_extlen_t		need, delta = 0;
+
+	need = XFS_MIN_FREELIST_PAG(pag, mp);
+	if (need > pag->pagf_flcount)
+		delta = need - pag->pagf_flcount;
+
+	if (pag->pagf_longest > delta)
+		return pag->pagf_longest - delta;
+	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
+}
+
+/*
  * Decide whether to use this allocation group for this allocation.
  * If so, fix up the btree freelist's size.
  */
@@ -1837,40 +1864,44 @@ xfs_alloc_fix_freelist(
 				&agbp)))
 			return error;
 		if (!pag->pagf_init) {
+			ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
+			ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
 			args->agbp = NULL;
 			return 0;
 		}
 	} else
 		agbp = NULL;
 
-	/* If this is a metadata prefered pag and we are user data
+	/*
+	 * If this is a metadata preferred pag and we are user data
 	 * then try somewhere else if we are not being asked to
 	 * try harder at this point
 	 */
-	if (pag->pagf_metadata && args->userdata && flags) {
+	if (pag->pagf_metadata && args->userdata &&
+	    (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
+		ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
 		args->agbp = NULL;
 		return 0;
 	}
 
-	need = XFS_MIN_FREELIST_PAG(pag, mp);
-	delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
-	/*
-	 * If it looks like there isn't a long enough extent, or enough
-	 * total blocks, reject it.
-	 */
-	longest = (pag->pagf_longest > delta) ?
-		(pag->pagf_longest - delta) :
-		(pag->pagf_flcount > 0 || pag->pagf_longest > 0);
-	if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
-	    (args->minleft &&
-	     (int)(pag->pagf_freeblks + pag->pagf_flcount -
-		   need - args->total) <
-	     (int)args->minleft)) {
-		if (agbp)
-			xfs_trans_brelse(tp, agbp);
-		args->agbp = NULL;
-		return 0;
+	if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+		/*
+		 * If it looks like there isn't a long enough extent, or enough
+		 * total blocks, reject it.
+		 */
+		need = XFS_MIN_FREELIST_PAG(pag, mp);
+		longest = xfs_alloc_longest_free_extent(mp, pag);
+		if ((args->minlen + args->alignment + args->minalignslop - 1) >
+				longest ||
+		    ((int)(pag->pagf_freeblks + pag->pagf_flcount -
+			   need - args->total) < (int)args->minleft)) {
+			if (agbp)
+				xfs_trans_brelse(tp, agbp);
+			args->agbp = NULL;
+			return 0;
+		}
 	}
+
 	/*
 	 * Get the a.g. freespace buffer.
 	 * Can fail if we're not blocking on locks, and it's held.
@@ -1880,6 +1911,8 @@ xfs_alloc_fix_freelist(
 				&agbp)))
 			return error;
 		if (agbp == NULL) {
+			ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
+			ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
 			args->agbp = NULL;
 			return 0;
 		}
@@ -1889,22 +1922,24 @@ xfs_alloc_fix_freelist(
 	 */
 	agf = XFS_BUF_TO_AGF(agbp);
 	need = XFS_MIN_FREELIST(agf, mp);
-	delta = need > be32_to_cpu(agf->agf_flcount) ?
-		(need - be32_to_cpu(agf->agf_flcount)) : 0;
 	/*
 	 * If there isn't enough total or single-extent, reject it.
 	 */
-	longest = be32_to_cpu(agf->agf_longest);
-	longest = (longest > delta) ? (longest - delta) :
-		(be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
-	if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
-	     (args->minleft &&
-		(int)(be32_to_cpu(agf->agf_freeblks) +
-		   be32_to_cpu(agf->agf_flcount) - need - args->total) <
-	     (int)args->minleft)) {
-		xfs_trans_brelse(tp, agbp);
-		args->agbp = NULL;
-		return 0;
+	if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+		delta = need > be32_to_cpu(agf->agf_flcount) ?
+			(need - be32_to_cpu(agf->agf_flcount)) : 0;
+		longest = be32_to_cpu(agf->agf_longest);
+		longest = (longest > delta) ? (longest - delta) :
+			(be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
+		if ((args->minlen + args->alignment + args->minalignslop - 1) >
+				longest ||
+		    ((int)(be32_to_cpu(agf->agf_freeblks) +
+		     be32_to_cpu(agf->agf_flcount) - need - args->total) <
+				(int)args->minleft)) {
+			xfs_trans_brelse(tp, agbp);
+			args->agbp = NULL;
+			return 0;
+		}
 	}
 	/*
 	 * Make the freelist shorter if it's too long.
@@ -1912,7 +1947,8 @@ xfs_alloc_fix_freelist(
 	while (be32_to_cpu(agf->agf_flcount) > need) {
 		xfs_buf_t	*bp;
 
-		if ((error = xfs_alloc_get_freelist(tp, agbp, &bno)))
+		error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
+		if (error)
 			return error;
 		if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
 			return error;
@@ -1922,12 +1958,11 @@ xfs_alloc_fix_freelist(
 	/*
 	 * Initialize the args structure.
 	 */
+	memset(&targs, 0, sizeof(targs));
 	targs.tp = tp;
 	targs.mp = mp;
 	targs.agbp = agbp;
 	targs.agno = args->agno;
-	targs.mod = targs.minleft = targs.wasdel = targs.userdata =
-		targs.minalignslop = 0;
 	targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
 	targs.type = XFS_ALLOCTYPE_THIS_AG;
 	targs.pag = pag;
@@ -1942,24 +1977,33 @@ xfs_alloc_fix_freelist(
 		/*
 		 * Allocate as many blocks as possible at once.
 		 */
-		if ((error = xfs_alloc_ag_vextent(&targs)))
+		if ((error = xfs_alloc_ag_vextent(&targs))) {
+			xfs_trans_brelse(tp, agflbp);
 			return error;
+		}
 		/*
 		 * Stop if we run out.  Won't happen if callers are obeying
 		 * the restrictions correctly.  Can happen for free calls
 		 * on a completely full ag.
 		 */
-		if (targs.agbno == NULLAGBLOCK)
-			break;
+		if (targs.agbno == NULLAGBLOCK) {
+			if (flags & XFS_ALLOC_FLAG_FREEING)
+				break;
+			xfs_trans_brelse(tp, agflbp);
+			args->agbp = NULL;
+			return 0;
+		}
 		/*
 		 * Put each allocated block on the list.
 		 */
 		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
-			if ((error = xfs_alloc_put_freelist(tp, agbp, agflbp,
-					bno)))
+			error = xfs_alloc_put_freelist(tp, agbp,
+							agflbp, bno, 0);
+			if (error)
 				return error;
 		}
 	}
+	xfs_trans_brelse(tp, agflbp);
 	args->agbp = agbp;
 	return 0;
 }
@@ -1972,23 +2016,22 @@ int				/* error */
 xfs_alloc_get_freelist(
 	xfs_trans_t	*tp,	/* transaction pointer */
 	xfs_buf_t	*agbp,	/* buffer containing the agf structure */
-	xfs_agblock_t	*bnop)	/* block address retrieved from freelist */
+	xfs_agblock_t	*bnop,	/* block address retrieved from freelist */
+	int		btreeblk) /* destination is a AGF btree */
 {
 	xfs_agf_t	*agf;	/* a.g. freespace structure */
-	xfs_agfl_t	*agfl;	/* a.g. freelist structure */
 	xfs_buf_t	*agflbp;/* buffer for a.g. freelist structure */
 	xfs_agblock_t	bno;	/* block number returned */
+	__be32		*agfl_bno;
 	int		error;
-#ifdef XFS_ALLOC_TRACE
-	static char	fname[] = "xfs_alloc_get_freelist";
-#endif
-	xfs_mount_t	*mp;	/* mount structure */
+	int		logflags;
+	xfs_mount_t	*mp = tp->t_mountp;
 	xfs_perag_t	*pag;	/* per allocation group data */
 
-	agf = XFS_BUF_TO_AGF(agbp);
 	/*
 	 * Freelist is empty, give up.
 	 */
+	agf = XFS_BUF_TO_AGF(agbp);
 	if (!agf->agf_flcount) {
 		*bnop = NULLAGBLOCK;
 		return 0;
@@ -1996,36 +2039,38 @@ xfs_alloc_get_freelist(
 	/*
 	 * Read the array of free blocks.
 	 */
-	mp = tp->t_mountp;
-	if ((error = xfs_alloc_read_agfl(mp, tp,
-			be32_to_cpu(agf->agf_seqno), &agflbp)))
+	error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
+				    &agflbp);
+	if (error)
 		return error;
-	agfl = XFS_BUF_TO_AGFL(agflbp);
+
+
 	/*
 	 * Get the block number and update the data structures.
 	 */
-	bno = INT_GET(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)], ARCH_CONVERT);
-	be32_add(&agf->agf_flfirst, 1);
+	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+	be32_add_cpu(&agf->agf_flfirst, 1);
 	xfs_trans_brelse(tp, agflbp);
 	if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
 		agf->agf_flfirst = 0;
-	pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
-	be32_add(&agf->agf_flcount, -1);
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+	be32_add_cpu(&agf->agf_flcount, -1);
 	xfs_trans_agflist_delta(tp, -1);
 	pag->pagf_flcount--;
-	TRACE_MODAGF(NULL, agf, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
-	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
+	xfs_perag_put(pag);
+
+	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
+	if (btreeblk) {
+		be32_add_cpu(&agf->agf_btreeblks, 1);
+		pag->pagf_btreeblks++;
+		logflags |= XFS_AGF_BTREEBLKS;
+	}
+
+	xfs_alloc_log_agf(tp, agbp, logflags);
 	*bnop = bno;
 
-	/*
-	 * As blocks are freed, they are added to the per-ag busy list
-	 * and remain there until the freeing transaction is committed to
-	 * disk.  Now that we have allocated blocks, this list must be
-	 * searched to see if a block is being reused.  If one is, then
-	 * the freeing transaction must be pushed to disk NOW by forcing
-	 * to disk all iclogs up that transaction's LSN.
-	 */
-	xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1);
 	return 0;
 }
 
@@ -2052,9 +2097,15 @@ xfs_alloc_log_agf(
 		offsetof(xfs_agf_t, agf_flcount),
 		offsetof(xfs_agf_t, agf_freeblks),
 		offsetof(xfs_agf_t, agf_longest),
+		offsetof(xfs_agf_t, agf_btreeblks),
+		offsetof(xfs_agf_t, agf_uuid),
 		sizeof(xfs_agf_t)
 	};
 
+	trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
+
 	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
 	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
 }
@@ -2087,17 +2138,17 @@ xfs_alloc_put_freelist(
 	xfs_trans_t		*tp,	/* transaction pointer */
 	xfs_buf_t		*agbp,	/* buffer for a.g. freelist header */
 	xfs_buf_t		*agflbp,/* buffer for a.g. free block array */
-	xfs_agblock_t		bno)	/* block being freed */
+	xfs_agblock_t		bno,	/* block being freed */
+	int			btreeblk) /* block came from a AGF btree */
 {
 	xfs_agf_t		*agf;	/* a.g. freespace structure */
-	xfs_agfl_t		*agfl;	/* a.g. free block array */
-	xfs_agblock_t		*blockp;/* pointer to array entry */
+	__be32			*blockp;/* pointer to array entry */
 	int			error;
-#ifdef XFS_ALLOC_TRACE
-	static char		fname[] = "xfs_alloc_put_freelist";
-#endif
+	int			logflags;
 	xfs_mount_t		*mp;	/* mount structure */
 	xfs_perag_t		*pag;	/* per allocation group data */
+	__be32			*agfl_bno;
+	int			startoff;
 
 	agf = XFS_BUF_TO_AGF(agbp);
 	mp = tp->t_mountp;
@@ -2105,92 +2156,198 @@ xfs_alloc_put_freelist(
 	if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
 			be32_to_cpu(agf->agf_seqno), &agflbp)))
 		return error;
-	agfl = XFS_BUF_TO_AGFL(agflbp);
-	be32_add(&agf->agf_fllast, 1);
+	be32_add_cpu(&agf->agf_fllast, 1);
 	if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
 		agf->agf_fllast = 0;
-	pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
-	be32_add(&agf->agf_flcount, 1);
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+	be32_add_cpu(&agf->agf_flcount, 1);
 	xfs_trans_agflist_delta(tp, 1);
 	pag->pagf_flcount++;
+
+	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
+	if (btreeblk) {
+		be32_add_cpu(&agf->agf_btreeblks, -1);
+		pag->pagf_btreeblks--;
+		logflags |= XFS_AGF_BTREEBLKS;
+	}
+	xfs_perag_put(pag);
+
+	xfs_alloc_log_agf(tp, agbp, logflags);
+
 	ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
-	blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)];
-	INT_SET(*blockp, ARCH_CONVERT, bno);
-	TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
-	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
-	xfs_trans_log_buf(tp, agflbp,
-		(int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
-		(int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl +
-			sizeof(xfs_agblock_t) - 1));
+
+	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
+	*blockp = cpu_to_be32(bno);
+	startoff = (char *)blockp - (char *)agflbp->b_addr;
+
+	xfs_alloc_log_agf(tp, agbp, logflags);
+
+	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
+	xfs_trans_log_buf(tp, agflbp, startoff,
+			  startoff + sizeof(xfs_agblock_t) - 1);
 	return 0;
 }
 
+static bool
+xfs_agf_verify(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp)
+ {
+	struct xfs_agf	*agf = XFS_BUF_TO_AGF(bp);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
+			return false;
+
+	if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+	      XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+	      be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+	      be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+	      be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+	      be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
+		return false;
+
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
+		return false;
+
+	if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
+	    be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
+		return false;
+
+	return true;;
+
+}
+
+static void
+xfs_agf_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
+				XFS_ERRTAG_ALLOC_READ_AGF,
+				XFS_RANDOM_ALLOC_READ_AGF))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+static void
+xfs_agf_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	if (!xfs_agf_verify(mp, bp)) {
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agf_buf_ops = {
+	.verify_read = xfs_agf_read_verify,
+	.verify_write = xfs_agf_write_verify,
+};
+
 /*
  * Read in the allocation group header (free/alloc section).
  */
 int					/* error */
-xfs_alloc_read_agf(
-	xfs_mount_t	*mp,		/* mount point structure */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	int		flags,		/* XFS_ALLOC_FLAG_... */
-	xfs_buf_t	**bpp)		/* buffer for the ag freelist header */
+xfs_read_agf(
+	struct xfs_mount	*mp,	/* mount point structure */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	int			flags,	/* XFS_BUF_ */
+	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
 {
-	xfs_agf_t	*agf;		/* ag freelist header */
-	int		agf_ok;		/* set if agf is consistent */
-	xfs_buf_t	*bp;		/* return value */
-	xfs_perag_t	*pag;		/* per allocation group data */
 	int		error;
 
+	trace_xfs_read_agf(mp, agno);
+
 	ASSERT(agno != NULLAGNUMBER);
 	error = xfs_trans_read_buf(
 			mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1),
-			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U,
-			&bp);
+			XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
 	if (error)
 		return error;
-	ASSERT(!bp || !XFS_BUF_GETERROR(bp));
-	if (!bp) {
-		*bpp = NULL;
+	if (!*bpp)
 		return 0;
-	}
-	/*
-	 * Validate the magic number of the agf block.
-	 */
-	agf = XFS_BUF_TO_AGF(bp);
-	agf_ok =
-		be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC &&
-		XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
-		be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
-		be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
-		be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
-		be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
-	if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
-			XFS_RANDOM_ALLOC_READ_AGF))) {
-		XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
-				     XFS_ERRLEVEL_LOW, mp, agf);
-		xfs_trans_brelse(tp, bp);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
-	pag = &mp->m_perag[agno];
+
+	ASSERT(!(*bpp)->b_error);
+	xfs_buf_set_ref(*bpp, XFS_AGF_REF);
+	return 0;
+}
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int					/* error */
+xfs_alloc_read_agf(
+	struct xfs_mount	*mp,	/* mount point structure */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	int			flags,	/* XFS_ALLOC_FLAG_... */
+	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
+{
+	struct xfs_agf		*agf;		/* ag freelist header */
+	struct xfs_perag	*pag;		/* per allocation group data */
+	int			error;
+
+	trace_xfs_alloc_read_agf(mp, agno);
+
+	ASSERT(agno != NULLAGNUMBER);
+	error = xfs_read_agf(mp, tp, agno,
+			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
+			bpp);
+	if (error)
+		return error;
+	if (!*bpp)
+		return 0;
+	ASSERT(!(*bpp)->b_error);
+
+	agf = XFS_BUF_TO_AGF(*bpp);
+	pag = xfs_perag_get(mp, agno);
 	if (!pag->pagf_init) {
 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
+		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
 		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
 		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
 		pag->pagf_levels[XFS_BTNUM_BNOi] =
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
 		pag->pagf_levels[XFS_BTNUM_CNTi] =
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
-		spinlock_init(&pag->pagb_lock, "xfspagb");
-		pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS *
-					sizeof(xfs_perag_busy_t), KM_SLEEP);
+		spin_lock_init(&pag->pagb_lock);
+		pag->pagb_count = 0;
+		pag->pagb_tree = RB_ROOT;
 		pag->pagf_init = 1;
 	}
 #ifdef DEBUG
 	else if (!XFS_FORCED_SHUTDOWN(mp)) {
 		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
+		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
 		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
 		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
 		ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
@@ -2199,8 +2356,7 @@ xfs_alloc_read_agf(
 		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
 	}
 #endif
-	XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF);
-	*bpp = bp;
+	xfs_perag_put(pag);
 	return 0;
 }
 
@@ -2216,9 +2372,6 @@ xfs_alloc_vextent(
 	xfs_agblock_t	agsize;	/* allocation group size */
 	int		error;
 	int		flags;	/* XFS_ALLOC_FLAG_... locking flags */
-#ifdef XFS_ALLOC_TRACE
-	static char	fname[] = "xfs_alloc_vextent";
-#endif
 	xfs_extlen_t	minleft;/* minimum left value, temp copy */
 	xfs_mount_t	*mp;	/* mount structure pointer */
 	xfs_agnumber_t	sagno;	/* starting allocation group number */
@@ -2250,7 +2403,7 @@ xfs_alloc_vextent(
 	    args->minlen > args->maxlen || args->minlen > agsize ||
 	    args->mod >= args->prod) {
 		args->fsbno = NULLFSBLOCK;
-		TRACE_ALLOC("badargs", args);
+		trace_xfs_alloc_vextent_badargs(args);
 		return 0;
 	}
 	minleft = args->minleft;
@@ -2263,24 +2416,21 @@ xfs_alloc_vextent(
 		 * These three force us into a single a.g.
 		 */
 		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-		down_read(&mp->m_peraglock);
-		args->pag = &mp->m_perag[args->agno];
+		args->pag = xfs_perag_get(mp, args->agno);
 		args->minleft = 0;
 		error = xfs_alloc_fix_freelist(args, 0);
 		args->minleft = minleft;
 		if (error) {
-			TRACE_ALLOC("nofix", args);
+			trace_xfs_alloc_vextent_nofix(args);
 			goto error0;
 		}
 		if (!args->agbp) {
-			up_read(&mp->m_peraglock);
-			TRACE_ALLOC("noagbp", args);
+			trace_xfs_alloc_vextent_noagbp(args);
 			break;
 		}
 		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
 		if ((error = xfs_alloc_ag_vextent(args)))
 			goto error0;
-		up_read(&mp->m_peraglock);
 		break;
 	case XFS_ALLOCTYPE_START_BNO:
 		/*
@@ -2332,14 +2482,13 @@ xfs_alloc_vextent(
 		 * Loop over allocation groups twice; first time with
 		 * trylock set, second time without.
 		 */
-		down_read(&mp->m_peraglock);
 		for (;;) {
-			args->pag = &mp->m_perag[args->agno];
+			args->pag = xfs_perag_get(mp, args->agno);
 			if (no_min) args->minleft = 0;
 			error = xfs_alloc_fix_freelist(args, flags);
 			args->minleft = minleft;
 			if (error) {
-				TRACE_ALLOC("nofix", args);
+				trace_xfs_alloc_vextent_nofix(args);
 				goto error0;
 			}
 			/*
@@ -2350,15 +2499,28 @@ xfs_alloc_vextent(
 					goto error0;
 				break;
 			}
-			TRACE_ALLOC("loopfailed", args);
+
+			trace_xfs_alloc_vextent_loopfailed(args);
+
 			/*
 			 * Didn't work, figure out the next iteration.
 			 */
 			if (args->agno == sagno &&
 			    type == XFS_ALLOCTYPE_START_BNO)
 				args->type = XFS_ALLOCTYPE_THIS_AG;
-			if (++(args->agno) == mp->m_sb.sb_agcount)
-				args->agno = 0;
+			/*
+			* For the first allocation, we can try any AG to get
+			* space.  However, if we already have allocated a
+			* block, we don't want to try AGs whose number is below
+			* sagno. Otherwise, we may end up with out-of-order
+			* locking of AGF, which might cause deadlock.
+			*/
+			if (++(args->agno) == mp->m_sb.sb_agcount) {
+				if (args->firstblock != NULLFSBLOCK)
+					args->agno = sagno;
+				else
+					args->agno = 0;
+			}
 			/*
 			 * Reached the starting a.g., must either be done
 			 * or switch to non-trylock mode.
@@ -2366,7 +2528,7 @@ xfs_alloc_vextent(
 			if (args->agno == sagno) {
 				if (no_min == 1) {
 					args->agbno = NULLAGBLOCK;
-					TRACE_ALLOC("allfailed", args);
+					trace_xfs_alloc_vextent_allfailed(args);
 					break;
 				}
 				if (flags == 0) {
@@ -2380,8 +2542,8 @@ xfs_alloc_vextent(
 					}
 				}
 			}
+			xfs_perag_put(args->pag);
 		}
-		up_read(&mp->m_peraglock);
 		if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
 			if (args->agno == sagno)
 				mp->m_agfrotor = (mp->m_agfrotor + 1) %
@@ -2407,9 +2569,10 @@ xfs_alloc_vextent(
 			args->len);
 #endif
 	}
+	xfs_perag_put(args->pag);
 	return 0;
 error0:
-	up_read(&mp->m_peraglock);
+	xfs_perag_put(args->pag);
 	return error;
 }
 
@@ -2424,178 +2587,44 @@ xfs_free_extent(
 	xfs_fsblock_t	bno,	/* starting block number of extent */
 	xfs_extlen_t	len)	/* length of extent */
 {
-#ifdef DEBUG
-	xfs_agf_t	*agf;	/* a.g. freespace header */
-#endif
-	xfs_alloc_arg_t	args;	/* allocation argument structure */
+	xfs_alloc_arg_t	args;
 	int		error;
 
 	ASSERT(len != 0);
+	memset(&args, 0, sizeof(xfs_alloc_arg_t));
 	args.tp = tp;
 	args.mp = tp->t_mountp;
-	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
-	ASSERT(args.agno < args.mp->m_sb.sb_agcount);
-	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
-	args.alignment = 1;
-	args.minlen = args.minleft = args.minalignslop = 0;
-	down_read(&args.mp->m_peraglock);
-	args.pag = &args.mp->m_perag[args.agno];
-	if ((error = xfs_alloc_fix_freelist(&args, 0)))
-		goto error0;
-#ifdef DEBUG
-	ASSERT(args.agbp != NULL);
-	agf = XFS_BUF_TO_AGF(args.agbp);
-	ASSERT(args.agbno + len <= be32_to_cpu(agf->agf_length));
-#endif
-	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno,
-		len, 0);
-error0:
-	up_read(&args.mp->m_peraglock);
-	return error;
-}
-
-
-/*
- * AG Busy list management
- * The busy list contains block ranges that have been freed but whose
- * transacations have not yet hit disk.  If any block listed in a busy
- * list is reused, the transaction that freed it must be forced to disk
- * before continuing to use the block.
- *
- * xfs_alloc_mark_busy - add to the per-ag busy list
- * xfs_alloc_clear_busy - remove an item from the per-ag busy list
- */
-void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
-		    xfs_agnumber_t agno,
-		    xfs_agblock_t bno,
-		    xfs_extlen_t len)
-{
-	xfs_mount_t		*mp;
-	xfs_perag_busy_t	*bsy;
-	int			n;
-	SPLDECL(s);
-
-	mp = tp->t_mountp;
-	s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
-
-	/* search pagb_list for an open slot */
-	for (bsy = mp->m_perag[agno].pagb_list, n = 0;
-	     n < XFS_PAGB_NUM_SLOTS;
-	     bsy++, n++) {
-		if (bsy->busy_tp == NULL) {
-			break;
-		}
-	}
-
-	if (n < XFS_PAGB_NUM_SLOTS) {
-		bsy = &mp->m_perag[agno].pagb_list[n];
-		mp->m_perag[agno].pagb_count++;
-		TRACE_BUSY("xfs_alloc_mark_busy", "got", agno, bno, len, n, tp);
-		bsy->busy_start = bno;
-		bsy->busy_length = len;
-		bsy->busy_tp = tp;
-		xfs_trans_add_busy(tp, agno, n);
-	} else {
-		TRACE_BUSY("xfs_alloc_mark_busy", "FULL", agno, bno, len, -1, tp);
-		/*
-		 * The busy list is full!  Since it is now not possible to
-		 * track the free block, make this a synchronous transaction
-		 * to insure that the block is not reused before this
-		 * transaction commits.
-		 */
-		xfs_trans_set_sync(tp);
-	}
-
-	mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
-}
-
-void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
-		     xfs_agnumber_t agno,
-		     int idx)
-{
-	xfs_mount_t		*mp;
-	xfs_perag_busy_t	*list;
-	SPLDECL(s);
-
-	mp = tp->t_mountp;
-
-	s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
-	list = mp->m_perag[agno].pagb_list;
 
-	ASSERT(idx < XFS_PAGB_NUM_SLOTS);
-	if (list[idx].busy_tp == tp) {
-		TRACE_UNBUSY("xfs_alloc_clear_busy", "found", agno, idx, tp);
-		list[idx].busy_tp = NULL;
-		mp->m_perag[agno].pagb_count--;
-	} else {
-		TRACE_UNBUSY("xfs_alloc_clear_busy", "missing", agno, idx, tp);
-	}
-
-	mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
-}
-
-
-/*
- * returns non-zero if any of (agno,bno):len is in a busy list
- */
-STATIC int
-xfs_alloc_search_busy(xfs_trans_t *tp,
-		    xfs_agnumber_t agno,
-		    xfs_agblock_t bno,
-		    xfs_extlen_t len)
-{
-	xfs_mount_t		*mp;
-	xfs_perag_busy_t	*bsy;
-	int			n;
-	xfs_agblock_t		uend, bend;
-	xfs_lsn_t		lsn;
-	int			cnt;
-	SPLDECL(s);
-
-	mp = tp->t_mountp;
-
-	s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
-	cnt = mp->m_perag[agno].pagb_count;
+	/*
+	 * validate that the block number is legal - the enables us to detect
+	 * and handle a silent filesystem corruption rather than crashing.
+	 */
+	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
+	if (args.agno >= args.mp->m_sb.sb_agcount)
+		return EFSCORRUPTED;
 
-	uend = bno + len - 1;
+	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
+	if (args.agbno >= args.mp->m_sb.sb_agblocks)
+		return EFSCORRUPTED;
 
-	/* search pagb_list for this slot, skipping open slots */
-	for (bsy = mp->m_perag[agno].pagb_list, n = 0;
-	     cnt; bsy++, n++) {
+	args.pag = xfs_perag_get(args.mp, args.agno);
+	ASSERT(args.pag);
 
-		/*
-		 * (start1,length1) within (start2, length2)
-		 */
-		if (bsy->busy_tp != NULL) {
-			bend = bsy->busy_start + bsy->busy_length - 1;
-			if ((bno > bend) ||
-			    (uend < bsy->busy_start)) {
-				cnt--;
-			} else {
-				TRACE_BUSYSEARCH("xfs_alloc_search_busy",
-						 "found1", agno, bno, len, n,
-						 tp);
-				break;
-			}
-		}
-	}
+	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+	if (error)
+		goto error0;
 
-	/*
-	 * If a block was found, force the log through the LSN of the
-	 * transaction that freed the block
-	 */
-	if (cnt) {
-		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp);
-		lsn = bsy->busy_tp->t_commit_lsn;
-		mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
-		xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
-	} else {
-		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp);
-		n = -1;
-		mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
+	/* validate the extent size is legal now we have the agf locked */
+	if (args.agbno + len >
+			be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
+		error = EFSCORRUPTED;
+		goto error0;
 	}
 
-	return n;
+	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
+	if (!error)
+		xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
+error0:
+	xfs_perag_put(args.pag);
+	return error;
 }
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 3546dea27b7..feacb061bab 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -19,28 +19,77 @@
 #define	__XFS_ALLOC_H__
 
 struct xfs_buf;
+struct xfs_btree_cur;
 struct xfs_mount;
 struct xfs_perag;
 struct xfs_trans;
 
+extern struct workqueue_struct *xfs_alloc_wq;
+
 /*
  * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
  */
-typedef enum xfs_alloctype
-{
-	XFS_ALLOCTYPE_ANY_AG,		/* allocate anywhere, use rotor */
-	XFS_ALLOCTYPE_FIRST_AG,		/* ... start at ag 0 */
-	XFS_ALLOCTYPE_START_AG,		/* anywhere, start in this a.g. */
-	XFS_ALLOCTYPE_THIS_AG,		/* anywhere in this a.g. */
-	XFS_ALLOCTYPE_START_BNO,	/* near this block else anywhere */
-	XFS_ALLOCTYPE_NEAR_BNO,		/* in this a.g. and near this block */
-	XFS_ALLOCTYPE_THIS_BNO		/* at exactly this block */
-} xfs_alloctype_t;
+#define XFS_ALLOCTYPE_ANY_AG	0x01	/* allocate anywhere, use rotor */
+#define XFS_ALLOCTYPE_FIRST_AG	0x02	/* ... start at ag 0 */
+#define XFS_ALLOCTYPE_START_AG	0x04	/* anywhere, start in this a.g. */
+#define XFS_ALLOCTYPE_THIS_AG	0x08	/* anywhere in this a.g. */
+#define XFS_ALLOCTYPE_START_BNO	0x10	/* near this block else anywhere */
+#define XFS_ALLOCTYPE_NEAR_BNO	0x20	/* in this a.g. and near this block */
+#define XFS_ALLOCTYPE_THIS_BNO	0x40	/* at exactly this block */
+
+/* this should become an enum again when the tracing code is fixed */
+typedef unsigned int xfs_alloctype_t;
+
+#define XFS_ALLOC_TYPES \
+	{ XFS_ALLOCTYPE_ANY_AG,		"ANY_AG" }, \
+	{ XFS_ALLOCTYPE_FIRST_AG,	"FIRST_AG" }, \
+	{ XFS_ALLOCTYPE_START_AG,	"START_AG" }, \
+	{ XFS_ALLOCTYPE_THIS_AG,	"THIS_AG" }, \
+	{ XFS_ALLOCTYPE_START_BNO,	"START_BNO" }, \
+	{ XFS_ALLOCTYPE_NEAR_BNO,	"NEAR_BNO" }, \
+	{ XFS_ALLOCTYPE_THIS_BNO,	"THIS_BNO" }
 
 /*
  * Flags for xfs_alloc_fix_freelist.
  */
 #define	XFS_ALLOC_FLAG_TRYLOCK	0x00000001  /* use trylock for buffer locking */
+#define	XFS_ALLOC_FLAG_FREEING	0x00000002  /* indicate caller is freeing extents*/
+
+/*
+ * In order to avoid ENOSPC-related deadlock caused by
+ * out-of-order locking of AGF buffer (PV 947395), we place
+ * constraints on the relationship among actual allocations for
+ * data blocks, freelist blocks, and potential file data bmap
+ * btree blocks. However, these restrictions may result in no
+ * actual space allocated for a delayed extent, for example, a data
+ * block in a certain AG is allocated but there is no additional
+ * block for the additional bmap btree block due to a split of the
+ * bmap btree of the file. The result of this may lead to an
+ * infinite loop in xfssyncd when the file gets flushed to disk and
+ * all delayed extents need to be actually allocated. To get around
+ * this, we explicitly set aside a few blocks which will not be
+ * reserved in delayed allocation. Considering the minimum number of
+ * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap
+ * btree requires 1 fsb, so we set the number of set-aside blocks
+ * to 4 + 4*agcount.
+ */
+#define XFS_ALLOC_SET_ASIDE(mp)  (4 + ((mp)->m_sb.sb_agcount * 4))
+
+/*
+ * When deciding how much space to allocate out of an AG, we limit the
+ * allocation maximum size to the size the AG. However, we cannot use all the
+ * blocks in the AG - some are permanently used by metadata. These
+ * blocks are generally:
+ *	- the AG superblock, AGF, AGI and AGFL
+ *	- the AGF (bno and cnt) and AGI btree root blocks
+ *	- 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
+ *
+ * The AG headers are sector sized, so the amount of space they take up is
+ * dependent on filesystem geometry. The others are all single blocks.
+ */
+#define XFS_ALLOC_AG_MAX_USABLE(mp)	\
+	((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
+
 
 /*
  * Argument structure for xfs_alloc routines.
@@ -68,8 +117,9 @@ typedef struct xfs_alloc_arg {
 	xfs_alloctype_t	otype;		/* original allocation type */
 	char		wasdel;		/* set if allocation was prev delayed */
 	char		wasfromfl;	/* set if allocation is from freelist */
-	char		isfl;		/* set if is freelist blocks - !actg */
+	char		isfl;		/* set if is freelist blocks - !acctg */
 	char		userdata;	/* set if this is user data */
+	xfs_fsblock_t	firstblock;	/* io first block allocated */
 } xfs_alloc_arg_t;
 
 /*
@@ -78,26 +128,12 @@ typedef struct xfs_alloc_arg {
 #define XFS_ALLOC_USERDATA		1	/* allocation is for user data*/
 #define XFS_ALLOC_INITIAL_USER_DATA	2	/* special case start of file */
 
-
-#ifdef __KERNEL__
-
-#if defined(XFS_ALLOC_TRACE)
 /*
- * Allocation tracing buffer size.
+ * Find the length of the longest extent in an AG.
  */
-#define	XFS_ALLOC_TRACE_SIZE	4096
-extern ktrace_t *xfs_alloc_trace_buf;
-
-/*
- * Types for alloc tracing.
- */
-#define	XFS_ALLOC_KTRACE_ALLOC	1
-#define	XFS_ALLOC_KTRACE_FREE	2
-#define	XFS_ALLOC_KTRACE_MODAGF	3
-#define	XFS_ALLOC_KTRACE_BUSY	4
-#define	XFS_ALLOC_KTRACE_UNBUSY	5
-#define	XFS_ALLOC_KTRACE_BUSYSEARCH	6
-#endif
+xfs_extlen_t
+xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+		struct xfs_perag *pag);
 
 /*
  * Compute and fill in value of m_ag_maxlevels.
@@ -114,7 +150,8 @@ int				/* error */
 xfs_alloc_get_freelist(
 	struct xfs_trans *tp,	/* transaction pointer */
 	struct xfs_buf	*agbp,	/* buffer containing the agf structure */
-	xfs_agblock_t	*bnop);	/* block address retrieved from freelist */
+	xfs_agblock_t	*bnop,	/* block address retrieved from freelist */
+	int		btreeblk); /* destination is a AGF btree */
 
 /*
  * Log the given fields from the agf structure.
@@ -143,7 +180,8 @@ xfs_alloc_put_freelist(
 	struct xfs_trans *tp,	/* transaction pointer */
 	struct xfs_buf	*agbp,	/* buffer for a.g. freelist header */
 	struct xfs_buf	*agflbp,/* buffer for a.g. free block array */
-	xfs_agblock_t	bno);	/* block being freed */
+	xfs_agblock_t	bno,	/* block being freed */
+	int		btreeblk); /* owner was a AGF btree */
 
 /*
  * Read in the allocation group header (free/alloc section).
@@ -172,18 +210,25 @@ xfs_free_extent(
 	xfs_fsblock_t	bno,	/* starting block number of extent */
 	xfs_extlen_t	len);	/* length of extent */
 
-void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
-		xfs_agnumber_t agno,
-		xfs_agblock_t bno,
-		xfs_extlen_t len);
+int					/* error */
+xfs_alloc_lookup_le(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat);	/* success/failure */
 
-void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
-		xfs_agnumber_t ag,
-		int idx);
-
-
-#endif	/* __KERNEL__ */
+int				/* error */
+xfs_alloc_lookup_ge(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat);	/* success/failure */
+
+int					/* error */
+xfs_alloc_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		*bno,	/* output: starting block of extent */
+	xfs_extlen_t		*len,	/* output: length of extent */
+	int			*stat);	/* output: success/failure */
 
 #endif	/* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index a1d92da86cc..8358f1ded94 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -17,2187 +17,488 @@
  */
 #include "xfs.h"
 #include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
 #include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
 #include "xfs_btree.h"
-#include "xfs_ialloc.h"
+#include "xfs_alloc_btree.h"
 #include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
 
-/*
- * Prototypes for internal functions.
- */
-
-STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int);
-STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *);
-STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
-		xfs_alloc_key_t *, xfs_btree_cur_t **, int *);
-STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int);
 
-/*
- * Internal functions.
- */
+STATIC struct xfs_btree_cur *
+xfs_allocbt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.a.agbp, cur->bc_private.a.agno,
+			cur->bc_btnum);
+}
 
-/*
- * Single level of the xfs_alloc_delete record deletion routine.
- * Delete record pointed to by cur/level.
- * Remove the record from its block then rebalance the tree.
- * Return 0 for error, 1 for done, 2 to go on to the next level.
- */
-STATIC int				/* error */
-xfs_alloc_delrec(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			level,	/* level removing record from */
-	int			*stat)	/* fail/done/go-on */
+STATIC void
+xfs_allocbt_set_root(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			inc)
 {
-	xfs_agf_t		*agf;	/* allocation group freelist header */
-	xfs_alloc_block_t	*block;	/* btree block record/key lives in */
-	xfs_agblock_t		bno;	/* btree block number */
-	xfs_buf_t		*bp;	/* buffer for block */
-	int			error;	/* error return value */
-	int			i;	/* loop index */
-	xfs_alloc_key_t		key;	/* kp points here if block is level 0 */
-	xfs_agblock_t		lbno;	/* left block's block number */
-	xfs_buf_t		*lbp;	/* left block's buffer pointer */
-	xfs_alloc_block_t	*left;	/* left btree block */
-	xfs_alloc_key_t		*lkp=NULL;	/* left block key pointer */
-	xfs_alloc_ptr_t		*lpp=NULL;	/* left block address pointer */
-	int			lrecs=0;	/* number of records in left block */
-	xfs_alloc_rec_t		*lrp;	/* left block record pointer */
-	xfs_mount_t		*mp;	/* mount structure */
-	int			ptr;	/* index in btree block for this rec */
-	xfs_agblock_t		rbno;	/* right block's block number */
-	xfs_buf_t		*rbp;	/* right block's buffer pointer */
-	xfs_alloc_block_t	*right;	/* right btree block */
-	xfs_alloc_key_t		*rkp;	/* right block key pointer */
-	xfs_alloc_ptr_t		*rpp;	/* right block address pointer */
-	int			rrecs=0;	/* number of records in right block */
-	xfs_alloc_rec_t		*rrp;	/* right block record pointer */
-	xfs_btree_cur_t		*tcur;	/* temporary btree cursor */
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
+	int			btnum = cur->bc_btnum;
+	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno);
 
-	/*
-	 * Get the index of the entry being deleted, check for nothing there.
-	 */
-	ptr = cur->bc_ptrs[level];
-	if (ptr == 0) {
-		*stat = 0;
-		return 0;
-	}
-	/*
-	 * Get the buffer & block containing the record or key/ptr.
-	 */
-	bp = cur->bc_bufs[level];
-	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
-	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-		return error;
-#endif
-	/*
-	 * Fail if we're off the end of the block.
-	 */
-	if (ptr > be16_to_cpu(block->bb_numrecs)) {
-		*stat = 0;
-		return 0;
-	}
-	XFS_STATS_INC(xs_abt_delrec);
-	/*
-	 * It's a nonleaf.  Excise the key and ptr being deleted, by
-	 * sliding the entries past them down one.
-	 * Log the changed areas of the block.
-	 */
-	if (level > 0) {
-		lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
-		lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
-		for (i = ptr; i < be16_to_cpu(block->bb_numrecs); i++) {
-			if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
-				return error;
-		}
-#endif
-		if (ptr < be16_to_cpu(block->bb_numrecs)) {
-			memmove(&lkp[ptr - 1], &lkp[ptr],
-				(be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lkp));
-			memmove(&lpp[ptr - 1], &lpp[ptr],
-				(be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lpp));
-			xfs_alloc_log_ptrs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1);
-			xfs_alloc_log_keys(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1);
-		}
-	}
-	/*
-	 * It's a leaf.  Excise the record being deleted, by sliding the
-	 * entries past it down one.  Log the changed areas of the block.
-	 */
-	else {
-		lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
-		if (ptr < be16_to_cpu(block->bb_numrecs)) {
-			memmove(&lrp[ptr - 1], &lrp[ptr],
-				(be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lrp));
-			xfs_alloc_log_recs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1);
-		}
-		/*
-		 * If it's the first record in the block, we'll need a key
-		 * structure to pass up to the next level (updkey).
-		 */
-		if (ptr == 1) {
-			key.ar_startblock = lrp->ar_startblock;
-			key.ar_blockcount = lrp->ar_blockcount;
-			lkp = &key;
-		}
-	}
-	/*
-	 * Decrement and log the number of entries in the block.
-	 */
-	be16_add(&block->bb_numrecs, -1);
-	xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
-	/*
-	 * See if the longest free extent in the allocation group was
-	 * changed by this operation.  True if it's the by-size btree, and
-	 * this is the leaf level, and there is no right sibling block,
-	 * and this was the last record.
-	 */
-	agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-	mp = cur->bc_mp;
-
-	if (level == 0 &&
-	    cur->bc_btnum == XFS_BTNUM_CNT &&
-	    be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
-	    ptr > be16_to_cpu(block->bb_numrecs)) {
-		ASSERT(ptr == be16_to_cpu(block->bb_numrecs) + 1);
-		/*
-		 * There are still records in the block.  Grab the size
-		 * from the last one.
-		 */
-		if (be16_to_cpu(block->bb_numrecs)) {
-			rrp = XFS_ALLOC_REC_ADDR(block, be16_to_cpu(block->bb_numrecs), cur);
-			agf->agf_longest = rrp->ar_blockcount;
-		}
-		/*
-		 * No free extents left.
-		 */
-		else
-			agf->agf_longest = 0;
-		mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest =
-			be32_to_cpu(agf->agf_longest);
-		xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-			XFS_AGF_LONGEST);
-	}
-	/*
-	 * Is this the root level?  If so, we're almost done.
-	 */
-	if (level == cur->bc_nlevels - 1) {
-		/*
-		 * If this is the root level,
-		 * and there's only one entry left,
-		 * and it's NOT the leaf level,
-		 * then we can get rid of this level.
-		 */
-		if (be16_to_cpu(block->bb_numrecs) == 1 && level > 0) {
-			/*
-			 * lpp is still set to the first pointer in the block.
-			 * Make it the new root of the btree.
-			 */
-			bno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
-			agf->agf_roots[cur->bc_btnum] = *lpp;
-			be32_add(&agf->agf_levels[cur->bc_btnum], -1);
-			mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_levels[cur->bc_btnum]--;
-			/*
-			 * Put this buffer/block on the ag's freelist.
-			 */
-			if ((error = xfs_alloc_put_freelist(cur->bc_tp,
-					cur->bc_private.a.agbp, NULL, bno)))
-				return error;
-			/*
-			 * Since blocks move to the free list without the
-			 * coordination used in xfs_bmap_finish, we can't allow
-			 * block to be available for reallocation and
-			 * non-transaction writing (user data) until we know
-			 * that the transaction that moved it to the free list
-			 * is permanently on disk. We track the blocks by
-			 * declaring these blocks as "busy"; the busy list is
-			 * maintained on a per-ag basis and each transaction
-			 * records which entries should be removed when the
-			 * iclog commits to disk. If a busy block is
-			 * allocated, the iclog is pushed up to the LSN
-			 * that freed the block.
-			 */
-			xfs_alloc_mark_busy(cur->bc_tp,
-				be32_to_cpu(agf->agf_seqno), bno, 1);
-
-			xfs_trans_agbtree_delta(cur->bc_tp, -1);
-			xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-				XFS_AGF_ROOTS | XFS_AGF_LEVELS);
-			/*
-			 * Update the cursor so there's one fewer level.
-			 */
-			xfs_btree_setbuf(cur, level, NULL);
-			cur->bc_nlevels--;
-		} else if (level > 0 &&
-			   (error = xfs_alloc_decrement(cur, level, &i)))
-			return error;
-		*stat = 1;
-		return 0;
-	}
-	/*
-	 * If we deleted the leftmost entry in the block, update the
-	 * key values above us in the tree.
-	 */
-	if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1)))
-		return error;
-	/*
-	 * If the number of records remaining in the block is at least
-	 * the minimum, we're done.
-	 */
-	if (be16_to_cpu(block->bb_numrecs) >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
-		if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
-			return error;
-		*stat = 1;
-		return 0;
-	}
-	/*
-	 * Otherwise, we have to move some records around to keep the
-	 * tree balanced.  Look at the left and right sibling blocks to
-	 * see if we can re-balance by moving only one record.
-	 */
-	rbno = be32_to_cpu(block->bb_rightsib);
-	lbno = be32_to_cpu(block->bb_leftsib);
-	bno = NULLAGBLOCK;
-	ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
-	/*
-	 * Duplicate the cursor so our btree manipulations here won't
-	 * disrupt the next level up.
-	 */
-	if ((error = xfs_btree_dup_cursor(cur, &tcur)))
-		return error;
-	/*
-	 * If there's a right sibling, see if it's ok to shift an entry
-	 * out of it.
-	 */
-	if (rbno != NULLAGBLOCK) {
-		/*
-		 * Move the temp cursor to the last entry in the next block.
-		 * Actually any entry but the first would suffice.
-		 */
-		i = xfs_btree_lastrec(tcur, level);
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		if ((error = xfs_alloc_increment(tcur, level, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		i = xfs_btree_lastrec(tcur, level);
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		/*
-		 * Grab a pointer to the block.
-		 */
-		rbp = tcur->bc_bufs[level];
-		right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-#ifdef DEBUG
-		if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-			goto error0;
-#endif
-		/*
-		 * Grab the current block number, for future use.
-		 */
-		bno = be32_to_cpu(right->bb_leftsib);
-		/*
-		 * If right block is full enough so that removing one entry
-		 * won't make it too empty, and left-shifting an entry out
-		 * of right to us works, we're done.
-		 */
-		if (be16_to_cpu(right->bb_numrecs) - 1 >=
-		     XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
-			if ((error = xfs_alloc_lshift(tcur, level, &i)))
-				goto error0;
-			if (i) {
-				ASSERT(be16_to_cpu(block->bb_numrecs) >=
-				       XFS_ALLOC_BLOCK_MINRECS(level, cur));
-				xfs_btree_del_cursor(tcur,
-						     XFS_BTREE_NOERROR);
-				if (level > 0 &&
-				    (error = xfs_alloc_decrement(cur, level,
-					    &i)))
-					return error;
-				*stat = 1;
-				return 0;
-			}
-		}
-		/*
-		 * Otherwise, grab the number of records in right for
-		 * future reference, and fix up the temp cursor to point
-		 * to our block again (last record).
-		 */
-		rrecs = be16_to_cpu(right->bb_numrecs);
-		if (lbno != NULLAGBLOCK) {
-			i = xfs_btree_firstrec(tcur, level);
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			if ((error = xfs_alloc_decrement(tcur, level, &i)))
-				goto error0;
-			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		}
-	}
-	/*
-	 * If there's a left sibling, see if it's ok to shift an entry
-	 * out of it.
-	 */
-	if (lbno != NULLAGBLOCK) {
-		/*
-		 * Move the temp cursor to the first entry in the
-		 * previous block.
-		 */
-		i = xfs_btree_firstrec(tcur, level);
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		if ((error = xfs_alloc_decrement(tcur, level, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		xfs_btree_firstrec(tcur, level);
-		/*
-		 * Grab a pointer to the block.
-		 */
-		lbp = tcur->bc_bufs[level];
-		left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
-		if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-			goto error0;
-#endif
-		/*
-		 * Grab the current block number, for future use.
-		 */
-		bno = be32_to_cpu(left->bb_rightsib);
-		/*
-		 * If left block is full enough so that removing one entry
-		 * won't make it too empty, and right-shifting an entry out
-		 * of left to us works, we're done.
-		 */
-		if (be16_to_cpu(left->bb_numrecs) - 1 >=
-		     XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
-			if ((error = xfs_alloc_rshift(tcur, level, &i)))
-				goto error0;
-			if (i) {
-				ASSERT(be16_to_cpu(block->bb_numrecs) >=
-				       XFS_ALLOC_BLOCK_MINRECS(level, cur));
-				xfs_btree_del_cursor(tcur,
-						     XFS_BTREE_NOERROR);
-				if (level == 0)
-					cur->bc_ptrs[0]++;
-				*stat = 1;
-				return 0;
-			}
-		}
-		/*
-		 * Otherwise, grab the number of records in right for
-		 * future reference.
-		 */
-		lrecs = be16_to_cpu(left->bb_numrecs);
-	}
-	/*
-	 * Delete the temp cursor, we're done with it.
-	 */
-	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-	/*
-	 * If here, we need to do a join to keep the tree balanced.
-	 */
-	ASSERT(bno != NULLAGBLOCK);
-	/*
-	 * See if we can join with the left neighbor block.
-	 */
-	if (lbno != NULLAGBLOCK &&
-	    lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
-		/*
-		 * Set "right" to be the starting block,
-		 * "left" to be the left neighbor.
-		 */
-		rbno = bno;
-		right = block;
-		rbp = bp;
-		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-				cur->bc_private.a.agno, lbno, 0, &lbp,
-				XFS_ALLOC_BTREE_REF)))
-			return error;
-		left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-		if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-			return error;
-	}
-	/*
-	 * If that won't work, see if we can join with the right neighbor block.
-	 */
-	else if (rbno != NULLAGBLOCK &&
-		 rrecs + be16_to_cpu(block->bb_numrecs) <=
-		  XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
-		/*
-		 * Set "left" to be the starting block,
-		 * "right" to be the right neighbor.
-		 */
-		lbno = bno;
-		left = block;
-		lbp = bp;
-		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-				cur->bc_private.a.agno, rbno, 0, &rbp,
-				XFS_ALLOC_BTREE_REF)))
-			return error;
-		right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-		if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-			return error;
-	}
-	/*
-	 * Otherwise, we can't fix the imbalance.
-	 * Just return.  This is probably a logic error, but it's not fatal.
-	 */
-	else {
-		if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
-			return error;
-		*stat = 1;
-		return 0;
-	}
-	/*
-	 * We're now going to join "left" and "right" by moving all the stuff
-	 * in "right" to "left" and deleting "right".
-	 */
-	if (level > 0) {
-		/*
-		 * It's a non-leaf.  Move keys and pointers.
-		 */
-		lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur);
-		lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur);
-		rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
-		rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-		for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
-			if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
-				return error;
-		}
-#endif
-		memcpy(lkp, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*lkp));
-		memcpy(lpp, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*lpp));
-		xfs_alloc_log_keys(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1,
-				   be16_to_cpu(left->bb_numrecs) +
-				   be16_to_cpu(right->bb_numrecs));
-		xfs_alloc_log_ptrs(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1,
-				   be16_to_cpu(left->bb_numrecs) +
-				   be16_to_cpu(right->bb_numrecs));
-	} else {
-		/*
-		 * It's a leaf.  Move records.
-		 */
-		lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur);
-		rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-		memcpy(lrp, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*lrp));
-		xfs_alloc_log_recs(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1,
-				   be16_to_cpu(left->bb_numrecs) +
-				   be16_to_cpu(right->bb_numrecs));
-	}
-	/*
-	 * If we joined with the left neighbor, set the buffer in the
-	 * cursor to the left block, and fix up the index.
-	 */
-	if (bp != lbp) {
-		xfs_btree_setbuf(cur, level, lbp);
-		cur->bc_ptrs[level] += be16_to_cpu(left->bb_numrecs);
-	}
-	/*
-	 * If we joined with the right neighbor and there's a level above
-	 * us, increment the cursor at that level.
-	 */
-	else if (level + 1 < cur->bc_nlevels &&
-		 (error = xfs_alloc_increment(cur, level + 1, &i)))
-		return error;
-	/*
-	 * Fix up the number of records in the surviving block.
-	 */
-	be16_add(&left->bb_numrecs, be16_to_cpu(right->bb_numrecs));
-	/*
-	 * Fix up the right block pointer in the surviving block, and log it.
-	 */
-	left->bb_rightsib = right->bb_rightsib;
-	xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-	/*
-	 * If there is a right sibling now, make it point to the
-	 * remaining block.
-	 */
-	if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
-		xfs_alloc_block_t	*rrblock;
-		xfs_buf_t		*rrbp;
-
-		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-				cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
-				&rrbp, XFS_ALLOC_BTREE_REF)))
-			return error;
-		rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
-		if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
-			return error;
-		rrblock->bb_leftsib = cpu_to_be32(lbno);
-		xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
-	}
-	/*
-	 * Free the deleting block by putting it on the freelist.
-	 */
-	if ((error = xfs_alloc_put_freelist(cur->bc_tp, cur->bc_private.a.agbp,
-			NULL, rbno)))
-		return error;
-	/*
-	 * Since blocks move to the free list without the coordination
-	 * used in xfs_bmap_finish, we can't allow block to be available
-	 * for reallocation and non-transaction writing (user data)
-	 * until we know that the transaction that moved it to the free
-	 * list is permanently on disk. We track the blocks by declaring
-	 * these blocks as "busy"; the busy list is maintained on a
-	 * per-ag basis and each transaction records which entries
-	 * should be removed when the iclog commits to disk. If a
-	 * busy block is allocated, the iclog is pushed up to the
-	 * LSN that freed the block.
-	 */
-	xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
-	xfs_trans_agbtree_delta(cur->bc_tp, -1);
+	ASSERT(ptr->s != 0);
 
-	/*
-	 * Adjust the current level's cursor so that we're left referring
-	 * to the right node, after we're done.
-	 * If this leaves the ptr value 0 our caller will fix it up.
-	 */
-	if (level > 0)
-		cur->bc_ptrs[level]--;
-	/*
-	 * Return value means the next level up has something to do.
-	 */
-	*stat = 2;
-	return 0;
+	agf->agf_roots[btnum] = ptr->s;
+	be32_add_cpu(&agf->agf_levels[btnum], inc);
+	pag->pagf_levels[btnum] += inc;
+	xfs_perag_put(pag);
 
-error0:
-	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-	return error;
+	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
 }
 
-/*
- * Insert one record/level.  Return information to the caller
- * allowing the next level up to proceed if necessary.
- */
-STATIC int				/* error */
-xfs_alloc_insrec(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			level,	/* level to insert record at */
-	xfs_agblock_t		*bnop,	/* i/o: block number inserted */
-	xfs_alloc_rec_t		*recp,	/* i/o: record data inserted */
-	xfs_btree_cur_t		**curp,	/* output: new cursor replacing cur */
-	int			*stat)	/* output: success/failure */
+STATIC int
+xfs_allocbt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
 {
-	xfs_agf_t		*agf;	/* allocation group freelist header */
-	xfs_alloc_block_t	*block;	/* btree block record/key lives in */
-	xfs_buf_t		*bp;	/* buffer for block */
-	int			error;	/* error return value */
-	int			i;	/* loop index */
-	xfs_alloc_key_t		key;	/* key value being inserted */
-	xfs_alloc_key_t		*kp;	/* pointer to btree keys */
-	xfs_agblock_t		nbno;	/* block number of allocated block */
-	xfs_btree_cur_t		*ncur;	/* new cursor to be used at next lvl */
-	xfs_alloc_key_t		nkey;	/* new key value, from split */
-	xfs_alloc_rec_t		nrec;	/* new record value, for caller */
-	int			optr;	/* old ptr value */
-	xfs_alloc_ptr_t		*pp;	/* pointer to btree addresses */
-	int			ptr;	/* index in btree block for this rec */
-	xfs_alloc_rec_t		*rp;	/* pointer to btree records */
-
-	ASSERT(be32_to_cpu(recp->ar_blockcount) > 0);
+	int			error;
+	xfs_agblock_t		bno;
 
-	/*
-	 * GCC doesn't understand the (arguably complex) control flow in
-	 * this function and complains about uninitialized structure fields
-	 * without this.
-	 */
-	memset(&nrec, 0, sizeof(nrec));
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
 
-	/*
-	 * If we made it to the root level, allocate a new root block
-	 * and we're done.
-	 */
-	if (level >= cur->bc_nlevels) {
-		XFS_STATS_INC(xs_abt_insrec);
-		if ((error = xfs_alloc_newroot(cur, &i)))
-			return error;
-		*bnop = NULLAGBLOCK;
-		*stat = i;
-		return 0;
+	/* Allocate the new block from the freelist. If we can't, give up.  */
+	error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+				       &bno, 1);
+	if (error) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+		return error;
 	}
-	/*
-	 * Make a key out of the record data to be inserted, and save it.
-	 */
-	key.ar_startblock = recp->ar_startblock;
-	key.ar_blockcount = recp->ar_blockcount;
-	optr = ptr = cur->bc_ptrs[level];
-	/*
-	 * If we're off the left edge, return failure.
-	 */
-	if (ptr == 0) {
+
+	if (bno == NULLAGBLOCK) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
 		*stat = 0;
 		return 0;
 	}
-	XFS_STATS_INC(xs_abt_insrec);
-	/*
-	 * Get pointers to the btree buffer and block.
-	 */
-	bp = cur->bc_bufs[level];
-	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
-	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-		return error;
-	/*
-	 * Check that the new entry is being inserted in the right place.
-	 */
-	if (ptr <= be16_to_cpu(block->bb_numrecs)) {
-		if (level == 0) {
-			rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
-			xfs_btree_check_rec(cur->bc_btnum, recp, rp);
-		} else {
-			kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
-			xfs_btree_check_key(cur->bc_btnum, &key, kp);
-		}
-	}
-#endif
-	nbno = NULLAGBLOCK;
-	ncur = (xfs_btree_cur_t *)0;
-	/*
-	 * If the block is full, we can't insert the new entry until we
-	 * make the block un-full.
-	 */
-	if (be16_to_cpu(block->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
-		/*
-		 * First, try shifting an entry to the right neighbor.
-		 */
-		if ((error = xfs_alloc_rshift(cur, level, &i)))
-			return error;
-		if (i) {
-			/* nothing */
-		}
-		/*
-		 * Next, try shifting an entry to the left neighbor.
-		 */
-		else {
-			if ((error = xfs_alloc_lshift(cur, level, &i)))
-				return error;
-			if (i)
-				optr = ptr = cur->bc_ptrs[level];
-			else {
-				/*
-				 * Next, try splitting the current block in
-				 * half. If this works we have to re-set our
-				 * variables because we could be in a
-				 * different block now.
-				 */
-				if ((error = xfs_alloc_split(cur, level, &nbno,
-						&nkey, &ncur, &i)))
-					return error;
-				if (i) {
-					bp = cur->bc_bufs[level];
-					block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
-					if ((error =
-						xfs_btree_check_sblock(cur,
-							block, level, bp)))
-						return error;
-#endif
-					ptr = cur->bc_ptrs[level];
-					nrec.ar_startblock = nkey.ar_startblock;
-					nrec.ar_blockcount = nkey.ar_blockcount;
-				}
-				/*
-				 * Otherwise the insert fails.
-				 */
-				else {
-					*stat = 0;
-					return 0;
-				}
-			}
-		}
-	}
-	/*
-	 * At this point we know there's room for our new entry in the block
-	 * we're pointing at.
-	 */
-	if (level > 0) {
-		/*
-		 * It's a non-leaf entry.  Make a hole for the new data
-		 * in the key and ptr regions of the block.
-		 */
-		kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
-		pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
-		for (i = be16_to_cpu(block->bb_numrecs); i >= ptr; i--) {
-			if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
-				return error;
-		}
-#endif
-		memmove(&kp[ptr], &kp[ptr - 1],
-			(be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*kp));
-		memmove(&pp[ptr], &pp[ptr - 1],
-			(be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*pp));
-#ifdef DEBUG
-		if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
-			return error;
-#endif
-		/*
-		 * Now stuff the new data in, bump numrecs and log the new data.
-		 */
-		kp[ptr - 1] = key;
-		pp[ptr - 1] = cpu_to_be32(*bnop);
-		be16_add(&block->bb_numrecs, 1);
-		xfs_alloc_log_keys(cur, bp, ptr, be16_to_cpu(block->bb_numrecs));
-		xfs_alloc_log_ptrs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs));
-#ifdef DEBUG
-		if (ptr < be16_to_cpu(block->bb_numrecs))
-			xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
-				kp + ptr);
-#endif
-	} else {
-		/*
-		 * It's a leaf entry.  Make a hole for the new record.
-		 */
-		rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
-		memmove(&rp[ptr], &rp[ptr - 1],
-			(be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*rp));
-		/*
-		 * Now stuff the new record in, bump numrecs
-		 * and log the new data.
-		 */
-		rp[ptr - 1] = *recp; /* INT_: struct copy */
-		be16_add(&block->bb_numrecs, 1);
-		xfs_alloc_log_recs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs));
-#ifdef DEBUG
-		if (ptr < be16_to_cpu(block->bb_numrecs))
-			xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
-				rp + ptr);
-#endif
-	}
-	/*
-	 * Log the new number of records in the btree header.
-	 */
-	xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
-	/*
-	 * If we inserted at the start of a block, update the parents' keys.
-	 */
-	if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1)))
-		return error;
-	/*
-	 * Look to see if the longest extent in the allocation group
-	 * needs to be updated.
-	 */
 
-	agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-	if (level == 0 &&
-	    cur->bc_btnum == XFS_BTNUM_CNT &&
-	    be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
-	    be32_to_cpu(recp->ar_blockcount) > be32_to_cpu(agf->agf_longest)) {
-		/*
-		 * If this is a leaf in the by-size btree and there
-		 * is no right sibling block and this block is bigger
-		 * than the previous longest block, update it.
-		 */
-		agf->agf_longest = recp->ar_blockcount;
-		cur->bc_mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest
-			= be32_to_cpu(recp->ar_blockcount);
-		xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-			XFS_AGF_LONGEST);
-	}
-	/*
-	 * Return the new block number, if any.
-	 * If there is one, give back a record value and a cursor too.
-	 */
-	*bnop = nbno;
-	if (nbno != NULLAGBLOCK) {
-		*recp = nrec; /* INT_: struct copy */
-		*curp = ncur; /* INT_: struct copy */
-	}
+	xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+
+	xfs_trans_agbtree_delta(cur->bc_tp, 1);
+	new->s = cpu_to_be32(bno);
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
 	*stat = 1;
 	return 0;
 }
 
-/*
- * Log header fields from a btree block.
- */
-STATIC void
-xfs_alloc_log_block(
-	xfs_trans_t		*tp,	/* transaction pointer */
-	xfs_buf_t		*bp,	/* buffer containing btree block */
-	int			fields)	/* mask of fields: XFS_BB_... */
+STATIC int
+xfs_allocbt_free_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
 {
-	int			first;	/* first byte offset logged */
-	int			last;	/* last byte offset logged */
-	static const short	offsets[] = {	/* table of offsets */
-		offsetof(xfs_alloc_block_t, bb_magic),
-		offsetof(xfs_alloc_block_t, bb_level),
-		offsetof(xfs_alloc_block_t, bb_numrecs),
-		offsetof(xfs_alloc_block_t, bb_leftsib),
-		offsetof(xfs_alloc_block_t, bb_rightsib),
-		sizeof(xfs_alloc_block_t)
-	};
-
-	xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
-	xfs_trans_log_buf(tp, bp, first, last);
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	xfs_agblock_t		bno;
+	int			error;
+
+	bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
+	error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
+	if (error)
+		return error;
+
+	xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+			      XFS_EXTENT_BUSY_SKIP_DISCARD);
+	xfs_trans_agbtree_delta(cur->bc_tp, -1);
+
+	xfs_trans_binval(cur->bc_tp, bp);
+	return 0;
 }
 
 /*
- * Log keys from a btree block (nonleaf).
+ * Update the longest extent in the AGF
  */
 STATIC void
-xfs_alloc_log_keys(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_buf_t		*bp,	/* buffer containing btree block */
-	int			kfirst,	/* index of first key to log */
-	int			klast)	/* index of last key to log */
+xfs_allocbt_update_lastrec(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block,
+	union xfs_btree_rec	*rec,
+	int			ptr,
+	int			reason)
 {
-	xfs_alloc_block_t	*block;	/* btree block to log from */
-	int			first;	/* first byte offset logged */
-	xfs_alloc_key_t		*kp;	/* key pointer in btree block */
-	int			last;	/* last byte offset logged */
-
-	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-	kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
-	first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
-	last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
-	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
+	struct xfs_perag	*pag;
+	__be32			len;
+	int			numrecs;
+
+	ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+
+	switch (reason) {
+	case LASTREC_UPDATE:
+		/*
+		 * If this is the last leaf block and it's the last record,
+		 * then update the size of the longest extent in the AG.
+		 */
+		if (ptr != xfs_btree_get_numrecs(block))
+			return;
+		len = rec->alloc.ar_blockcount;
+		break;
+	case LASTREC_INSREC:
+		if (be32_to_cpu(rec->alloc.ar_blockcount) <=
+		    be32_to_cpu(agf->agf_longest))
+			return;
+		len = rec->alloc.ar_blockcount;
+		break;
+	case LASTREC_DELREC:
+		numrecs = xfs_btree_get_numrecs(block);
+		if (ptr <= numrecs)
+			return;
+		ASSERT(ptr == numrecs + 1);
+
+		if (numrecs) {
+			xfs_alloc_rec_t *rrp;
+
+			rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
+			len = rrp->ar_blockcount;
+		} else {
+			len = 0;
+		}
+
+		break;
+	default:
+		ASSERT(0);
+		return;
+	}
+
+	agf->agf_longest = len;
+	pag = xfs_perag_get(cur->bc_mp, seqno);
+	pag->pagf_longest = be32_to_cpu(len);
+	xfs_perag_put(pag);
+	xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
 }
 
-/*
- * Log block pointer fields from a btree block (nonleaf).
- */
-STATIC void
-xfs_alloc_log_ptrs(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_buf_t		*bp,	/* buffer containing btree block */
-	int			pfirst,	/* index of first pointer to log */
-	int			plast)	/* index of last pointer to log */
+STATIC int
+xfs_allocbt_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
 {
-	xfs_alloc_block_t	*block;	/* btree block to log from */
-	int			first;	/* first byte offset logged */
-	int			last;	/* last byte offset logged */
-	xfs_alloc_ptr_t		*pp;	/* block-pointer pointer in btree blk */
-
-	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-	pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
-	first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
-	last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
-	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+	return cur->bc_mp->m_alloc_mnr[level != 0];
 }
 
-/*
- * Log records from a btree block (leaf).
- */
-STATIC void
-xfs_alloc_log_recs(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_buf_t		*bp,	/* buffer containing btree block */
-	int			rfirst,	/* index of first record to log */
-	int			rlast)	/* index of last record to log */
+STATIC int
+xfs_allocbt_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
 {
-	xfs_alloc_block_t	*block;	/* btree block to log from */
-	int			first;	/* first byte offset logged */
-	int			last;	/* last byte offset logged */
-	xfs_alloc_rec_t		*rp;	/* record pointer for btree block */
-
-
-	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-	rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
-#ifdef DEBUG
-	{
-		xfs_agf_t	*agf;
-		xfs_alloc_rec_t	*p;
-
-		agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-		for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++)
-			ASSERT(be32_to_cpu(p->ar_startblock) +
-			       be32_to_cpu(p->ar_blockcount) <=
-			       be32_to_cpu(agf->agf_length));
-	}
-#endif
-	first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
-	last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
-	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+	return cur->bc_mp->m_alloc_mxr[level != 0];
 }
 
-/*
- * Lookup the record.  The cursor is made to point to it, based on dir.
- * Return 0 if can't find any such record, 1 for success.
- */
-STATIC int				/* error */
-xfs_alloc_lookup(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_lookup_t		dir,	/* <=, ==, or >= */
-	int			*stat)	/* success/failure */
+STATIC void
+xfs_allocbt_init_key_from_rec(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
 {
-	xfs_agblock_t		agbno;	/* a.g. relative btree block number */
-	xfs_agnumber_t		agno;	/* allocation group number */
-	xfs_alloc_block_t	*block=NULL;	/* current btree block */
-	int			diff;	/* difference for the current key */
-	int			error;	/* error return value */
-	int			keyno=0;	/* current key number */
-	int			level;	/* level in the btree */
-	xfs_mount_t		*mp;	/* file system mount point */
-
-	XFS_STATS_INC(xs_abt_lookup);
-	/*
-	 * Get the allocation group header, and the root block number.
-	 */
-	mp = cur->bc_mp;
+	ASSERT(rec->alloc.ar_startblock != 0);
 
-	{
-		xfs_agf_t	*agf;	/* a.g. freespace header */
+	key->alloc.ar_startblock = rec->alloc.ar_startblock;
+	key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
+}
 
-		agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-		agno = be32_to_cpu(agf->agf_seqno);
-		agbno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
-	}
-	/*
-	 * Iterate over each level in the btree, starting at the root.
-	 * For each level above the leaves, find the key we need, based
-	 * on the lookup record, then follow the corresponding block
-	 * pointer down to the next level.
-	 */
-	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
-		xfs_buf_t	*bp;	/* buffer pointer for btree block */
-		xfs_daddr_t	d;	/* disk address of btree block */
+STATIC void
+xfs_allocbt_init_rec_from_key(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	ASSERT(key->alloc.ar_startblock != 0);
 
-		/*
-		 * Get the disk address we're looking for.
-		 */
-		d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-		/*
-		 * If the old buffer at this level is for a different block,
-		 * throw it away, otherwise just use it.
-		 */
-		bp = cur->bc_bufs[level];
-		if (bp && XFS_BUF_ADDR(bp) != d)
-			bp = (xfs_buf_t *)0;
-		if (!bp) {
-			/*
-			 * Need to get a new buffer.  Read it, then
-			 * set it in the cursor, releasing the old one.
-			 */
-			if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno,
-					agbno, 0, &bp, XFS_ALLOC_BTREE_REF)))
-				return error;
-			xfs_btree_setbuf(cur, level, bp);
-			/*
-			 * Point to the btree block, now that we have the buffer
-			 */
-			block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-			if ((error = xfs_btree_check_sblock(cur, block, level,
-					bp)))
-				return error;
-		} else
-			block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-		/*
-		 * If we already had a key match at a higher level, we know
-		 * we need to use the first entry in this block.
-		 */
-		if (diff == 0)
-			keyno = 1;
-		/*
-		 * Otherwise we need to search this block.  Do a binary search.
-		 */
-		else {
-			int		high;	/* high entry number */
-			xfs_alloc_key_t	*kkbase=NULL;/* base of keys in block */
-			xfs_alloc_rec_t	*krbase=NULL;/* base of records in block */
-			int		low;	/* low entry number */
-
-			/*
-			 * Get a pointer to keys or records.
-			 */
-			if (level > 0)
-				kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur);
-			else
-				krbase = XFS_ALLOC_REC_ADDR(block, 1, cur);
-			/*
-			 * Set low and high entry numbers, 1-based.
-			 */
-			low = 1;
-			if (!(high = be16_to_cpu(block->bb_numrecs))) {
-				/*
-				 * If the block is empty, the tree must
-				 * be an empty leaf.
-				 */
-				ASSERT(level == 0 && cur->bc_nlevels == 1);
-				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
-				*stat = 0;
-				return 0;
-			}
-			/*
-			 * Binary search the block.
-			 */
-			while (low <= high) {
-				xfs_extlen_t	blockcount;	/* key value */
-				xfs_agblock_t	startblock;	/* key value */
-
-				XFS_STATS_INC(xs_abt_compare);
-				/*
-				 * keyno is average of low and high.
-				 */
-				keyno = (low + high) >> 1;
-				/*
-				 * Get startblock & blockcount.
-				 */
-				if (level > 0) {
-					xfs_alloc_key_t	*kkp;
-
-					kkp = kkbase + keyno - 1;
-					startblock = be32_to_cpu(kkp->ar_startblock);
-					blockcount = be32_to_cpu(kkp->ar_blockcount);
-				} else {
-					xfs_alloc_rec_t	*krp;
-
-					krp = krbase + keyno - 1;
-					startblock = be32_to_cpu(krp->ar_startblock);
-					blockcount = be32_to_cpu(krp->ar_blockcount);
-				}
-				/*
-				 * Compute difference to get next direction.
-				 */
-				if (cur->bc_btnum == XFS_BTNUM_BNO)
-					diff = (int)startblock -
-					       (int)cur->bc_rec.a.ar_startblock;
-				else if (!(diff = (int)blockcount -
-					    (int)cur->bc_rec.a.ar_blockcount))
-					diff = (int)startblock -
-					    (int)cur->bc_rec.a.ar_startblock;
-				/*
-				 * Less than, move right.
-				 */
-				if (diff < 0)
-					low = keyno + 1;
-				/*
-				 * Greater than, move left.
-				 */
-				else if (diff > 0)
-					high = keyno - 1;
-				/*
-				 * Equal, we're done.
-				 */
-				else
-					break;
-			}
-		}
-		/*
-		 * If there are more levels, set up for the next level
-		 * by getting the block number and filling in the cursor.
-		 */
-		if (level > 0) {
-			/*
-			 * If we moved left, need the previous key number,
-			 * unless there isn't one.
-			 */
-			if (diff > 0 && --keyno < 1)
-				keyno = 1;
-			agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, keyno, cur));
-#ifdef DEBUG
-			if ((error = xfs_btree_check_sptr(cur, agbno, level)))
-				return error;
-#endif
-			cur->bc_ptrs[level] = keyno;
-		}
-	}
-	/*
-	 * Done with the search.
-	 * See if we need to adjust the results.
-	 */
-	if (dir != XFS_LOOKUP_LE && diff < 0) {
-		keyno++;
-		/*
-		 * If ge search and we went off the end of the block, but it's
-		 * not the last block, we're in the wrong block.
-		 */
-		if (dir == XFS_LOOKUP_GE &&
-		    keyno > be16_to_cpu(block->bb_numrecs) &&
-		    be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
-			int	i;
-
-			cur->bc_ptrs[0] = keyno;
-			if ((error = xfs_alloc_increment(cur, 0, &i)))
-				return error;
-			XFS_WANT_CORRUPTED_RETURN(i == 1);
-			*stat = 1;
-			return 0;
-		}
-	}
-	else if (dir == XFS_LOOKUP_LE && diff > 0)
-		keyno--;
-	cur->bc_ptrs[0] = keyno;
-	/*
-	 * Return if we succeeded or not.
-	 */
-	if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
-		*stat = 0;
-	else
-		*stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
-	return 0;
+	rec->alloc.ar_startblock = key->alloc.ar_startblock;
+	rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
 }
 
-/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int				/* error */
-xfs_alloc_lshift(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			level,	/* level to shift record on */
-	int			*stat)	/* success/failure */
+STATIC void
+xfs_allocbt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
 {
-	int			error;	/* error return value */
-#ifdef DEBUG
-	int			i;	/* loop index */
-#endif
-	xfs_alloc_key_t		key;	/* key value for leaf level upward */
-	xfs_buf_t		*lbp;	/* buffer for left neighbor block */
-	xfs_alloc_block_t	*left;	/* left neighbor btree block */
-	int			nrec;	/* new number of left block entries */
-	xfs_buf_t		*rbp;	/* buffer for right (current) block */
-	xfs_alloc_block_t	*right;	/* right (current) btree block */
-	xfs_alloc_key_t		*rkp=NULL;	/* key pointer for right block */
-	xfs_alloc_ptr_t		*rpp=NULL;	/* address pointer for right block */
-	xfs_alloc_rec_t		*rrp=NULL;	/* record pointer for right block */
+	ASSERT(cur->bc_rec.a.ar_startblock != 0);
 
-	/*
-	 * Set up variables for this block as "right".
-	 */
-	rbp = cur->bc_bufs[level];
-	right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-#ifdef DEBUG
-	if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-		return error;
-#endif
-	/*
-	 * If we've got no left sibling then we can't shift an entry left.
-	 */
-	if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
-		*stat = 0;
-		return 0;
-	}
-	/*
-	 * If the cursor entry is the one that would be moved, don't
-	 * do it... it's too complicated.
-	 */
-	if (cur->bc_ptrs[level] <= 1) {
-		*stat = 0;
-		return 0;
-	}
-	/*
-	 * Set up the left neighbor as "left".
-	 */
-	if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-			cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
-			0, &lbp, XFS_ALLOC_BTREE_REF)))
-		return error;
-	left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-		return error;
-	/*
-	 * If it's full, it can't take another entry.
-	 */
-	if (be16_to_cpu(left->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
-		*stat = 0;
-		return 0;
-	}
-	nrec = be16_to_cpu(left->bb_numrecs) + 1;
-	/*
-	 * If non-leaf, copy a key and a ptr to the left block.
-	 */
-	if (level > 0) {
-		xfs_alloc_key_t	*lkp;	/* key pointer for left block */
-		xfs_alloc_ptr_t	*lpp;	/* address pointer for left block */
-
-		lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur);
-		rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
-		*lkp = *rkp;
-		xfs_alloc_log_keys(cur, lbp, nrec, nrec);
-		lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur);
-		rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-		if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
-			return error;
-#endif
-		*lpp = *rpp; /* INT_: copy */
-		xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
-		xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
-	}
-	/*
-	 * If leaf, copy a record to the left block.
-	 */
-	else {
-		xfs_alloc_rec_t	*lrp;	/* record pointer for left block */
-
-		lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur);
-		rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-		*lrp = *rrp;
-		xfs_alloc_log_recs(cur, lbp, nrec, nrec);
-		xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
-	}
-	/*
-	 * Bump and log left's numrecs, decrement and log right's numrecs.
-	 */
-	be16_add(&left->bb_numrecs, 1);
-	xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-	be16_add(&right->bb_numrecs, -1);
-	xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
-	/*
-	 * Slide the contents of right down one entry.
-	 */
-	if (level > 0) {
-#ifdef DEBUG
-		for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
-			if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
-					level)))
-				return error;
-		}
-#endif
-		memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-		memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-		xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-		xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-	} else {
-		memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-		xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-		key.ar_startblock = rrp->ar_startblock;
-		key.ar_blockcount = rrp->ar_blockcount;
-		rkp = &key;
-	}
-	/*
-	 * Update the parent key values of right.
-	 */
-	if ((error = xfs_alloc_updkey(cur, rkp, level + 1)))
-		return error;
-	/*
-	 * Slide the cursor value left one.
-	 */
-	cur->bc_ptrs[level]--;
-	*stat = 1;
-	return 0;
+	rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
+	rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
 }
 
-/*
- * Allocate a new root block, fill it in.
- */
-STATIC int				/* error */
-xfs_alloc_newroot(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			*stat)	/* success/failure */
+STATIC void
+xfs_allocbt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
 {
-	int			error;	/* error return value */
-	xfs_agblock_t		lbno;	/* left block number */
-	xfs_buf_t		*lbp;	/* left btree buffer */
-	xfs_alloc_block_t	*left;	/* left btree block */
-	xfs_mount_t		*mp;	/* mount structure */
-	xfs_agblock_t		nbno;	/* new block number */
-	xfs_buf_t		*nbp;	/* new (root) buffer */
-	xfs_alloc_block_t	*new;	/* new (root) btree block */
-	int			nptr;	/* new value for key index, 1 or 2 */
-	xfs_agblock_t		rbno;	/* right block number */
-	xfs_buf_t		*rbp;	/* right btree buffer */
-	xfs_alloc_block_t	*right;	/* right btree block */
-
-	mp = cur->bc_mp;
-
-	ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp));
-	/*
-	 * Get a buffer from the freelist blocks, for the new root.
-	 */
-	if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
-			&nbno)))
-		return error;
-	/*
-	 * None available, we fail.
-	 */
-	if (nbno == NULLAGBLOCK) {
-		*stat = 0;
-		return 0;
-	}
-	xfs_trans_agbtree_delta(cur->bc_tp, 1);
-	nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno,
-		0);
-	new = XFS_BUF_TO_ALLOC_BLOCK(nbp);
-	/*
-	 * Set the root data in the a.g. freespace structure.
-	 */
-	{
-		xfs_agf_t	*agf;	/* a.g. freespace header */
-		xfs_agnumber_t	seqno;
-
-		agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-		agf->agf_roots[cur->bc_btnum] = cpu_to_be32(nbno);
-		be32_add(&agf->agf_levels[cur->bc_btnum], 1);
-		seqno = be32_to_cpu(agf->agf_seqno);
-		mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++;
-		xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-			XFS_AGF_ROOTS | XFS_AGF_LEVELS);
-	}
-	/*
-	 * At the previous root level there are now two blocks: the old
-	 * root, and the new block generated when it was split.
-	 * We don't know which one the cursor is pointing at, so we
-	 * set up variables "left" and "right" for each case.
-	 */
-	lbp = cur->bc_bufs[cur->bc_nlevels - 1];
-	left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
-	if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp)))
-		return error;
-#endif
-	if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
-		/*
-		 * Our block is left, pick up the right block.
-		 */
-		lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp));
-		rbno = be32_to_cpu(left->bb_rightsib);
-		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-				cur->bc_private.a.agno, rbno, 0, &rbp,
-				XFS_ALLOC_BTREE_REF)))
-			return error;
-		right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-		if ((error = xfs_btree_check_sblock(cur, right,
-				cur->bc_nlevels - 1, rbp)))
-			return error;
-		nptr = 1;
-	} else {
-		/*
-		 * Our block is right, pick up the left block.
-		 */
-		rbp = lbp;
-		right = left;
-		rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp));
-		lbno = be32_to_cpu(right->bb_leftsib);
-		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-				cur->bc_private.a.agno, lbno, 0, &lbp,
-				XFS_ALLOC_BTREE_REF)))
-			return error;
-		left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-		if ((error = xfs_btree_check_sblock(cur, left,
-				cur->bc_nlevels - 1, lbp)))
-			return error;
-		nptr = 2;
-	}
-	/*
-	 * Fill in the new block's btree header and log it.
-	 */
-	new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
-	new->bb_level = cpu_to_be16(cur->bc_nlevels);
-	new->bb_numrecs = cpu_to_be16(2);
-	new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-	new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-	xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS);
-	ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
-	/*
-	 * Fill in the key data in the new root.
-	 */
-	{
-		xfs_alloc_key_t		*kp;	/* btree key pointer */
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
 
-		kp = XFS_ALLOC_KEY_ADDR(new, 1, cur);
-		if (be16_to_cpu(left->bb_level) > 0) {
-			kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); /* INT_: structure copy */
-			kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);/* INT_: structure copy */
-		} else {
-			xfs_alloc_rec_t	*rp;	/* btree record pointer */
-
-			rp = XFS_ALLOC_REC_ADDR(left, 1, cur);
-			kp[0].ar_startblock = rp->ar_startblock;
-			kp[0].ar_blockcount = rp->ar_blockcount;
-			rp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-			kp[1].ar_startblock = rp->ar_startblock;
-			kp[1].ar_blockcount = rp->ar_blockcount;
-		}
-	}
-	xfs_alloc_log_keys(cur, nbp, 1, 2);
-	/*
-	 * Fill in the pointer data in the new root.
-	 */
-	{
-		xfs_alloc_ptr_t		*pp;	/* btree address pointer */
+	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+	ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
 
-		pp = XFS_ALLOC_PTR_ADDR(new, 1, cur);
-		pp[0] = cpu_to_be32(lbno);
-		pp[1] = cpu_to_be32(rbno);
-	}
-	xfs_alloc_log_ptrs(cur, nbp, 1, 2);
-	/*
-	 * Fix up the cursor.
-	 */
-	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
-	cur->bc_ptrs[cur->bc_nlevels] = nptr;
-	cur->bc_nlevels++;
-	*stat = 1;
-	return 0;
+	ptr->s = agf->agf_roots[cur->bc_btnum];
 }
 
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int				/* error */
-xfs_alloc_rshift(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			level,	/* level to shift record on */
-	int			*stat)	/* success/failure */
+STATIC __int64_t
+xfs_allocbt_key_diff(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*key)
 {
-	int			error;	/* error return value */
-	int			i;	/* loop index */
-	xfs_alloc_key_t		key;	/* key value for leaf level upward */
-	xfs_buf_t		*lbp;	/* buffer for left (current) block */
-	xfs_alloc_block_t	*left;	/* left (current) btree block */
-	xfs_buf_t		*rbp;	/* buffer for right neighbor block */
-	xfs_alloc_block_t	*right;	/* right neighbor btree block */
-	xfs_alloc_key_t		*rkp;	/* key pointer for right block */
-	xfs_btree_cur_t		*tcur;	/* temporary cursor */
+	xfs_alloc_rec_incore_t	*rec = &cur->bc_rec.a;
+	xfs_alloc_key_t		*kp = &key->alloc;
+	__int64_t		diff;
 
-	/*
-	 * Set up variables for this block as "left".
-	 */
-	lbp = cur->bc_bufs[level];
-	left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
-	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-		return error;
-#endif
-	/*
-	 * If we've got no right sibling then we can't shift an entry right.
-	 */
-	if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
-		*stat = 0;
-		return 0;
-	}
-	/*
-	 * If the cursor entry is the one that would be moved, don't
-	 * do it... it's too complicated.
-	 */
-	if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
-		*stat = 0;
-		return 0;
+	if (cur->bc_btnum == XFS_BTNUM_BNO) {
+		return (__int64_t)be32_to_cpu(kp->ar_startblock) -
+				rec->ar_startblock;
 	}
-	/*
-	 * Set up the right neighbor as "right".
-	 */
-	if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-			cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
-			0, &rbp, XFS_ALLOC_BTREE_REF)))
-		return error;
-	right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-	if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-		return error;
-	/*
-	 * If it's full, it can't take another entry.
-	 */
-	if (be16_to_cpu(right->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
-		*stat = 0;
-		return 0;
-	}
-	/*
-	 * Make a hole at the start of the right neighbor block, then
-	 * copy the last left block entry to the hole.
-	 */
-	if (level > 0) {
-		xfs_alloc_key_t	*lkp;	/* key pointer for left block */
-		xfs_alloc_ptr_t	*lpp;	/* address pointer for left block */
-		xfs_alloc_ptr_t	*rpp;	/* address pointer for right block */
-
-		lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-		lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-		rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
-		rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-		for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
-			if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
-				return error;
-		}
-#endif
-		memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-		memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-#ifdef DEBUG
-		if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
-			return error;
-#endif
-		*rkp = *lkp; /* INT_: copy */
-		*rpp = *lpp; /* INT_: copy */
-		xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-		xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-		xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
-	} else {
-		xfs_alloc_rec_t	*lrp;	/* record pointer for left block */
-		xfs_alloc_rec_t	*rrp;	/* record pointer for right block */
-
-		lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-		rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-		memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-		*rrp = *lrp;
-		xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-		key.ar_startblock = rrp->ar_startblock;
-		key.ar_blockcount = rrp->ar_blockcount;
-		rkp = &key;
-		xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
-	}
-	/*
-	 * Decrement and log left's numrecs, bump and log right's numrecs.
-	 */
-	be16_add(&left->bb_numrecs, -1);
-	xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-	be16_add(&right->bb_numrecs, 1);
-	xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
-	/*
-	 * Using a temporary cursor, update the parent key values of the
-	 * block on the right.
-	 */
-	if ((error = xfs_btree_dup_cursor(cur, &tcur)))
-		return error;
-	i = xfs_btree_lastrec(tcur, level);
-	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-	if ((error = xfs_alloc_increment(tcur, level, &i)) ||
-	    (error = xfs_alloc_updkey(tcur, rkp, level + 1)))
-		goto error0;
-	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-	*stat = 1;
-	return 0;
-error0:
-	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-	return error;
-}
 
-/*
- * Split cur/level block in half.
- * Return new block number and its first record (to be inserted into parent).
- */
-STATIC int				/* error */
-xfs_alloc_split(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			level,	/* level to split */
-	xfs_agblock_t		*bnop,	/* output: block number allocated */
-	xfs_alloc_key_t		*keyp,	/* output: first key of new block */
-	xfs_btree_cur_t		**curp,	/* output: new cursor */
-	int			*stat)	/* success/failure */
-{
-	int			error;	/* error return value */
-	int			i;	/* loop index/record number */
-	xfs_agblock_t		lbno;	/* left (current) block number */
-	xfs_buf_t		*lbp;	/* buffer for left block */
-	xfs_alloc_block_t	*left;	/* left (current) btree block */
-	xfs_agblock_t		rbno;	/* right (new) block number */
-	xfs_buf_t		*rbp;	/* buffer for right block */
-	xfs_alloc_block_t	*right;	/* right (new) btree block */
+	diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
+	if (diff)
+		return diff;
 
-	/*
-	 * Allocate the new block from the freelist.
-	 * If we can't do it, we're toast.  Give up.
-	 */
-	if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
-			&rbno)))
-		return error;
-	if (rbno == NULLAGBLOCK) {
-		*stat = 0;
-		return 0;
-	}
-	xfs_trans_agbtree_delta(cur->bc_tp, 1);
-	rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno,
-		rbno, 0);
-	/*
-	 * Set up the new block as "right".
-	 */
-	right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-	/*
-	 * "Left" is the current (according to the cursor) block.
-	 */
-	lbp = cur->bc_bufs[level];
-	left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
-	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-		return error;
-#endif
-	/*
-	 * Fill in the btree header for the new block.
-	 */
-	right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
-	right->bb_level = left->bb_level;
-	right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
-	/*
-	 * Make sure that if there's an odd number of entries now, that
-	 * each new block will have the same number of entries.
-	 */
-	if ((be16_to_cpu(left->bb_numrecs) & 1) &&
-	    cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
-		be16_add(&right->bb_numrecs, 1);
-	i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
-	/*
-	 * For non-leaf blocks, copy keys and addresses over to the new block.
-	 */
-	if (level > 0) {
-		xfs_alloc_key_t	*lkp;	/* left btree key pointer */
-		xfs_alloc_ptr_t	*lpp;	/* left btree address pointer */
-		xfs_alloc_key_t	*rkp;	/* right btree key pointer */
-		xfs_alloc_ptr_t	*rpp;	/* right btree address pointer */
-
-		lkp = XFS_ALLOC_KEY_ADDR(left, i, cur);
-		lpp = XFS_ALLOC_PTR_ADDR(left, i, cur);
-		rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
-		rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-		for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
-			if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
-				return error;
-		}
-#endif
-		memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-		memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-		xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-		xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-		*keyp = *rkp;
-	}
-	/*
-	 * For leaf blocks, copy records over to the new block.
-	 */
-	else {
-		xfs_alloc_rec_t	*lrp;	/* left btree record pointer */
-		xfs_alloc_rec_t	*rrp;	/* right btree record pointer */
-
-		lrp = XFS_ALLOC_REC_ADDR(left, i, cur);
-		rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-		memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-		xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-		keyp->ar_startblock = rrp->ar_startblock;
-		keyp->ar_blockcount = rrp->ar_blockcount;
-	}
-	/*
-	 * Find the left block number by looking in the buffer.
-	 * Adjust numrecs, sibling pointers.
-	 */
-	lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp));
-	be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
-	right->bb_rightsib = left->bb_rightsib;
-	left->bb_rightsib = cpu_to_be32(rbno);
-	right->bb_leftsib = cpu_to_be32(lbno);
-	xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS);
-	xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-	/*
-	 * If there's a block to the new block's right, make that block
-	 * point back to right instead of to left.
-	 */
-	if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
-		xfs_alloc_block_t	*rrblock;	/* rr btree block */
-		xfs_buf_t		*rrbp;		/* buffer for rrblock */
-
-		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-				cur->bc_private.a.agno, be32_to_cpu(right->bb_rightsib), 0,
-				&rrbp, XFS_ALLOC_BTREE_REF)))
-			return error;
-		rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
-		if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
-			return error;
-		rrblock->bb_leftsib = cpu_to_be32(rbno);
-		xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
-	}
-	/*
-	 * If the cursor is really in the right block, move it there.
-	 * If it's just pointing past the last entry in left, then we'll
-	 * insert there, so don't change anything in that case.
-	 */
-	if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
-		xfs_btree_setbuf(cur, level, rbp);
-		cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
-	}
-	/*
-	 * If there are more levels, we'll need another cursor which refers to
-	 * the right block, no matter where this cursor was.
-	 */
-	if (level + 1 < cur->bc_nlevels) {
-		if ((error = xfs_btree_dup_cursor(cur, curp)))
-			return error;
-		(*curp)->bc_ptrs[level + 1]++;
-	}
-	*bnop = rbno;
-	*stat = 1;
-	return 0;
+	return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
 }
 
-/*
- * Update keys at all levels from here to the root along the cursor's path.
- */
-STATIC int				/* error */
-xfs_alloc_updkey(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_alloc_key_t		*keyp,	/* new key value to update to */
-	int			level)	/* starting level for update */
+static bool
+xfs_allocbt_verify(
+	struct xfs_buf		*bp)
 {
-	int			ptr;	/* index of key in block */
-
-	/*
-	 * Go up the tree from this level toward the root.
-	 * At each level, update the key value to the value input.
-	 * Stop when we reach a level where the cursor isn't pointing
-	 * at the first entry in the block.
-	 */
-	for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
-		xfs_alloc_block_t	*block;	/* btree block */
-		xfs_buf_t		*bp;	/* buffer for block */
-#ifdef DEBUG
-		int			error;	/* error return value */
-#endif
-		xfs_alloc_key_t		*kp;	/* ptr to btree block keys */
-
-		bp = cur->bc_bufs[level];
-		block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
-		if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-			return error;
-#endif
-		ptr = cur->bc_ptrs[level];
-		kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
-		*kp = *keyp;
-		xfs_alloc_log_keys(cur, bp, ptr, ptr);
-	}
-	return 0;
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_perag	*pag = bp->b_pag;
+	unsigned int		level;
+
+	/*
+	 * magic number and level verification
+	 *
+	 * During growfs operations, we can't verify the exact level or owner as
+	 * the perag is not fully initialised and hence not attached to the
+	 * buffer.  In this case, check against the maximum tree depth.
+	 *
+	 * Similarly, during log recovery we will have a perag structure
+	 * attached, but the agf information will not yet have been initialised
+	 * from the on disk AGF. Again, we can only check against maximum limits
+	 * in this case.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	switch (block->bb_magic) {
+	case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			return false;
+		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+			return false;
+		if (pag &&
+		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+			return false;
+		/* fall through */
+	case cpu_to_be32(XFS_ABTB_MAGIC):
+		if (pag && pag->pagf_init) {
+			if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
+				return false;
+		} else if (level >= mp->m_ag_maxlevels)
+			return false;
+		break;
+	case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
+		if (!xfs_sb_version_hascrc(&mp->m_sb))
+			return false;
+		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+			return false;
+		if (pag &&
+		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+			return false;
+		/* fall through */
+	case cpu_to_be32(XFS_ABTC_MAGIC):
+		if (pag && pag->pagf_init) {
+			if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
+				return false;
+		} else if (level >= mp->m_ag_maxlevels)
+			return false;
+		break;
+	default:
+		return false;
+	}
+
+	/* numrecs verification */
+	if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
+		return false;
+
+	/* sibling pointer verification */
+	if (!block->bb_u.s.bb_leftsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+	if (!block->bb_u.s.bb_rightsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+
+	return true;
 }
 
-/*
- * Externally visible routines.
- */
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int					/* error */
-xfs_alloc_decrement(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			level,	/* level in btree, 0 is leaf */
-	int			*stat)	/* success/failure */
+static void
+xfs_allocbt_read_verify(
+	struct xfs_buf	*bp)
 {
-	xfs_alloc_block_t	*block;	/* btree block */
-	int			error;	/* error return value */
-	int			lev;	/* btree level */
+	if (!xfs_btree_sblock_verify_crc(bp))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_allocbt_verify(bp))
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
 
-	ASSERT(level < cur->bc_nlevels);
-	/*
-	 * Read-ahead to the left at this level.
-	 */
-	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
-	/*
-	 * Decrement the ptr at this level.  If we're still in the block
-	 * then we're done.
-	 */
-	if (--cur->bc_ptrs[level] > 0) {
-		*stat = 1;
-		return 0;
-	}
-	/*
-	 * Get a pointer to the btree block.
-	 */
-	block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]);
-#ifdef DEBUG
-	if ((error = xfs_btree_check_sblock(cur, block, level,
-			cur->bc_bufs[level])))
-		return error;
-#endif
-	/*
-	 * If we just went off the left edge of the tree, return failure.
-	 */
-	if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
-		*stat = 0;
-		return 0;
-	}
-	/*
-	 * March up the tree decrementing pointers.
-	 * Stop when we don't go off the left edge of a block.
-	 */
-	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-		if (--cur->bc_ptrs[lev] > 0)
-			break;
-		/*
-		 * Read-ahead the left block, we're going to read it
-		 * in the next loop.
-		 */
-		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
-	}
-	/*
-	 * If we went off the root then we are seriously confused.
-	 */
-	ASSERT(lev < cur->bc_nlevels);
-	/*
-	 * Now walk back down the tree, fixing up the cursor's buffer
-	 * pointers and key numbers.
-	 */
-	for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
-		xfs_agblock_t	agbno;	/* block number of btree block */
-		xfs_buf_t	*bp;	/* buffer pointer for block */
-
-		agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
-		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-				cur->bc_private.a.agno, agbno, 0, &bp,
-				XFS_ALLOC_BTREE_REF)))
-			return error;
-		lev--;
-		xfs_btree_setbuf(cur, lev, bp);
-		block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-			return error;
-		cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
+	if (bp->b_error) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp);
 	}
-	*stat = 1;
-	return 0;
 }
 
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-int					/* error */
-xfs_alloc_delete(
-	xfs_btree_cur_t	*cur,		/* btree cursor */
-	int		*stat)		/* success/failure */
+static void
+xfs_allocbt_write_verify(
+	struct xfs_buf	*bp)
 {
-	int		error;		/* error return value */
-	int		i;		/* result code */
-	int		level;		/* btree level */
-
-	/*
-	 * Go up the tree, starting at leaf level.
-	 * If 2 is returned then a join was done; go to the next level.
-	 * Otherwise we are done.
-	 */
-	for (level = 0, i = 2; i == 2; level++) {
-		if ((error = xfs_alloc_delrec(cur, level, &i)))
-			return error;
+	if (!xfs_allocbt_verify(bp)) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
 	}
-	if (i == 0) {
-		for (level = 1; level < cur->bc_nlevels; level++) {
-			if (cur->bc_ptrs[level] == 0) {
-				if ((error = xfs_alloc_decrement(cur, level, &i)))
-					return error;
-				break;
-			}
-		}
-	}
-	*stat = i;
-	return 0;
+	xfs_btree_sblock_calc_crc(bp);
+
 }
 
-/*
- * Get the data from the pointed-to record.
- */
-int					/* error */
-xfs_alloc_get_rec(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_agblock_t		*bno,	/* output: starting block of extent */
-	xfs_extlen_t		*len,	/* output: length of extent */
-	int			*stat)	/* output: success/failure */
-{
-	xfs_alloc_block_t	*block;	/* btree block */
-#ifdef DEBUG
-	int			error;	/* error return value */
-#endif
-	int			ptr;	/* record number */
+const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+	.verify_read = xfs_allocbt_read_verify,
+	.verify_write = xfs_allocbt_write_verify,
+};
 
-	ptr = cur->bc_ptrs[0];
-	block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
-#ifdef DEBUG
-	if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
-		return error;
-#endif
-	/*
-	 * Off the right end or left end, return failure.
-	 */
-	if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
-		*stat = 0;
-		return 0;
-	}
-	/*
-	 * Point to the record and extract its data.
-	 */
-	{
-		xfs_alloc_rec_t		*rec;	/* record data */
 
-		rec = XFS_ALLOC_REC_ADDR(block, ptr, cur);
-		*bno = be32_to_cpu(rec->ar_startblock);
-		*len = be32_to_cpu(rec->ar_blockcount);
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_allocbt_keys_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	if (cur->bc_btnum == XFS_BTNUM_BNO) {
+		return be32_to_cpu(k1->alloc.ar_startblock) <
+		       be32_to_cpu(k2->alloc.ar_startblock);
+	} else {
+		return be32_to_cpu(k1->alloc.ar_blockcount) <
+			be32_to_cpu(k2->alloc.ar_blockcount) ||
+			(k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
+			 be32_to_cpu(k1->alloc.ar_startblock) <
+			 be32_to_cpu(k2->alloc.ar_startblock));
 	}
-	*stat = 1;
-	return 0;
 }
 
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int					/* error */
-xfs_alloc_increment(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			level,	/* level in btree, 0 is leaf */
-	int			*stat)	/* success/failure */
+STATIC int
+xfs_allocbt_recs_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*r1,
+	union xfs_btree_rec	*r2)
 {
-	xfs_alloc_block_t	*block;	/* btree block */
-	xfs_buf_t		*bp;	/* tree block buffer */
-	int			error;	/* error return value */
-	int			lev;	/* btree level */
-
-	ASSERT(level < cur->bc_nlevels);
-	/*
-	 * Read-ahead to the right at this level.
-	 */
-	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
-	/*
-	 * Get a pointer to the btree block.
-	 */
-	bp = cur->bc_bufs[level];
-	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
-	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-		return error;
-#endif
-	/*
-	 * Increment the ptr at this level.  If we're still in the block
-	 * then we're done.
-	 */
-	if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
-		*stat = 1;
-		return 0;
-	}
-	/*
-	 * If we just went off the right edge of the tree, return failure.
-	 */
-	if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
-		*stat = 0;
-		return 0;
-	}
-	/*
-	 * March up the tree incrementing pointers.
-	 * Stop when we don't go off the right edge of a block.
-	 */
-	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-		bp = cur->bc_bufs[lev];
-		block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
-		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-			return error;
-#endif
-		if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
-			break;
-		/*
-		 * Read-ahead the right block, we're going to read it
-		 * in the next loop.
-		 */
-		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
-	}
-	/*
-	 * If we went off the root then we are seriously confused.
-	 */
-	ASSERT(lev < cur->bc_nlevels);
-	/*
-	 * Now walk back down the tree, fixing up the cursor's buffer
-	 * pointers and key numbers.
-	 */
-	for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-	     lev > level; ) {
-		xfs_agblock_t	agbno;	/* block number of btree block */
-
-		agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
-		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-				cur->bc_private.a.agno, agbno, 0, &bp,
-				XFS_ALLOC_BTREE_REF)))
-			return error;
-		lev--;
-		xfs_btree_setbuf(cur, lev, bp);
-		block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-			return error;
-		cur->bc_ptrs[lev] = 1;
+	if (cur->bc_btnum == XFS_BTNUM_BNO) {
+		return be32_to_cpu(r1->alloc.ar_startblock) +
+			be32_to_cpu(r1->alloc.ar_blockcount) <=
+			be32_to_cpu(r2->alloc.ar_startblock);
+	} else {
+		return be32_to_cpu(r1->alloc.ar_blockcount) <
+			be32_to_cpu(r2->alloc.ar_blockcount) ||
+			(r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
+			 be32_to_cpu(r1->alloc.ar_startblock) <
+			 be32_to_cpu(r2->alloc.ar_startblock));
 	}
-	*stat = 1;
-	return 0;
 }
+#endif	/* DEBUG */
+
+static const struct xfs_btree_ops xfs_allocbt_ops = {
+	.rec_len		= sizeof(xfs_alloc_rec_t),
+	.key_len		= sizeof(xfs_alloc_key_t),
+
+	.dup_cursor		= xfs_allocbt_dup_cursor,
+	.set_root		= xfs_allocbt_set_root,
+	.alloc_block		= xfs_allocbt_alloc_block,
+	.free_block		= xfs_allocbt_free_block,
+	.update_lastrec		= xfs_allocbt_update_lastrec,
+	.get_minrecs		= xfs_allocbt_get_minrecs,
+	.get_maxrecs		= xfs_allocbt_get_maxrecs,
+	.init_key_from_rec	= xfs_allocbt_init_key_from_rec,
+	.init_rec_from_key	= xfs_allocbt_init_rec_from_key,
+	.init_rec_from_cur	= xfs_allocbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_allocbt_init_ptr_from_cur,
+	.key_diff		= xfs_allocbt_key_diff,
+	.buf_ops		= &xfs_allocbt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+	.keys_inorder		= xfs_allocbt_keys_inorder,
+	.recs_inorder		= xfs_allocbt_recs_inorder,
+#endif
+};
 
 /*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
+ * Allocate a new allocation btree cursor.
  */
-int					/* error */
-xfs_alloc_insert(
-	xfs_btree_cur_t	*cur,		/* btree cursor */
-	int		*stat)		/* success/failure */
+struct xfs_btree_cur *			/* new alloc btree cursor */
+xfs_allocbt_init_cursor(
+	struct xfs_mount	*mp,		/* file system mount point */
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_buf		*agbp,		/* buffer for agf structure */
+	xfs_agnumber_t		agno,		/* allocation group number */
+	xfs_btnum_t		btnum)		/* btree identifier */
 {
-	int		error;		/* error return value */
-	int		i;		/* result value, 0 for failure */
-	int		level;		/* current level number in btree */
-	xfs_agblock_t	nbno;		/* new block number (split result) */
-	xfs_btree_cur_t	*ncur;		/* new cursor (split result) */
-	xfs_alloc_rec_t	nrec;		/* record being inserted this level */
-	xfs_btree_cur_t	*pcur;		/* previous level's cursor */
-
-	level = 0;
-	nbno = NULLAGBLOCK;
-	nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
-	nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
-	ncur = (xfs_btree_cur_t *)0;
-	pcur = cur;
-	/*
-	 * Loop going up the tree, starting at the leaf level.
-	 * Stop when we don't get a split block, that must mean that
-	 * the insert is finished with this level.
-	 */
-	do {
-		/*
-		 * Insert nrec/nbno into this level of the tree.
-		 * Note if we fail, nbno will be null.
-		 */
-		if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur,
-				&i))) {
-			if (pcur != cur)
-				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
-			return error;
-		}
-		/*
-		 * See if the cursor we just used is trash.
-		 * Can't trash the caller's cursor, but otherwise we should
-		 * if ncur is a new cursor or we're about to be done.
-		 */
-		if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
-			cur->bc_nlevels = pcur->bc_nlevels;
-			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
-		}
-		/*
-		 * If we got a new cursor, switch to it.
-		 */
-		if (ncur) {
-			pcur = ncur;
-			ncur = (xfs_btree_cur_t *)0;
-		}
-	} while (nbno != NULLAGBLOCK);
-	*stat = i;
-	return 0;
-}
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xfs_btree_cur	*cur;
 
-/*
- * Lookup the record equal to [bno, len] in the btree given by cur.
- */
-int					/* error */
-xfs_alloc_lookup_eq(
-	xfs_btree_cur_t	*cur,		/* btree cursor */
-	xfs_agblock_t	bno,		/* starting block of extent */
-	xfs_extlen_t	len,		/* length of extent */
-	int		*stat)		/* success/failure */
-{
-	cur->bc_rec.a.ar_startblock = bno;
-	cur->bc_rec.a.ar_blockcount = len;
-	return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
+	ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
 
-/*
- * Lookup the first record greater than or equal to [bno, len]
- * in the btree given by cur.
- */
-int					/* error */
-xfs_alloc_lookup_ge(
-	xfs_btree_cur_t	*cur,		/* btree cursor */
-	xfs_agblock_t	bno,		/* starting block of extent */
-	xfs_extlen_t	len,		/* length of extent */
-	int		*stat)		/* success/failure */
-{
-	cur->bc_rec.a.ar_startblock = bno;
-	cur->bc_rec.a.ar_blockcount = len;
-	return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat);
-}
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
 
-/*
- * Lookup the first record less than or equal to [bno, len]
- * in the btree given by cur.
- */
-int					/* error */
-xfs_alloc_lookup_le(
-	xfs_btree_cur_t	*cur,		/* btree cursor */
-	xfs_agblock_t	bno,		/* starting block of extent */
-	xfs_extlen_t	len,		/* length of extent */
-	int		*stat)		/* success/failure */
-{
-	cur->bc_rec.a.ar_startblock = bno;
-	cur->bc_rec.a.ar_blockcount = len;
-	return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat);
+	cur->bc_tp = tp;
+	cur->bc_mp = mp;
+	cur->bc_btnum = btnum;
+	cur->bc_blocklog = mp->m_sb.sb_blocklog;
+	cur->bc_ops = &xfs_allocbt_ops;
+
+	if (btnum == XFS_BTNUM_CNT) {
+		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+		cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
+	} else {
+		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+	}
+
+	cur->bc_private.a.agbp = agbp;
+	cur->bc_private.a.agno = agno;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+	return cur;
 }
 
 /*
- * Update the record referred to by cur, to the value given by [bno, len].
- * This either works (return 0) or gets an EFSCORRUPTED error.
+ * Calculate number of records in an alloc btree block.
  */
-int					/* error */
-xfs_alloc_update(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_agblock_t		bno,	/* starting block of extent */
-	xfs_extlen_t		len)	/* length of extent */
+int
+xfs_allocbt_maxrecs(
+	struct xfs_mount	*mp,
+	int			blocklen,
+	int			leaf)
 {
-	xfs_alloc_block_t	*block;	/* btree block to update */
-	int			error;	/* error return value */
-	int			ptr;	/* current record number (updating) */
+	blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
 
-	ASSERT(len > 0);
-	/*
-	 * Pick up the a.g. freelist struct and the current block.
-	 */
-	block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
-#ifdef DEBUG
-	if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
-		return error;
-#endif
-	/*
-	 * Get the address of the rec to be updated.
-	 */
-	ptr = cur->bc_ptrs[0];
-	{
-		xfs_alloc_rec_t		*rp;	/* pointer to updated record */
-
-		rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
-		/*
-		 * Fill in the new contents and log them.
-		 */
-		rp->ar_startblock = cpu_to_be32(bno);
-		rp->ar_blockcount = cpu_to_be32(len);
-		xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr);
-	}
-	/*
-	 * If it's the by-size btree and it's the last leaf block and
-	 * it's the last record... then update the size of the longest
-	 * extent in the a.g., which we cache in the a.g. freelist header.
-	 */
-	if (cur->bc_btnum == XFS_BTNUM_CNT &&
-	    be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
-	    ptr == be16_to_cpu(block->bb_numrecs)) {
-		xfs_agf_t	*agf;	/* a.g. freespace header */
-		xfs_agnumber_t	seqno;
-
-		agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-		seqno = be32_to_cpu(agf->agf_seqno);
-		cur->bc_mp->m_perag[seqno].pagf_longest = len;
-		agf->agf_longest = cpu_to_be32(len);
-		xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-			XFS_AGF_LONGEST);
-	}
-	/*
-	 * Updating first record in leaf. Pass new key value up to our parent.
-	 */
-	if (ptr == 1) {
-		xfs_alloc_key_t	key;	/* key containing [bno, len] */
-
-		key.ar_startblock = cpu_to_be32(bno);
-		key.ar_blockcount = cpu_to_be32(len);
-		if ((error = xfs_alloc_updkey(cur, &key, 1)))
-			return error;
-	}
-	return 0;
+	if (leaf)
+		return blocklen / sizeof(xfs_alloc_rec_t);
+	return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
 }
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index bce81c7a4fd..45e189e7e81 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -24,136 +24,42 @@
 
 struct xfs_buf;
 struct xfs_btree_cur;
-struct xfs_btree_sblock;
 struct xfs_mount;
 
 /*
- * There are two on-disk btrees, one sorted by blockno and one sorted
- * by blockcount and blockno.  All blocks look the same to make the code
- * simpler; if we have time later, we'll make the optimizations.
+ * Btree block header size depends on a superblock flag.
  */
-#define	XFS_ABTB_MAGIC	0x41425442	/* 'ABTB' for bno tree */
-#define	XFS_ABTC_MAGIC	0x41425443	/* 'ABTC' for cnt tree */
-
-/*
- * Data record/key structure
- */
-typedef struct xfs_alloc_rec {
-	__be32		ar_startblock;	/* starting block number */
-	__be32		ar_blockcount;	/* count of free blocks */
-} xfs_alloc_rec_t, xfs_alloc_key_t;
-
-typedef struct xfs_alloc_rec_incore {
-	xfs_agblock_t	ar_startblock;	/* starting block number */
-	xfs_extlen_t	ar_blockcount;	/* count of free blocks */
-} xfs_alloc_rec_incore_t;
-
-/* btree pointer type */
-typedef __be32 xfs_alloc_ptr_t;
-/* btree block header type */
-typedef	struct xfs_btree_sblock xfs_alloc_block_t;
-
-#define	XFS_BUF_TO_ALLOC_BLOCK(bp)	((xfs_alloc_block_t *)XFS_BUF_PTR(bp))
-
-/*
- * Real block structures have a size equal to the disk block size.
- */
-#define	XFS_ALLOC_BLOCK_SIZE(lev,cur)	(1 << (cur)->bc_blocklog)
-#define	XFS_ALLOC_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_alloc_mxr[lev != 0])
-#define	XFS_ALLOC_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_alloc_mnr[lev != 0])
-
-/*
- * Minimum and maximum blocksize and sectorsize.
- * The blocksize upper limit is pretty much arbitrary.
- * The sectorsize upper limit is due to sizeof(sb_sectsize).
- */
-#define XFS_MIN_BLOCKSIZE_LOG	9	/* i.e. 512 bytes */
-#define XFS_MAX_BLOCKSIZE_LOG	16	/* i.e. 65536 bytes */
-#define XFS_MIN_BLOCKSIZE	(1 << XFS_MIN_BLOCKSIZE_LOG)
-#define XFS_MAX_BLOCKSIZE	(1 << XFS_MAX_BLOCKSIZE_LOG)
-#define XFS_MIN_SECTORSIZE_LOG	9	/* i.e. 512 bytes */
-#define XFS_MAX_SECTORSIZE_LOG	15	/* i.e. 32768 bytes */
-#define XFS_MIN_SECTORSIZE	(1 << XFS_MIN_SECTORSIZE_LOG)
-#define XFS_MAX_SECTORSIZE	(1 << XFS_MAX_SECTORSIZE_LOG)
-
-/*
- * Block numbers in the AG:
- * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
- */
-#define	XFS_BNO_BLOCK(mp)	((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
-#define	XFS_CNT_BLOCK(mp)	((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
+#define XFS_ALLOC_BLOCK_LEN(mp) \
+	(xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
+		XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN)
 
 /*
  * Record, key, and pointer address macros for btree blocks.
- */
-#define	XFS_ALLOC_REC_ADDR(bb,i,cur)	\
-	XFS_BTREE_REC_ADDR(XFS_ALLOC_BLOCK_SIZE(0,cur), xfs_alloc, \
-				bb, i, XFS_ALLOC_BLOCK_MAXRECS(0, cur))
-
-#define	XFS_ALLOC_KEY_ADDR(bb,i,cur)	\
-	XFS_BTREE_KEY_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, \
-				bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur))
-
-#define	XFS_ALLOC_PTR_ADDR(bb,i,cur)	\
-	XFS_BTREE_PTR_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, \
-				bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur))
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_alloc_decrement(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-extern int xfs_alloc_delete(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Get the data from the pointed-to record.
- */
-extern int xfs_alloc_get_rec(struct xfs_btree_cur *cur,	xfs_agblock_t *bno,
-				xfs_extlen_t *len, int *stat);
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_alloc_increment(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-extern int xfs_alloc_insert(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Lookup the record equal to [bno, len] in the btree given by cur.
- */
-extern int xfs_alloc_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-				xfs_extlen_t len, int *stat);
-
-/*
- * Lookup the first record greater than or equal to [bno, len]
- * in the btree given by cur.
- */
-extern int xfs_alloc_lookup_ge(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-				xfs_extlen_t len, int *stat);
-
-/*
- * Lookup the first record less than or equal to [bno, len]
- * in the btree given by cur.
- */
-extern int xfs_alloc_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-				xfs_extlen_t len, int *stat);
-
-/*
- * Update the record referred to by cur, to the value given by [bno, len].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-extern int xfs_alloc_update(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-				xfs_extlen_t len);
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_ALLOC_REC_ADDR(mp, block, index) \
+	((xfs_alloc_rec_t *) \
+		((char *)(block) + \
+		 XFS_ALLOC_BLOCK_LEN(mp) + \
+		 (((index) - 1) * sizeof(xfs_alloc_rec_t))))
+
+#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
+	((xfs_alloc_key_t *) \
+		((char *)(block) + \
+		 XFS_ALLOC_BLOCK_LEN(mp) + \
+		 ((index) - 1) * sizeof(xfs_alloc_key_t)))
+
+#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
+	((xfs_alloc_ptr_t *) \
+		((char *)(block) + \
+		 XFS_ALLOC_BLOCK_LEN(mp) + \
+		 (maxrecs) * sizeof(xfs_alloc_key_t) + \
+		 ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
+		struct xfs_trans *, struct xfs_buf *,
+		xfs_agnumber_t, xfs_btnum_t);
+extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
 
 #endif	/* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
new file mode 100644
index 00000000000..faaf716e208
--- /dev/null
+++ b/fs/xfs/xfs_aops.c
@@ -0,0 +1,1770 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_iomap.h"
+#include "xfs_trace.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include <linux/aio.h>
+#include <linux/gfp.h>
+#include <linux/mpage.h>
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+
+void
+xfs_count_page_state(
+	struct page		*page,
+	int			*delalloc,
+	int			*unwritten)
+{
+	struct buffer_head	*bh, *head;
+
+	*delalloc = *unwritten = 0;
+
+	bh = head = page_buffers(page);
+	do {
+		if (buffer_unwritten(bh))
+			(*unwritten) = 1;
+		else if (buffer_delay(bh))
+			(*delalloc) = 1;
+	} while ((bh = bh->b_this_page) != head);
+}
+
+STATIC struct block_device *
+xfs_find_bdev_for_inode(
+	struct inode		*inode)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (XFS_IS_REALTIME_INODE(ip))
+		return mp->m_rtdev_targp->bt_bdev;
+	else
+		return mp->m_ddev_targp->bt_bdev;
+}
+
+/*
+ * We're now finished for good with this ioend structure.
+ * Update the page state via the associated buffer_heads,
+ * release holds on the inode and bio, and finally free
+ * up memory.  Do not use the ioend after this.
+ */
+STATIC void
+xfs_destroy_ioend(
+	xfs_ioend_t		*ioend)
+{
+	struct buffer_head	*bh, *next;
+
+	for (bh = ioend->io_buffer_head; bh; bh = next) {
+		next = bh->b_private;
+		bh->b_end_io(bh, !ioend->io_error);
+	}
+
+	mempool_free(ioend, xfs_ioend_pool);
+}
+
+/*
+ * Fast and loose check if this write could update the on-disk inode size.
+ */
+static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
+{
+	return ioend->io_offset + ioend->io_size >
+		XFS_I(ioend->io_inode)->i_d.di_size;
+}
+
+STATIC int
+xfs_setfilesize_trans_alloc(
+	struct xfs_ioend	*ioend)
+{
+	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	ioend->io_append_trans = tp;
+
+	/*
+	 * We may pass freeze protection with a transaction.  So tell lockdep
+	 * we released it.
+	 */
+	rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+		      1, _THIS_IP_);
+	/*
+	 * We hand off the transaction to the completion thread now, so
+	 * clear the flag here.
+	 */
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	return 0;
+}
+
+/*
+ * Update on-disk file size now that data has been written to disk.
+ */
+STATIC int
+xfs_setfilesize(
+	struct xfs_ioend	*ioend)
+{
+	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	struct xfs_trans	*tp = ioend->io_append_trans;
+	xfs_fsize_t		isize;
+
+	/*
+	 * The transaction may have been allocated in the I/O submission thread,
+	 * thus we need to mark ourselves as beeing in a transaction manually.
+	 * Similarly for freeze protection.
+	 */
+	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+			   0, 1, _THIS_IP_);
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
+	if (!isize) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_cancel(tp, 0);
+		return 0;
+	}
+
+	trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+
+	ip->i_d.di_size = isize;
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	return xfs_trans_commit(tp, 0);
+}
+
+/*
+ * Schedule IO completion handling on the final put of an ioend.
+ *
+ * If there is no work to do we might as well call it a day and free the
+ * ioend right now.
+ */
+STATIC void
+xfs_finish_ioend(
+	struct xfs_ioend	*ioend)
+{
+	if (atomic_dec_and_test(&ioend->io_remaining)) {
+		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
+
+		if (ioend->io_type == XFS_IO_UNWRITTEN)
+			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
+		else if (ioend->io_append_trans ||
+			 (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
+			queue_work(mp->m_data_workqueue, &ioend->io_work);
+		else
+			xfs_destroy_ioend(ioend);
+	}
+}
+
+/*
+ * IO write completion.
+ */
+STATIC void
+xfs_end_io(
+	struct work_struct *work)
+{
+	xfs_ioend_t	*ioend = container_of(work, xfs_ioend_t, io_work);
+	struct xfs_inode *ip = XFS_I(ioend->io_inode);
+	int		error = 0;
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		ioend->io_error = -EIO;
+		goto done;
+	}
+	if (ioend->io_error)
+		goto done;
+
+	/*
+	 * For unwritten extents we need to issue transactions to convert a
+	 * range to normal written extens after the data I/O has finished.
+	 */
+	if (ioend->io_type == XFS_IO_UNWRITTEN) {
+		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+						  ioend->io_size);
+	} else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
+		/*
+		 * For direct I/O we do not know if we need to allocate blocks
+		 * or not so we can't preallocate an append transaction as that
+		 * results in nested reservations and log space deadlocks. Hence
+		 * allocate the transaction here. While this is sub-optimal and
+		 * can block IO completion for some time, we're stuck with doing
+		 * it this way until we can pass the ioend to the direct IO
+		 * allocation callbacks and avoid nesting that way.
+		 */
+		error = xfs_setfilesize_trans_alloc(ioend);
+		if (error)
+			goto done;
+		error = xfs_setfilesize(ioend);
+	} else if (ioend->io_append_trans) {
+		error = xfs_setfilesize(ioend);
+	} else {
+		ASSERT(!xfs_ioend_is_append(ioend));
+	}
+
+done:
+	if (error)
+		ioend->io_error = -error;
+	xfs_destroy_ioend(ioend);
+}
+
+/*
+ * Call IO completion handling in caller context on the final put of an ioend.
+ */
+STATIC void
+xfs_finish_ioend_sync(
+	struct xfs_ioend	*ioend)
+{
+	if (atomic_dec_and_test(&ioend->io_remaining))
+		xfs_end_io(&ioend->io_work);
+}
+
+/*
+ * Allocate and initialise an IO completion structure.
+ * We need to track unwritten extent write completion here initially.
+ * We'll need to extend this for updating the ondisk inode size later
+ * (vs. incore size).
+ */
+STATIC xfs_ioend_t *
+xfs_alloc_ioend(
+	struct inode		*inode,
+	unsigned int		type)
+{
+	xfs_ioend_t		*ioend;
+
+	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
+
+	/*
+	 * Set the count to 1 initially, which will prevent an I/O
+	 * completion callback from happening before we have started
+	 * all the I/O from calling the completion routine too early.
+	 */
+	atomic_set(&ioend->io_remaining, 1);
+	ioend->io_isdirect = 0;
+	ioend->io_error = 0;
+	ioend->io_list = NULL;
+	ioend->io_type = type;
+	ioend->io_inode = inode;
+	ioend->io_buffer_head = NULL;
+	ioend->io_buffer_tail = NULL;
+	ioend->io_offset = 0;
+	ioend->io_size = 0;
+	ioend->io_append_trans = NULL;
+
+	INIT_WORK(&ioend->io_work, xfs_end_io);
+	return ioend;
+}
+
+STATIC int
+xfs_map_blocks(
+	struct inode		*inode,
+	loff_t			offset,
+	struct xfs_bmbt_irec	*imap,
+	int			type,
+	int			nonblocking)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	ssize_t			count = 1 << inode->i_blkbits;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	int			error = 0;
+	int			bmapi_flags = XFS_BMAPI_ENTIRE;
+	int			nimaps = 1;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -XFS_ERROR(EIO);
+
+	if (type == XFS_IO_UNWRITTEN)
+		bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+		if (nonblocking)
+			return -XFS_ERROR(EAGAIN);
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+	}
+
+	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+	       (ip->i_df.if_flags & XFS_IFEXTENTS));
+	ASSERT(offset <= mp->m_super->s_maxbytes);
+
+	if (offset + count > mp->m_super->s_maxbytes)
+		count = mp->m_super->s_maxbytes - offset;
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+				imap, &nimaps, bmapi_flags);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (error)
+		return -XFS_ERROR(error);
+
+	if (type == XFS_IO_DELALLOC &&
+	    (!nimaps || isnullstartblock(imap->br_startblock))) {
+		error = xfs_iomap_write_allocate(ip, offset, imap);
+		if (!error)
+			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
+		return -XFS_ERROR(error);
+	}
+
+#ifdef DEBUG
+	if (type == XFS_IO_UNWRITTEN) {
+		ASSERT(nimaps);
+		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+	}
+#endif
+	if (nimaps)
+		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
+	return 0;
+}
+
+STATIC int
+xfs_imap_valid(
+	struct inode		*inode,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
+{
+	offset >>= inode->i_blkbits;
+
+	return offset >= imap->br_startoff &&
+		offset < imap->br_startoff + imap->br_blockcount;
+}
+
+/*
+ * BIO completion handler for buffered IO.
+ */
+STATIC void
+xfs_end_bio(
+	struct bio		*bio,
+	int			error)
+{
+	xfs_ioend_t		*ioend = bio->bi_private;
+
+	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
+	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
+
+	/* Toss bio and pass work off to an xfsdatad thread */
+	bio->bi_private = NULL;
+	bio->bi_end_io = NULL;
+	bio_put(bio);
+
+	xfs_finish_ioend(ioend);
+}
+
+STATIC void
+xfs_submit_ioend_bio(
+	struct writeback_control *wbc,
+	xfs_ioend_t		*ioend,
+	struct bio		*bio)
+{
+	atomic_inc(&ioend->io_remaining);
+	bio->bi_private = ioend;
+	bio->bi_end_io = xfs_end_bio;
+	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
+}
+
+STATIC struct bio *
+xfs_alloc_ioend_bio(
+	struct buffer_head	*bh)
+{
+	int			nvecs = bio_get_nr_vecs(bh->b_bdev);
+	struct bio		*bio = bio_alloc(GFP_NOIO, nvecs);
+
+	ASSERT(bio->bi_private == NULL);
+	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_bdev = bh->b_bdev;
+	return bio;
+}
+
+STATIC void
+xfs_start_buffer_writeback(
+	struct buffer_head	*bh)
+{
+	ASSERT(buffer_mapped(bh));
+	ASSERT(buffer_locked(bh));
+	ASSERT(!buffer_delay(bh));
+	ASSERT(!buffer_unwritten(bh));
+
+	mark_buffer_async_write(bh);
+	set_buffer_uptodate(bh);
+	clear_buffer_dirty(bh);
+}
+
+STATIC void
+xfs_start_page_writeback(
+	struct page		*page,
+	int			clear_dirty,
+	int			buffers)
+{
+	ASSERT(PageLocked(page));
+	ASSERT(!PageWriteback(page));
+	if (clear_dirty)
+		clear_page_dirty_for_io(page);
+	set_page_writeback(page);
+	unlock_page(page);
+	/* If no buffers on the page are to be written, finish it here */
+	if (!buffers)
+		end_page_writeback(page);
+}
+
+static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
+{
+	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+}
+
+/*
+ * Submit all of the bios for all of the ioends we have saved up, covering the
+ * initial writepage page and also any probed pages.
+ *
+ * Because we may have multiple ioends spanning a page, we need to start
+ * writeback on all the buffers before we submit them for I/O. If we mark the
+ * buffers as we got, then we can end up with a page that only has buffers
+ * marked async write and I/O complete on can occur before we mark the other
+ * buffers async write.
+ *
+ * The end result of this is that we trip a bug in end_page_writeback() because
+ * we call it twice for the one page as the code in end_buffer_async_write()
+ * assumes that all buffers on the page are started at the same time.
+ *
+ * The fix is two passes across the ioend list - one to start writeback on the
+ * buffer_heads, and then submit them for I/O on the second pass.
+ *
+ * If @fail is non-zero, it means that we have a situation where some part of
+ * the submission process has failed after we have marked paged for writeback
+ * and unlocked them. In this situation, we need to fail the ioend chain rather
+ * than submit it to IO. This typically only happens on a filesystem shutdown.
+ */
+STATIC void
+xfs_submit_ioend(
+	struct writeback_control *wbc,
+	xfs_ioend_t		*ioend,
+	int			fail)
+{
+	xfs_ioend_t		*head = ioend;
+	xfs_ioend_t		*next;
+	struct buffer_head	*bh;
+	struct bio		*bio;
+	sector_t		lastblock = 0;
+
+	/* Pass 1 - start writeback */
+	do {
+		next = ioend->io_list;
+		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
+			xfs_start_buffer_writeback(bh);
+	} while ((ioend = next) != NULL);
+
+	/* Pass 2 - submit I/O */
+	ioend = head;
+	do {
+		next = ioend->io_list;
+		bio = NULL;
+
+		/*
+		 * If we are failing the IO now, just mark the ioend with an
+		 * error and finish it. This will run IO completion immediately
+		 * as there is only one reference to the ioend at this point in
+		 * time.
+		 */
+		if (fail) {
+			ioend->io_error = -fail;
+			xfs_finish_ioend(ioend);
+			continue;
+		}
+
+		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+
+			if (!bio) {
+ retry:
+				bio = xfs_alloc_ioend_bio(bh);
+			} else if (bh->b_blocknr != lastblock + 1) {
+				xfs_submit_ioend_bio(wbc, ioend, bio);
+				goto retry;
+			}
+
+			if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
+				xfs_submit_ioend_bio(wbc, ioend, bio);
+				goto retry;
+			}
+
+			lastblock = bh->b_blocknr;
+		}
+		if (bio)
+			xfs_submit_ioend_bio(wbc, ioend, bio);
+		xfs_finish_ioend(ioend);
+	} while ((ioend = next) != NULL);
+}
+
+/*
+ * Cancel submission of all buffer_heads so far in this endio.
+ * Toss the endio too.  Only ever called for the initial page
+ * in a writepage request, so only ever one page.
+ */
+STATIC void
+xfs_cancel_ioend(
+	xfs_ioend_t		*ioend)
+{
+	xfs_ioend_t		*next;
+	struct buffer_head	*bh, *next_bh;
+
+	do {
+		next = ioend->io_list;
+		bh = ioend->io_buffer_head;
+		do {
+			next_bh = bh->b_private;
+			clear_buffer_async_write(bh);
+			unlock_buffer(bh);
+		} while ((bh = next_bh) != NULL);
+
+		mempool_free(ioend, xfs_ioend_pool);
+	} while ((ioend = next) != NULL);
+}
+
+/*
+ * Test to see if we've been building up a completion structure for
+ * earlier buffers -- if so, we try to append to this ioend if we
+ * can, otherwise we finish off any current ioend and start another.
+ * Return true if we've finished the given ioend.
+ */
+STATIC void
+xfs_add_to_ioend(
+	struct inode		*inode,
+	struct buffer_head	*bh,
+	xfs_off_t		offset,
+	unsigned int		type,
+	xfs_ioend_t		**result,
+	int			need_ioend)
+{
+	xfs_ioend_t		*ioend = *result;
+
+	if (!ioend || need_ioend || type != ioend->io_type) {
+		xfs_ioend_t	*previous = *result;
+
+		ioend = xfs_alloc_ioend(inode, type);
+		ioend->io_offset = offset;
+		ioend->io_buffer_head = bh;
+		ioend->io_buffer_tail = bh;
+		if (previous)
+			previous->io_list = ioend;
+		*result = ioend;
+	} else {
+		ioend->io_buffer_tail->b_private = bh;
+		ioend->io_buffer_tail = bh;
+	}
+
+	bh->b_private = NULL;
+	ioend->io_size += bh->b_size;
+}
+
+STATIC void
+xfs_map_buffer(
+	struct inode		*inode,
+	struct buffer_head	*bh,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
+{
+	sector_t		bn;
+	struct xfs_mount	*m = XFS_I(inode)->i_mount;
+	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
+	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
+
+	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+
+	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
+	      ((offset - iomap_offset) >> inode->i_blkbits);
+
+	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
+
+	bh->b_blocknr = bn;
+	set_buffer_mapped(bh);
+}
+
+STATIC void
+xfs_map_at_offset(
+	struct inode		*inode,
+	struct buffer_head	*bh,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset)
+{
+	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+
+	xfs_map_buffer(inode, bh, imap, offset);
+	set_buffer_mapped(bh);
+	clear_buffer_delay(bh);
+	clear_buffer_unwritten(bh);
+}
+
+/*
+ * Test if a given page contains at least one buffer of a given @type.
+ * If @check_all_buffers is true, then we walk all the buffers in the page to
+ * try to find one of the type passed in. If it is not set, then the caller only
+ * needs to check the first buffer on the page for a match.
+ */
+STATIC bool
+xfs_check_page_type(
+	struct page		*page,
+	unsigned int		type,
+	bool			check_all_buffers)
+{
+	struct buffer_head	*bh;
+	struct buffer_head	*head;
+
+	if (PageWriteback(page))
+		return false;
+	if (!page->mapping)
+		return false;
+	if (!page_has_buffers(page))
+		return false;
+
+	bh = head = page_buffers(page);
+	do {
+		if (buffer_unwritten(bh)) {
+			if (type == XFS_IO_UNWRITTEN)
+				return true;
+		} else if (buffer_delay(bh)) {
+			if (type == XFS_IO_DELALLOC)
+				return true;
+		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
+			if (type == XFS_IO_OVERWRITE)
+				return true;
+		}
+
+		/* If we are only checking the first buffer, we are done now. */
+		if (!check_all_buffers)
+			break;
+	} while ((bh = bh->b_this_page) != head);
+
+	return false;
+}
+
+/*
+ * Allocate & map buffers for page given the extent map. Write it out.
+ * except for the original page of a writepage, this is called on
+ * delalloc/unwritten pages only, for the original page it is possible
+ * that the page has no mapping at all.
+ */
+STATIC int
+xfs_convert_page(
+	struct inode		*inode,
+	struct page		*page,
+	loff_t			tindex,
+	struct xfs_bmbt_irec	*imap,
+	xfs_ioend_t		**ioendp,
+	struct writeback_control *wbc)
+{
+	struct buffer_head	*bh, *head;
+	xfs_off_t		end_offset;
+	unsigned long		p_offset;
+	unsigned int		type;
+	int			len, page_dirty;
+	int			count = 0, done = 0, uptodate = 1;
+ 	xfs_off_t		offset = page_offset(page);
+
+	if (page->index != tindex)
+		goto fail;
+	if (!trylock_page(page))
+		goto fail;
+	if (PageWriteback(page))
+		goto fail_unlock_page;
+	if (page->mapping != inode->i_mapping)
+		goto fail_unlock_page;
+	if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
+		goto fail_unlock_page;
+
+	/*
+	 * page_dirty is initially a count of buffers on the page before
+	 * EOF and is decremented as we move each into a cleanable state.
+	 *
+	 * Derivation:
+	 *
+	 * End offset is the highest offset that this page should represent.
+	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
+	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
+	 * hence give us the correct page_dirty count. On any other page,
+	 * it will be zero and in that case we need page_dirty to be the
+	 * count of buffers on the page.
+	 */
+	end_offset = min_t(unsigned long long,
+			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
+			i_size_read(inode));
+
+	/*
+	 * If the current map does not span the entire page we are about to try
+	 * to write, then give up. The only way we can write a page that spans
+	 * multiple mappings in a single writeback iteration is via the
+	 * xfs_vm_writepage() function. Data integrity writeback requires the
+	 * entire page to be written in a single attempt, otherwise the part of
+	 * the page we don't write here doesn't get written as part of the data
+	 * integrity sync.
+	 *
+	 * For normal writeback, we also don't attempt to write partial pages
+	 * here as it simply means that write_cache_pages() will see it under
+	 * writeback and ignore the page until some point in the future, at
+	 * which time this will be the only page in the file that needs
+	 * writeback.  Hence for more optimal IO patterns, we should always
+	 * avoid partial page writeback due to multiple mappings on a page here.
+	 */
+	if (!xfs_imap_valid(inode, imap, end_offset))
+		goto fail_unlock_page;
+
+	len = 1 << inode->i_blkbits;
+	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
+					PAGE_CACHE_SIZE);
+	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
+	page_dirty = p_offset / len;
+
+	/*
+	 * The moment we find a buffer that doesn't match our current type
+	 * specification or can't be written, abort the loop and start
+	 * writeback. As per the above xfs_imap_valid() check, only
+	 * xfs_vm_writepage() can handle partial page writeback fully - we are
+	 * limited here to the buffers that are contiguous with the current
+	 * ioend, and hence a buffer we can't write breaks that contiguity and
+	 * we have to defer the rest of the IO to xfs_vm_writepage().
+	 */
+	bh = head = page_buffers(page);
+	do {
+		if (offset >= end_offset)
+			break;
+		if (!buffer_uptodate(bh))
+			uptodate = 0;
+		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
+			done = 1;
+			break;
+		}
+
+		if (buffer_unwritten(bh) || buffer_delay(bh) ||
+		    buffer_mapped(bh)) {
+			if (buffer_unwritten(bh))
+				type = XFS_IO_UNWRITTEN;
+			else if (buffer_delay(bh))
+				type = XFS_IO_DELALLOC;
+			else
+				type = XFS_IO_OVERWRITE;
+
+			/*
+			 * imap should always be valid because of the above
+			 * partial page end_offset check on the imap.
+			 */
+			ASSERT(xfs_imap_valid(inode, imap, offset));
+
+			lock_buffer(bh);
+			if (type != XFS_IO_OVERWRITE)
+				xfs_map_at_offset(inode, bh, imap, offset);
+			xfs_add_to_ioend(inode, bh, offset, type,
+					 ioendp, done);
+
+			page_dirty--;
+			count++;
+		} else {
+			done = 1;
+			break;
+		}
+	} while (offset += len, (bh = bh->b_this_page) != head);
+
+	if (uptodate && bh == head)
+		SetPageUptodate(page);
+
+	if (count) {
+		if (--wbc->nr_to_write <= 0 &&
+		    wbc->sync_mode == WB_SYNC_NONE)
+			done = 1;
+	}
+	xfs_start_page_writeback(page, !page_dirty, count);
+
+	return done;
+ fail_unlock_page:
+	unlock_page(page);
+ fail:
+	return 1;
+}
+
+/*
+ * Convert & write out a cluster of pages in the same extent as defined
+ * by mp and following the start page.
+ */
+STATIC void
+xfs_cluster_write(
+	struct inode		*inode,
+	pgoff_t			tindex,
+	struct xfs_bmbt_irec	*imap,
+	xfs_ioend_t		**ioendp,
+	struct writeback_control *wbc,
+	pgoff_t			tlast)
+{
+	struct pagevec		pvec;
+	int			done = 0, i;
+
+	pagevec_init(&pvec, 0);
+	while (!done && tindex <= tlast) {
+		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
+
+		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
+			break;
+
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
+					imap, ioendp, wbc);
+			if (done)
+				break;
+		}
+
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+STATIC void
+xfs_vm_invalidatepage(
+	struct page		*page,
+	unsigned int		offset,
+	unsigned int		length)
+{
+	trace_xfs_invalidatepage(page->mapping->host, page, offset,
+				 length);
+	block_invalidatepage(page, offset, length);
+}
+
+/*
+ * If the page has delalloc buffers on it, we need to punch them out before we
+ * invalidate the page. If we don't, we leave a stale delalloc mapping on the
+ * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
+ * is done on that same region - the delalloc extent is returned when none is
+ * supposed to be there.
+ *
+ * We prevent this by truncating away the delalloc regions on the page before
+ * invalidating it. Because they are delalloc, we can do this without needing a
+ * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
+ * truncation without a transaction as there is no space left for block
+ * reservation (typically why we see a ENOSPC in writeback).
+ *
+ * This is not a performance critical path, so for now just do the punching a
+ * buffer head at a time.
+ */
+STATIC void
+xfs_aops_discard_page(
+	struct page		*page)
+{
+	struct inode		*inode = page->mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct buffer_head	*bh, *head;
+	loff_t			offset = page_offset(page);
+
+	if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
+		goto out_invalidate;
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		goto out_invalidate;
+
+	xfs_alert(ip->i_mount,
+		"page discard on page %p, inode 0x%llx, offset %llu.",
+			page, ip->i_ino, offset);
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	bh = head = page_buffers(page);
+	do {
+		int		error;
+		xfs_fileoff_t	start_fsb;
+
+		if (!buffer_delay(bh))
+			goto next_buffer;
+
+		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
+		if (error) {
+			/* something screwed, just bail */
+			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+				xfs_alert(ip->i_mount,
+			"page discard unable to remove delalloc mapping.");
+			}
+			break;
+		}
+next_buffer:
+		offset += 1 << inode->i_blkbits;
+
+	} while ((bh = bh->b_this_page) != head);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_invalidate:
+	xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+	return;
+}
+
+/*
+ * Write out a dirty page.
+ *
+ * For delalloc space on the page we need to allocate space and flush it.
+ * For unwritten space on the page we need to start the conversion to
+ * regular allocated space.
+ * For any other dirty buffer heads on the page we should flush them.
+ */
+STATIC int
+xfs_vm_writepage(
+	struct page		*page,
+	struct writeback_control *wbc)
+{
+	struct inode		*inode = page->mapping->host;
+	struct buffer_head	*bh, *head;
+	struct xfs_bmbt_irec	imap;
+	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
+	loff_t			offset;
+	unsigned int		type;
+	__uint64_t              end_offset;
+	pgoff_t                 end_index, last_index;
+	ssize_t			len;
+	int			err, imap_valid = 0, uptodate = 1;
+	int			count = 0;
+	int			nonblocking = 0;
+
+	trace_xfs_writepage(inode, page, 0, 0);
+
+	ASSERT(page_has_buffers(page));
+
+	/*
+	 * Refuse to write the page out if we are called from reclaim context.
+	 *
+	 * This avoids stack overflows when called from deeply used stacks in
+	 * random callers for direct reclaim or memcg reclaim.  We explicitly
+	 * allow reclaim from kswapd as the stack usage there is relatively low.
+	 *
+	 * This should never happen except in the case of a VM regression so
+	 * warn about it.
+	 */
+	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
+			PF_MEMALLOC))
+		goto redirty;
+
+	/*
+	 * Given that we do not allow direct reclaim to call us, we should
+	 * never be called while in a filesystem transaction.
+	 */
+	if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
+		goto redirty;
+
+	/* Is this page beyond the end of the file? */
+	offset = i_size_read(inode);
+	end_index = offset >> PAGE_CACHE_SHIFT;
+	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
+
+	/*
+	 * The page index is less than the end_index, adjust the end_offset
+	 * to the highest offset that this page should represent.
+	 * -----------------------------------------------------
+	 * |			file mapping	       | <EOF> |
+	 * -----------------------------------------------------
+	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
+	 * ^--------------------------------^----------|--------
+	 * |     desired writeback range    |      see else    |
+	 * ---------------------------------^------------------|
+	 */
+	if (page->index < end_index)
+		end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
+	else {
+		/*
+		 * Check whether the page to write out is beyond or straddles
+		 * i_size or not.
+		 * -------------------------------------------------------
+		 * |		file mapping		        | <EOF>  |
+		 * -------------------------------------------------------
+		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
+		 * ^--------------------------------^-----------|---------
+		 * |				    |      Straddles     |
+		 * ---------------------------------^-----------|--------|
+		 */
+		unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
+
+		/*
+		 * Skip the page if it is fully outside i_size, e.g. due to a
+		 * truncate operation that is in progress. We must redirty the
+		 * page so that reclaim stops reclaiming it. Otherwise
+		 * xfs_vm_releasepage() is called on it and gets confused.
+		 *
+		 * Note that the end_index is unsigned long, it would overflow
+		 * if the given offset is greater than 16TB on 32-bit system
+		 * and if we do check the page is fully outside i_size or not
+		 * via "if (page->index >= end_index + 1)" as "end_index + 1"
+		 * will be evaluated to 0.  Hence this page will be redirtied
+		 * and be written out repeatedly which would result in an
+		 * infinite loop, the user program that perform this operation
+		 * will hang.  Instead, we can verify this situation by checking
+		 * if the page to write is totally beyond the i_size or if it's
+		 * offset is just equal to the EOF.
+		 */
+		if (page->index > end_index ||
+		    (page->index == end_index && offset_into_page == 0))
+			goto redirty;
+
+		/*
+		 * The page straddles i_size.  It must be zeroed out on each
+		 * and every writepage invocation because it may be mmapped.
+		 * "A file is mapped in multiples of the page size.  For a file
+		 * that is not a multiple of the page size, the remaining
+		 * memory is zeroed when mapped, and writes to that region are
+		 * not written out to the file."
+		 */
+		zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
+
+		/* Adjust the end_offset to the end of file */
+		end_offset = offset;
+	}
+
+	len = 1 << inode->i_blkbits;
+
+	bh = head = page_buffers(page);
+	offset = page_offset(page);
+	type = XFS_IO_OVERWRITE;
+
+	if (wbc->sync_mode == WB_SYNC_NONE)
+		nonblocking = 1;
+
+	do {
+		int new_ioend = 0;
+
+		if (offset >= end_offset)
+			break;
+		if (!buffer_uptodate(bh))
+			uptodate = 0;
+
+		/*
+		 * set_page_dirty dirties all buffers in a page, independent
+		 * of their state.  The dirty state however is entirely
+		 * meaningless for holes (!mapped && uptodate), so skip
+		 * buffers covering holes here.
+		 */
+		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
+			imap_valid = 0;
+			continue;
+		}
+
+		if (buffer_unwritten(bh)) {
+			if (type != XFS_IO_UNWRITTEN) {
+				type = XFS_IO_UNWRITTEN;
+				imap_valid = 0;
+			}
+		} else if (buffer_delay(bh)) {
+			if (type != XFS_IO_DELALLOC) {
+				type = XFS_IO_DELALLOC;
+				imap_valid = 0;
+			}
+		} else if (buffer_uptodate(bh)) {
+			if (type != XFS_IO_OVERWRITE) {
+				type = XFS_IO_OVERWRITE;
+				imap_valid = 0;
+			}
+		} else {
+			if (PageUptodate(page))
+				ASSERT(buffer_mapped(bh));
+			/*
+			 * This buffer is not uptodate and will not be
+			 * written to disk.  Ensure that we will put any
+			 * subsequent writeable buffers into a new
+			 * ioend.
+			 */
+			imap_valid = 0;
+			continue;
+		}
+
+		if (imap_valid)
+			imap_valid = xfs_imap_valid(inode, &imap, offset);
+		if (!imap_valid) {
+			/*
+			 * If we didn't have a valid mapping then we need to
+			 * put the new mapping into a separate ioend structure.
+			 * This ensures non-contiguous extents always have
+			 * separate ioends, which is particularly important
+			 * for unwritten extent conversion at I/O completion
+			 * time.
+			 */
+			new_ioend = 1;
+			err = xfs_map_blocks(inode, offset, &imap, type,
+					     nonblocking);
+			if (err)
+				goto error;
+			imap_valid = xfs_imap_valid(inode, &imap, offset);
+		}
+		if (imap_valid) {
+			lock_buffer(bh);
+			if (type != XFS_IO_OVERWRITE)
+				xfs_map_at_offset(inode, bh, &imap, offset);
+			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
+					 new_ioend);
+			count++;
+		}
+
+		if (!iohead)
+			iohead = ioend;
+
+	} while (offset += len, ((bh = bh->b_this_page) != head));
+
+	if (uptodate && bh == head)
+		SetPageUptodate(page);
+
+	xfs_start_page_writeback(page, 1, count);
+
+	/* if there is no IO to be submitted for this page, we are done */
+	if (!ioend)
+		return 0;
+
+	ASSERT(iohead);
+
+	/*
+	 * Any errors from this point onwards need tobe reported through the IO
+	 * completion path as we have marked the initial page as under writeback
+	 * and unlocked it.
+	 */
+	if (imap_valid) {
+		xfs_off_t		end_index;
+
+		end_index = imap.br_startoff + imap.br_blockcount;
+
+		/* to bytes */
+		end_index <<= inode->i_blkbits;
+
+		/* to pages */
+		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
+
+		/* check against file size */
+		if (end_index > last_index)
+			end_index = last_index;
+
+		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
+				  wbc, end_index);
+	}
+
+
+	/*
+	 * Reserve log space if we might write beyond the on-disk inode size.
+	 */
+	err = 0;
+	if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+		err = xfs_setfilesize_trans_alloc(ioend);
+
+	xfs_submit_ioend(wbc, iohead, err);
+
+	return 0;
+
+error:
+	if (iohead)
+		xfs_cancel_ioend(iohead);
+
+	if (err == -EAGAIN)
+		goto redirty;
+
+	xfs_aops_discard_page(page);
+	ClearPageUptodate(page);
+	unlock_page(page);
+	return err;
+
+redirty:
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+	return 0;
+}
+
+STATIC int
+xfs_vm_writepages(
+	struct address_space	*mapping,
+	struct writeback_control *wbc)
+{
+	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+	return generic_writepages(mapping, wbc);
+}
+
+/*
+ * Called to move a page into cleanable state - and from there
+ * to be released. The page should already be clean. We always
+ * have buffer heads in this call.
+ *
+ * Returns 1 if the page is ok to release, 0 otherwise.
+ */
+STATIC int
+xfs_vm_releasepage(
+	struct page		*page,
+	gfp_t			gfp_mask)
+{
+	int			delalloc, unwritten;
+
+	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
+
+	xfs_count_page_state(page, &delalloc, &unwritten);
+
+	if (WARN_ON_ONCE(delalloc))
+		return 0;
+	if (WARN_ON_ONCE(unwritten))
+		return 0;
+
+	return try_to_free_buffers(page);
+}
+
+STATIC int
+__xfs_get_blocks(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	int			create,
+	int			direct)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	int			error = 0;
+	int			lockmode = 0;
+	struct xfs_bmbt_irec	imap;
+	int			nimaps = 1;
+	xfs_off_t		offset;
+	ssize_t			size;
+	int			new = 0;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -XFS_ERROR(EIO);
+
+	offset = (xfs_off_t)iblock << inode->i_blkbits;
+	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
+	size = bh_result->b_size;
+
+	if (!create && direct && offset >= i_size_read(inode))
+		return 0;
+
+	/*
+	 * Direct I/O is usually done on preallocated files, so try getting
+	 * a block mapping without an exclusive lock first.  For buffered
+	 * writes we already have the exclusive iolock anyway, so avoiding
+	 * a lock roundtrip here by taking the ilock exclusive from the
+	 * beginning is a useful micro optimization.
+	 */
+	if (create && !direct) {
+		lockmode = XFS_ILOCK_EXCL;
+		xfs_ilock(ip, lockmode);
+	} else {
+		lockmode = xfs_ilock_data_map_shared(ip);
+	}
+
+	ASSERT(offset <= mp->m_super->s_maxbytes);
+	if (offset + size > mp->m_super->s_maxbytes)
+		size = mp->m_super->s_maxbytes - offset;
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+				&imap, &nimaps, XFS_BMAPI_ENTIRE);
+	if (error)
+		goto out_unlock;
+
+	if (create &&
+	    (!nimaps ||
+	     (imap.br_startblock == HOLESTARTBLOCK ||
+	      imap.br_startblock == DELAYSTARTBLOCK))) {
+		if (direct || xfs_get_extsz_hint(ip)) {
+			/*
+			 * Drop the ilock in preparation for starting the block
+			 * allocation transaction.  It will be retaken
+			 * exclusively inside xfs_iomap_write_direct for the
+			 * actual allocation.
+			 */
+			xfs_iunlock(ip, lockmode);
+			error = xfs_iomap_write_direct(ip, offset, size,
+						       &imap, nimaps);
+			if (error)
+				return -error;
+			new = 1;
+		} else {
+			/*
+			 * Delalloc reservations do not require a transaction,
+			 * we can go on without dropping the lock here. If we
+			 * are allocating a new delalloc block, make sure that
+			 * we set the new flag so that we mark the buffer new so
+			 * that we know that it is newly allocated if the write
+			 * fails.
+			 */
+			if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
+				new = 1;
+			error = xfs_iomap_write_delay(ip, offset, size, &imap);
+			if (error)
+				goto out_unlock;
+
+			xfs_iunlock(ip, lockmode);
+		}
+
+		trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+	} else if (nimaps) {
+		trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+		xfs_iunlock(ip, lockmode);
+	} else {
+		trace_xfs_get_blocks_notfound(ip, offset, size);
+		goto out_unlock;
+	}
+
+	if (imap.br_startblock != HOLESTARTBLOCK &&
+	    imap.br_startblock != DELAYSTARTBLOCK) {
+		/*
+		 * For unwritten extents do not report a disk address on
+		 * the read case (treat as if we're reading into a hole).
+		 */
+		if (create || !ISUNWRITTEN(&imap))
+			xfs_map_buffer(inode, bh_result, &imap, offset);
+		if (create && ISUNWRITTEN(&imap)) {
+			if (direct) {
+				bh_result->b_private = inode;
+				set_buffer_defer_completion(bh_result);
+			}
+			set_buffer_unwritten(bh_result);
+		}
+	}
+
+	/*
+	 * If this is a realtime file, data may be on a different device.
+	 * to that pointed to from the buffer_head b_bdev currently.
+	 */
+	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
+
+	/*
+	 * If we previously allocated a block out beyond eof and we are now
+	 * coming back to use it then we will need to flag it as new even if it
+	 * has a disk address.
+	 *
+	 * With sub-block writes into unwritten extents we also need to mark
+	 * the buffer as new so that the unwritten parts of the buffer gets
+	 * correctly zeroed.
+	 */
+	if (create &&
+	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
+	     (offset >= i_size_read(inode)) ||
+	     (new || ISUNWRITTEN(&imap))))
+		set_buffer_new(bh_result);
+
+	if (imap.br_startblock == DELAYSTARTBLOCK) {
+		BUG_ON(direct);
+		if (create) {
+			set_buffer_uptodate(bh_result);
+			set_buffer_mapped(bh_result);
+			set_buffer_delay(bh_result);
+		}
+	}
+
+	/*
+	 * If this is O_DIRECT or the mpage code calling tell them how large
+	 * the mapping is, so that we can avoid repeated get_blocks calls.
+	 *
+	 * If the mapping spans EOF, then we have to break the mapping up as the
+	 * mapping for blocks beyond EOF must be marked new so that sub block
+	 * regions can be correctly zeroed. We can't do this for mappings within
+	 * EOF unless the mapping was just allocated or is unwritten, otherwise
+	 * the callers would overwrite existing data with zeros. Hence we have
+	 * to split the mapping into a range up to and including EOF, and a
+	 * second mapping for beyond EOF.
+	 */
+	if (direct || size > (1 << inode->i_blkbits)) {
+		xfs_off_t		mapping_size;
+
+		mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
+		mapping_size <<= inode->i_blkbits;
+
+		ASSERT(mapping_size > 0);
+		if (mapping_size > size)
+			mapping_size = size;
+		if (offset < i_size_read(inode) &&
+		    offset + mapping_size >= i_size_read(inode)) {
+			/* limit mapping to block that spans EOF */
+			mapping_size = roundup_64(i_size_read(inode) - offset,
+						  1 << inode->i_blkbits);
+		}
+		if (mapping_size > LONG_MAX)
+			mapping_size = LONG_MAX;
+
+		bh_result->b_size = mapping_size;
+	}
+
+	return 0;
+
+out_unlock:
+	xfs_iunlock(ip, lockmode);
+	return -error;
+}
+
+int
+xfs_get_blocks(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+}
+
+STATIC int
+xfs_get_blocks_direct(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+}
+
+/*
+ * Complete a direct I/O write request.
+ *
+ * If the private argument is non-NULL __xfs_get_blocks signals us that we
+ * need to issue a transaction to convert the range from unwritten to written
+ * extents.  In case this is regular synchronous I/O we just call xfs_end_io
+ * to do this and we are done.  But in case this was a successful AIO
+ * request this handler is called from interrupt context, from which we
+ * can't start transactions.  In that case offload the I/O completion to
+ * the workqueues we also use for buffered I/O completion.
+ */
+STATIC void
+xfs_end_io_direct_write(
+	struct kiocb		*iocb,
+	loff_t			offset,
+	ssize_t			size,
+	void			*private)
+{
+	struct xfs_ioend	*ioend = iocb->private;
+
+	/*
+	 * While the generic direct I/O code updates the inode size, it does
+	 * so only after the end_io handler is called, which means our
+	 * end_io handler thinks the on-disk size is outside the in-core
+	 * size.  To prevent this just update it a little bit earlier here.
+	 */
+	if (offset + size > i_size_read(ioend->io_inode))
+		i_size_write(ioend->io_inode, offset + size);
+
+	/*
+	 * blockdev_direct_IO can return an error even after the I/O
+	 * completion handler was called.  Thus we need to protect
+	 * against double-freeing.
+	 */
+	iocb->private = NULL;
+
+	ioend->io_offset = offset;
+	ioend->io_size = size;
+	if (private && size > 0)
+		ioend->io_type = XFS_IO_UNWRITTEN;
+
+	xfs_finish_ioend_sync(ioend);
+}
+
+STATIC ssize_t
+xfs_vm_direct_IO(
+	int			rw,
+	struct kiocb		*iocb,
+	struct iov_iter		*iter,
+	loff_t			offset)
+{
+	struct inode		*inode = iocb->ki_filp->f_mapping->host;
+	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
+	struct xfs_ioend	*ioend = NULL;
+	ssize_t			ret;
+
+	if (rw & WRITE) {
+		size_t size = iov_iter_count(iter);
+
+		/*
+		 * We cannot preallocate a size update transaction here as we
+		 * don't know whether allocation is necessary or not. Hence we
+		 * can only tell IO completion that one is necessary if we are
+		 * not doing unwritten extent conversion.
+		 */
+		iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
+		if (offset + size > XFS_I(inode)->i_d.di_size)
+			ioend->io_isdirect = 1;
+
+		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+					    offset, xfs_get_blocks_direct,
+					    xfs_end_io_direct_write, NULL,
+					    DIO_ASYNC_EXTEND);
+		if (ret != -EIOCBQUEUED && iocb->private)
+			goto out_destroy_ioend;
+	} else {
+		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+					    offset, xfs_get_blocks_direct,
+					    NULL, NULL, 0);
+	}
+
+	return ret;
+
+out_destroy_ioend:
+	xfs_destroy_ioend(ioend);
+	return ret;
+}
+
+/*
+ * Punch out the delalloc blocks we have already allocated.
+ *
+ * Don't bother with xfs_setattr given that nothing can have made it to disk yet
+ * as the page is still locked at this point.
+ */
+STATIC void
+xfs_vm_kill_delalloc_range(
+	struct inode		*inode,
+	loff_t			start,
+	loff_t			end)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	xfs_fileoff_t		start_fsb;
+	xfs_fileoff_t		end_fsb;
+	int			error;
+
+	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
+	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
+	if (end_fsb <= start_fsb)
+		return;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+						end_fsb - start_fsb);
+	if (error) {
+		/* something screwed, just bail */
+		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+			xfs_alert(ip->i_mount,
+		"xfs_vm_write_failed: unable to clean up ino %lld",
+					ip->i_ino);
+		}
+	}
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+}
+
+STATIC void
+xfs_vm_write_failed(
+	struct inode		*inode,
+	struct page		*page,
+	loff_t			pos,
+	unsigned		len)
+{
+	loff_t			block_offset;
+	loff_t			block_start;
+	loff_t			block_end;
+	loff_t			from = pos & (PAGE_CACHE_SIZE - 1);
+	loff_t			to = from + len;
+	struct buffer_head	*bh, *head;
+
+	/*
+	 * The request pos offset might be 32 or 64 bit, this is all fine
+	 * on 64-bit platform.  However, for 64-bit pos request on 32-bit
+	 * platform, the high 32-bit will be masked off if we evaluate the
+	 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
+	 * 0xfffff000 as an unsigned long, hence the result is incorrect
+	 * which could cause the following ASSERT failed in most cases.
+	 * In order to avoid this, we can evaluate the block_offset of the
+	 * start of the page by using shifts rather than masks the mismatch
+	 * problem.
+	 */
+	block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+
+	ASSERT(block_offset + from == pos);
+
+	head = page_buffers(page);
+	block_start = 0;
+	for (bh = head; bh != head || !block_start;
+	     bh = bh->b_this_page, block_start = block_end,
+				   block_offset += bh->b_size) {
+		block_end = block_start + bh->b_size;
+
+		/* skip buffers before the write */
+		if (block_end <= from)
+			continue;
+
+		/* if the buffer is after the write, we're done */
+		if (block_start >= to)
+			break;
+
+		if (!buffer_delay(bh))
+			continue;
+
+		if (!buffer_new(bh) && block_offset < i_size_read(inode))
+			continue;
+
+		xfs_vm_kill_delalloc_range(inode, block_offset,
+					   block_offset + bh->b_size);
+
+		/*
+		 * This buffer does not contain data anymore. make sure anyone
+		 * who finds it knows that for certain.
+		 */
+		clear_buffer_delay(bh);
+		clear_buffer_uptodate(bh);
+		clear_buffer_mapped(bh);
+		clear_buffer_new(bh);
+		clear_buffer_dirty(bh);
+	}
+
+}
+
+/*
+ * This used to call block_write_begin(), but it unlocks and releases the page
+ * on error, and we need that page to be able to punch stale delalloc blocks out
+ * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
+ * the appropriate point.
+ */
+STATIC int
+xfs_vm_write_begin(
+	struct file		*file,
+	struct address_space	*mapping,
+	loff_t			pos,
+	unsigned		len,
+	unsigned		flags,
+	struct page		**pagep,
+	void			**fsdata)
+{
+	pgoff_t			index = pos >> PAGE_CACHE_SHIFT;
+	struct page		*page;
+	int			status;
+
+	ASSERT(len <= PAGE_CACHE_SIZE);
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+
+	status = __block_write_begin(page, pos, len, xfs_get_blocks);
+	if (unlikely(status)) {
+		struct inode	*inode = mapping->host;
+		size_t		isize = i_size_read(inode);
+
+		xfs_vm_write_failed(inode, page, pos, len);
+		unlock_page(page);
+
+		/*
+		 * If the write is beyond EOF, we only want to kill blocks
+		 * allocated in this write, not blocks that were previously
+		 * written successfully.
+		 */
+		if (pos + len > isize) {
+			ssize_t start = max_t(ssize_t, pos, isize);
+
+			truncate_pagecache_range(inode, start, pos + len);
+		}
+
+		page_cache_release(page);
+		page = NULL;
+	}
+
+	*pagep = page;
+	return status;
+}
+
+/*
+ * On failure, we only need to kill delalloc blocks beyond EOF in the range of
+ * this specific write because they will never be written. Previous writes
+ * beyond EOF where block allocation succeeded do not need to be trashed, so
+ * only new blocks from this write should be trashed. For blocks within
+ * EOF, generic_write_end() zeros them so they are safe to leave alone and be
+ * written with all the other valid data.
+ */
+STATIC int
+xfs_vm_write_end(
+	struct file		*file,
+	struct address_space	*mapping,
+	loff_t			pos,
+	unsigned		len,
+	unsigned		copied,
+	struct page		*page,
+	void			*fsdata)
+{
+	int			ret;
+
+	ASSERT(len <= PAGE_CACHE_SIZE);
+
+	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+	if (unlikely(ret < len)) {
+		struct inode	*inode = mapping->host;
+		size_t		isize = i_size_read(inode);
+		loff_t		to = pos + len;
+
+		if (to > isize) {
+			/* only kill blocks in this write beyond EOF */
+			if (pos > isize)
+				isize = pos;
+			xfs_vm_kill_delalloc_range(inode, isize, to);
+			truncate_pagecache_range(inode, isize, to);
+		}
+	}
+	return ret;
+}
+
+STATIC sector_t
+xfs_vm_bmap(
+	struct address_space	*mapping,
+	sector_t		block)
+{
+	struct inode		*inode = (struct inode *)mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	trace_xfs_vm_bmap(XFS_I(inode));
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	filemap_write_and_wait(mapping);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	return generic_block_bmap(mapping, block, xfs_get_blocks);
+}
+
+STATIC int
+xfs_vm_readpage(
+	struct file		*unused,
+	struct page		*page)
+{
+	return mpage_readpage(page, xfs_get_blocks);
+}
+
+STATIC int
+xfs_vm_readpages(
+	struct file		*unused,
+	struct address_space	*mapping,
+	struct list_head	*pages,
+	unsigned		nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
+}
+
+const struct address_space_operations xfs_address_space_operations = {
+	.readpage		= xfs_vm_readpage,
+	.readpages		= xfs_vm_readpages,
+	.writepage		= xfs_vm_writepage,
+	.writepages		= xfs_vm_writepages,
+	.releasepage		= xfs_vm_releasepage,
+	.invalidatepage		= xfs_vm_invalidatepage,
+	.write_begin		= xfs_vm_write_begin,
+	.write_end		= xfs_vm_write_end,
+	.bmap			= xfs_vm_bmap,
+	.direct_IO		= xfs_vm_direct_IO,
+	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate  = block_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
+};
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/xfs_aops.h